3 files changed, 163 insertions, 0 deletions
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index e4758d7ab602..d9748a881798 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1074,4 +1074,14 @@ config MLX_CPLD_PLATFORM
 	  This driver handles hot-plug events for the power suppliers, power
 	  cables and fans on the wide range Mellanox IB and Ethernet systems.
 
+config INTEL_TURBO_MAX_3
+	bool "Intel Turbo Boost Max Technology 3.0 enumeration driver"
+	depends on X86_64 && SCHED_MC_PRIO
+	---help---
+	  This driver reads maximum performance ratio of each CPU and set up
+	  the scheduler priority metrics. In this way scheduler can prefer
+	  CPU with higher performance to schedule tasks.
+	  This driver is only required when the system is not using Hardware
+	  P-States (HWP). In HWP mode, priority can be read from ACPI tables.
+
 endif # X86_PLATFORM_DEVICES
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index d4111f0f8a78..cde7b4fdf5d6 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -76,3 +76,4 @@ obj-$(CONFIG_INTEL_TELEMETRY)	+= intel_telemetry_core.o \
 obj-$(CONFIG_INTEL_PMC_CORE)    += intel_pmc_core.o
 obj-$(CONFIG_MLX_PLATFORM)	+= mlx-platform.o
 obj-$(CONFIG_MLX_CPLD_PLATFORM)	+= mlxcpld-hotplug.o
+obj-$(CONFIG_INTEL_TURBO_MAX_3) += intel_turbo_max_3.o
diff --git a/drivers/platform/x86/intel_turbo_max_3.c b/drivers/platform/x86/intel_turbo_max_3.c
new file mode 100644
index 000000000000..0103f5b32e34
--- /dev/null
+++ b/drivers/platform/x86/intel_turbo_max_3.c
@@ -0,0 +1,152 @@
+/*
+ * Intel Turbo Boost Max Technology 3.0 legacy (non HWP) enumeration driver
+ * Copyright (c) 2017, Intel Corporation.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define MSR_OC_MAILBOX			0x150
+#define MSR_OC_MAILBOX_CMD_OFFSET	32
+#define MSR_OC_MAILBOX_RSP_OFFSET	32
+#define MSR_OC_MAILBOX_BUSY_BIT		63
+#define OC_MAILBOX_FC_CONTROL_CMD	0x1C
+
+/*
+ * Typical latency to get mail box response is ~3us, It takes +3 us to
+ * process reading mailbox after issuing mailbox write on a Broadwell 3.4 GHz
+ * system. So for most of the time, the first mailbox read should have the
+ * response, but to avoid some boundary cases retry twice.
+ */
+#define OC_MAILBOX_RETRY_COUNT		2
+
+static int get_oc_core_priority(unsigned int cpu)
+{
+	u64 value, cmd = OC_MAILBOX_FC_CONTROL_CMD;
+	int ret, i;
+
+	/* Issue favored core read command */
+	value = cmd << MSR_OC_MAILBOX_CMD_OFFSET;
+	/* Set the busy bit to indicate OS is trying to issue command */
+	value |=  BIT_ULL(MSR_OC_MAILBOX_BUSY_BIT);
+	ret = wrmsrl_safe(MSR_OC_MAILBOX, value);
+	if (ret) {
+		pr_debug("cpu %d OC mailbox write failed\n", cpu);
+		return ret;
+	}
+
+	for (i = 0; i < OC_MAILBOX_RETRY_COUNT; ++i) {
+		ret = rdmsrl_safe(MSR_OC_MAILBOX, &value);
+		if (ret) {
+			pr_debug("cpu %d OC mailbox read failed\n", cpu);
+			break;
+		}
+
+		if (value & BIT_ULL(MSR_OC_MAILBOX_BUSY_BIT)) {
+			pr_debug("cpu %d OC mailbox still processing\n", cpu);
+			ret = -EBUSY;
+			continue;
+		}
+
+		if ((value >> MSR_OC_MAILBOX_RSP_OFFSET) & 0xff) {
+			pr_debug("cpu %d OC mailbox cmd failed\n", cpu);
+			ret = -ENXIO;
+			break;
+		}
+
+		ret = value & 0xff;
+		pr_debug("cpu %d max_ratio %d\n", cpu, ret);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * The work item is needed to avoid CPU hotplug locking issues. The function
+ * itmt_legacy_set_priority() is called from CPU online callback, so can't
+ * call sched_set_itmt_support() from there as this function will aquire
+ * hotplug locks in its path.
+ */
+static void itmt_legacy_work_fn(struct work_struct *work)
+{
+	sched_set_itmt_support();
+}
+
+static DECLARE_WORK(sched_itmt_work, itmt_legacy_work_fn);
+
+static int itmt_legacy_cpu_online(unsigned int cpu)
+{
+	static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
+	int priority;
+
+	priority = get_oc_core_priority(cpu);
+	if (priority < 0)
+		return 0;
+
+	sched_set_itmt_core_prio(priority, cpu);
+
+	/* Enable ITMT feature when a core with different priority is found */
+	if (max_highest_perf <= min_highest_perf) {
+		if (priority > max_highest_perf)
+			max_highest_perf = priority;
+
+		if (priority < min_highest_perf)
+			min_highest_perf = priority;
+
+		if (max_highest_perf > min_highest_perf)
+			schedule_work(&sched_itmt_work);
+	}
+
+	return 0;
+}
+
+#define ICPU(model)     { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
+
+static const struct x86_cpu_id itmt_legacy_cpu_ids[] = {
+	ICPU(INTEL_FAM6_BROADWELL_X),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, itmt_legacy_cpu_ids);
+
+static int __init itmt_legacy_init(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	id = x86_match_cpu(itmt_legacy_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	if (boot_cpu_has(X86_FEATURE_HWP))
+		return -ENODEV;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				"platform/x86/turbo_max_3:online",
+				itmt_legacy_cpu_online,	NULL);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+late_initcall(itmt_legacy_init)
+
+MODULE_DESCRIPTION("Intel Turbo Boost Max 3.0 enumeration driver");
+MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
+MODULE_LICENSE("GPL v2");