author    Ingo Molnar <mingo@kernel.org>  2017-05-05 08:20:52 +0200
committer Ingo Molnar <mingo@kernel.org>  2017-05-05 08:21:03 +0200
commit    415812f2d6b856d33ede9bf4a2b95575dbbb3d4e (patch)
tree      4daf9ca52f89ea59453538b406f584d47c77aff3 /arch/x86/kernel/cpu
parent    42fc6c6cb1662ba2fa727dd01c9473c63be4e3b6 (diff)
parent    d3b5d35290d729a2518af00feca867385a1b08fa (diff)
Merge branch 'linus' into x86/urgent, to pick up dependent commits
We are going to fix a bug introduced by a more recent commit, so refresh the tree.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--  arch/x86/kernel/cpu/centaur.c              |   2
-rw-r--r--  arch/x86/kernel/cpu/common.c               |  59
-rw-r--r--  arch/x86/kernel/cpu/intel.c                |  40
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt.c            | 350
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_rdtgroup.c   | 125
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_schemata.c   | 181
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile        |   2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/dev-mcelog.c    | 397
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h  |   8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c           | 561
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c     |   3
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c         |   6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c            |   2
-rw-r--r--  arch/x86/kernel/cpu/proc.c                 |   5
-rw-r--r--  arch/x86/kernel/cpu/scattered.c            |   1
15 files changed, 1113 insertions(+), 629 deletions(-)
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 43955ee6715b..44207b71fee1 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -3,7 +3,7 @@
#include <linux/sched/clock.h>
#include <asm/cpufeature.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 58094a1f9e9d..8ee32119144d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -448,19 +448,60 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
+/* Setup the fixmap mapping only once per-processor */
+static inline void setup_fixmap_gdt(int cpu)
+{
+#ifdef CONFIG_X86_64
+ /* On 64-bit systems, we use a read-only fixmap GDT. */
+ pgprot_t prot = PAGE_KERNEL_RO;
+#else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+ * our double fault handler uses a task gate, and entering through
+ * a task gate needs to change an available TSS to busy. If the GDT
+ * is read-only, that will triple fault.
+ *
+ * On Xen PV, the GDT must be read-only because the hypervisor requires
+ * it.
+ */
+ pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
+#endif
+
+ __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+}
+
+/* Load the original GDT from the per-cpu structure */
+void load_direct_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_direct_gdt);
+
+/* Load a fixmap remapping of the per-cpu GDT */
+void load_fixmap_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);
+
/*
* Current gdt points %fs at the "master" per-cpu area: after this,
* it's on the real one.
*/
void switch_to_new_gdt(int cpu)
{
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (long)get_cpu_gdt_table(cpu);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
+ /* Load the original GDT */
+ load_direct_gdt(cpu);
/* Reload the per-cpu base */
-
load_percpu_segment(cpu);
}
@@ -1526,6 +1567,9 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
+
+ setup_fixmap_gdt(cpu);
+ load_fixmap_gdt(cpu);
}
#else
@@ -1581,6 +1625,9 @@ void cpu_init(void)
dbg_restore_debug_regs();
fpu__init_cpu();
+
+ setup_fixmap_gdt(cpu);
+ load_fixmap_gdt(cpu);
}
#endif
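
A note on the pseudo-descriptor these helpers build: load_direct_gdt() and load_fixmap_gdt() differ only in which alias of the per-cpu GDT they point at; both hand the hardware a packed {limit, base} pair where the limit field is GDT_SIZE - 1, not GDT_SIZE. A minimal user-space sketch of that layout (GDT_ENTRIES mirrors the x86-64 value; the privileged lgdt instruction itself is only simulated with a printf):

    #include <stdint.h>
    #include <stdio.h>

    #define GDT_ENTRIES 16                 /* x86-64 GDT has 16 entries */
    #define GDT_SIZE    (GDT_ENTRIES * 8)  /* each descriptor is 8 bytes */

    struct desc_ptr {
        uint16_t size;     /* hardware wants the limit: last valid byte, size - 1 */
        uint64_t address;  /* linear address of the GDT */
    } __attribute__((packed));

    int main(void)
    {
        static uint64_t gdt[GDT_ENTRIES];  /* stand-in for the per-cpu GDT page */
        struct desc_ptr gdt_descr = {
            .size    = GDT_SIZE - 1,
            .address = (uint64_t)(uintptr_t)gdt,
        };

        /* the kernel would execute lgdt here; we just show what it loads */
        printf("lgdt: base=0x%llx limit=0x%x\n",
               (unsigned long long)gdt_descr.address, gdt_descr.size);
        return 0;
    }
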
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 063197771b8d..dfa90a3a5145 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
return;
}
- if (ring3mwait_disabled) {
- msr_clear_bit(MSR_MISC_FEATURE_ENABLES,
- MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
+ if (ring3mwait_disabled)
return;
- }
-
- msr_set_bit(MSR_MISC_FEATURE_ENABLES,
- MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+ this_cpu_or(msr_misc_features_shadow,
+ 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
if (c == &boot_cpu_data)
ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
@@ -488,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c)
init_intel_energy_perf(c);
}
+static void init_cpuid_fault(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) {
+ if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
+ set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+ }
+}
+
+static void init_intel_misc_features(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+ return;
+
+ /* Clear all MISC features */
+ this_cpu_write(msr_misc_features_shadow, 0);
+
+ /* Check features and update capabilities and shadow control bits */
+ init_cpuid_fault(c);
+ probe_xeon_phi_r3mwait(c);
+
+ msr = this_cpu_read(msr_misc_features_shadow);
+ wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
+}
+
static void init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
@@ -602,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c)
init_intel_energy_perf(c);
- probe_xeon_phi_r3mwait(c);
+ init_intel_misc_features(c);
}
#ifdef CONFIG_X86_32
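
The intel.c rework above stops poking MSR bits one probe at a time: init_intel_misc_features() zeroes a per-cpu shadow word, each probe (CPUID faulting, ring-3 MWAIT) ORs its bit into the shadow, and a single wrmsrl() commits the result. A hedged user-space sketch of that accumulate-then-write pattern (bit positions and the simulated MSR write are illustrative only):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RING3MWAIT_BIT 1   /* illustrative, not the real register layout */

    static uint64_t msr_misc_features_shadow;  /* per-cpu in the kernel */

    static void probe_ring3mwait(bool supported, bool disabled)
    {
        if (supported && !disabled)
            msr_misc_features_shadow |= 1ULL << RING3MWAIT_BIT;
    }

    int main(void)
    {
        msr_misc_features_shadow = 0;    /* clear all MISC features */
        probe_ring3mwait(true, false);   /* each probe ORs its bit in */

        /* one write at the end instead of a set/clear per feature */
        printf("wrmsr(MISC_FEATURES_ENABLES, 0x%llx)\n",
               (unsigned long long)msr_misc_features_shadow);
        return 0;
    }
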
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 5a533fefefa0..5b366462f579 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -32,55 +32,98 @@
#include <asm/intel-family.h>
#include <asm/intel_rdt.h>
+#define MAX_MBA_BW 100u
+#define MBA_IS_LINEAR 0x4
+
/* Mutex to protect rdtgroup access. */
DEFINE_MUTEX(rdtgroup_mutex);
DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+/*
+ * Used to store the max resource name width and max resource data width
+ * to display the schemata in a tabular format
+ */
+int max_name_width, max_data_width;
+
+static void
+mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
+static void
+cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
+
#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
struct rdt_resource rdt_resources_all[] = {
{
- .name = "L3",
- .domains = domain_init(RDT_RESOURCE_L3),
- .msr_base = IA32_L3_CBM_BASE,
- .min_cbm_bits = 1,
- .cache_level = 3,
- .cbm_idx_multi = 1,
- .cbm_idx_offset = 0
+ .name = "L3",
+ .domains = domain_init(RDT_RESOURCE_L3),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 1,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ },
+ {
+ .name = "L3DATA",
+ .domains = domain_init(RDT_RESOURCE_L3DATA),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
},
{
- .name = "L3DATA",
- .domains = domain_init(RDT_RESOURCE_L3DATA),
- .msr_base = IA32_L3_CBM_BASE,
- .min_cbm_bits = 1,
- .cache_level = 3,
- .cbm_idx_multi = 2,
- .cbm_idx_offset = 0
+ .name = "L3CODE",
+ .domains = domain_init(RDT_RESOURCE_L3CODE),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 1,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
},
{
- .name = "L3CODE",
- .domains = domain_init(RDT_RESOURCE_L3CODE),
- .msr_base = IA32_L3_CBM_BASE,
- .min_cbm_bits = 1,
- .cache_level = 3,
- .cbm_idx_multi = 2,
- .cbm_idx_offset = 1
+ .name = "L2",
+ .domains = domain_init(RDT_RESOURCE_L2),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 1,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
},
{
- .name = "L2",
- .domains = domain_init(RDT_RESOURCE_L2),
- .msr_base = IA32_L2_CBM_BASE,
- .min_cbm_bits = 1,
- .cache_level = 2,
- .cbm_idx_multi = 1,
- .cbm_idx_offset = 0
+ .name = "MB",
+ .domains = domain_init(RDT_RESOURCE_MBA),
+ .msr_base = IA32_MBA_THRTL_BASE,
+ .msr_update = mba_wrmsr,
+ .cache_level = 3,
+ .parse_ctrlval = parse_bw,
+ .format_str = "%d=%*d",
},
};
-static int cbm_idx(struct rdt_resource *r, int closid)
+static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
{
- return closid * r->cbm_idx_multi + r->cbm_idx_offset;
+ return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset;
}
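
With CDP, L3DATA and L3CODE share IA32_L3_CBM_BASE and interleave their MSRs: cbm_idx_mult is 2 with offset 0 for data and 1 for code, so closid N uses MSRs base+2N and base+2N+1. A standalone check of the cbm_idx() arithmetic:

    #include <stdio.h>

    struct cache_params { unsigned int mult, offset; };

    static unsigned int cbm_idx(const struct cache_params *c, unsigned int closid)
    {
        return closid * c->mult + c->offset;
    }

    int main(void)
    {
        const struct cache_params l3data = { .mult = 2, .offset = 0 };
        const struct cache_params l3code = { .mult = 2, .offset = 1 };

        for (unsigned int closid = 0; closid < 4; closid++)
            printf("closid %u: data MSR base+%u, code MSR base+%u\n",
                   closid, cbm_idx(&l3data, closid), cbm_idx(&l3code, closid));
        return 0;
    }
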
/*
@@ -118,9 +161,9 @@ static inline bool cache_alloc_hsw_probe(void)
return false;
r->num_closid = 4;
- r->cbm_len = 20;
- r->max_cbm = max_cbm;
- r->min_cbm_bits = 2;
+ r->default_ctrl = max_cbm;
+ r->cache.cbm_len = 20;
+ r->cache.min_cbm_bits = 2;
r->capable = true;
r->enabled = true;
@@ -130,16 +173,66 @@ static inline bool cache_alloc_hsw_probe(void)
return false;
}
-static void rdt_get_config(int idx, struct rdt_resource *r)
+/*
+ * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
+ * exposed to user interface and the h/w understandable delay values.
+ *
+ * The non-linear delay values have the granularity of power of two
+ * and also the h/w does not guarantee a curve for configured delay
+ * values vs. actual b/w enforced.
+ * Hence we need a mapping that is pre calibrated so the user can
+ * express the memory b/w as a percentage value.
+ */
+static inline bool rdt_get_mb_table(struct rdt_resource *r)
+{
+ /*
+ * There are no Intel SKUs as of now to support non-linear delay.
+ */
+ pr_info("MBA b/w map not implemented for cpu:%d, model:%d",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+ return false;
+}
+
+static bool rdt_get_mem_config(struct rdt_resource *r)
+{
+ union cpuid_0x10_3_eax eax;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, ecx;
+
+ cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
+ r->num_closid = edx.split.cos_max + 1;
+ r->membw.max_delay = eax.split.max_delay + 1;
+ r->default_ctrl = MAX_MBA_BW;
+ if (ecx & MBA_IS_LINEAR) {
+ r->membw.delay_linear = true;
+ r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay;
+ r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay;
+ } else {
+ if (!rdt_get_mb_table(r))
+ return false;
+ }
+ r->data_width = 3;
+ rdt_get_mba_infofile(r);
+
+ r->capable = true;
+ r->enabled = true;
+
+ return true;
+}
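
For linear-delay parts both min_bw and bw_gran come out as MAX_MBA_BW - max_delay, i.e. the smallest grantable percentage and the step size coincide. A quick sanity check with an assumed max_delay (not taken from any real SKU):

    #include <stdio.h>

    #define MAX_MBA_BW 100u

    int main(void)
    {
        unsigned int max_delay = 90;  /* assumed CPUID value, illustrative */
        unsigned int min_bw    = MAX_MBA_BW - max_delay;  /* 10% */
        unsigned int bw_gran   = MAX_MBA_BW - max_delay;  /* steps of 10% */

        printf("min_bw=%u%% bw_gran=%u%%\n", min_bw, bw_gran);
        return 0;
    }
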
+
+static void rdt_get_cache_config(int idx, struct rdt_resource *r)
{
union cpuid_0x10_1_eax eax;
- union cpuid_0x10_1_edx edx;
+ union cpuid_0x10_x_edx edx;
u32 ebx, ecx;
cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
r->num_closid = edx.split.cos_max + 1;
- r->cbm_len = eax.split.cbm_len + 1;
- r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.cbm_len = eax.split.cbm_len + 1;
+ r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->data_width = (r->cache.cbm_len + 3) / 4;
+ rdt_get_cache_infofile(r);
r->capable = true;
r->enabled = true;
}
@@ -150,8 +243,9 @@ static void rdt_get_cdp_l3_config(int type)
struct rdt_resource *r = &rdt_resources_all[type];
r->num_closid = r_l3->num_closid / 2;
- r->cbm_len = r_l3->cbm_len;
- r->max_cbm = r_l3->max_cbm;
+ r->cache.cbm_len = r_l3->cache.cbm_len;
+ r->default_ctrl = r_l3->default_ctrl;
+ r->data_width = (r->cache.cbm_len + 3) / 4;
r->capable = true;
/*
* By default, CDP is disabled. CDP can be enabled by mount parameter
@@ -160,33 +254,6 @@ static void rdt_get_cdp_l3_config(int type)
r->enabled = false;
}
-static inline bool get_rdt_resources(void)
-{
- bool ret = false;
-
- if (cache_alloc_hsw_probe())
- return true;
-
- if (!boot_cpu_has(X86_FEATURE_RDT_A))
- return false;
-
- if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
- rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
- if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
- rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
- rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
- }
- ret = true;
- }
- if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
- /* CPUID 0x10.2 fields are same format as 0x10.1 */
- rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
- ret = true;
- }
-
- return ret;
-}
-
static int get_cache_id(int cpu, int level)
{
struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
@@ -200,29 +267,55 @@ static int get_cache_id(int cpu, int level)
return -1;
}
-void rdt_cbm_update(void *arg)
+/*
+ * Map the memory b/w percentage value to delay values
+ * that can be written to QOS_MSRs.
+ * There are currently no SKUs which support non linear delay values.
+ */
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
{
- struct msr_param *m = (struct msr_param *)arg;
+ if (r->membw.delay_linear)
+ return MAX_MBA_BW - bw;
+
+ pr_warn_once("Non Linear delay-bw map not supported but queried\n");
+ return r->default_ctrl;
+}
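
delay_bw_map() inverts the user-visible percentage into the throttle value programmed into the IA32_MBA_THRTL MSRs: on linear SKUs, delay = MAX_MBA_BW - bw, so asking for 70% bandwidth writes a delay of 30. A minimal sketch:

    #include <stdio.h>

    #define MAX_MBA_BW 100u

    static unsigned int delay_bw_map(unsigned long bw, int delay_linear,
                                     unsigned int default_ctrl)
    {
        if (delay_linear)
            return MAX_MBA_BW - bw;
        return default_ctrl;  /* non-linear SKUs: fall back to no throttling */
    }

    int main(void)
    {
        printf("bw  70%% -> delay %u\n", delay_bw_map(70, 1, MAX_MBA_BW));
        printf("bw 100%% -> delay %u\n", delay_bw_map(100, 1, MAX_MBA_BW));
        return 0;
    }
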
+
+static void
+mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+{
+ unsigned int i;
+
+ /* Write the delay values for mba. */
+ for (i = m->low; i < m->high; i++)
+ wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r));
+}
+
+static void
+cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+{
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
+}
+
+void rdt_ctrl_update(void *arg)
+{
+ struct msr_param *m = arg;
struct rdt_resource *r = m->res;
- int i, cpu = smp_processor_id();
+ int cpu = smp_processor_id();
struct rdt_domain *d;
list_for_each_entry(d, &r->domains, list) {
/* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->cpu_mask))
- goto found;
+ if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
+ r->msr_update(d, m, r);
+ return;
+ }
}
- pr_info_once("cpu %d not found in any domain for resource %s\n",
+ pr_warn_once("cpu %d not found in any domain for resource %s\n",
cpu, r->name);
-
- return;
-
-found:
- for (i = m->low; i < m->high; i++) {
- int idx = cbm_idx(r, i);
-
- wrmsrl(r->msr_base + idx, d->cbm[i]);
- }
}
/*
@@ -258,6 +351,32 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
return NULL;
}
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
+{
+ struct msr_param m;
+ u32 *dc;
+ int i;
+
+ dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
+ if (!dc)
+ return -ENOMEM;
+
+ d->ctrl_val = dc;
+
+ /*
+ * Initialize the Control MSRs to having no control.
+ * For Cache Allocation: Set all bits in cbm
+ * For Memory Allocation: Set b/w requested to 100
+ */
+ for (i = 0; i < r->num_closid; i++, dc++)
+ *dc = r->default_ctrl;
+
+ m.low = 0;
+ m.high = r->num_closid;
+ r->msr_update(d, &m, r);
+ return 0;
+}
+
/*
* domain_add_cpu - Add a cpu to a resource's domain list.
*
@@ -273,7 +392,7 @@ static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
*/
static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
- int i, id = get_cache_id(cpu, r->cache_level);
+ int id = get_cache_id(cpu, r->cache_level);
struct list_head *add_pos = NULL;
struct rdt_domain *d;
@@ -294,22 +413,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
d->id = id;
- d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL);
- if (!d->cbm) {
+ if (domain_setup_ctrlval(r, d)) {
kfree(d);
return;
}
- for (i = 0; i < r->num_closid; i++) {
- int idx = cbm_idx(r, i);
-
- d->cbm[i] = r->max_cbm;
- wrmsrl(r->msr_base + idx, d->cbm[i]);
- }
-
cpumask_set_cpu(cpu, &d->cpu_mask);
list_add_tail(&d->list, add_pos);
- r->num_domains++;
}
static void domain_remove_cpu(int cpu, struct rdt_resource *r)
@@ -325,8 +435,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
- r->num_domains--;
- kfree(d->cbm);
+ kfree(d->ctrl_val);
list_del(&d->list);
kfree(d);
}
@@ -374,6 +483,57 @@ static int intel_rdt_offline_cpu(unsigned int cpu)
return 0;
}
+/*
+ * Choose a width for the resource name and resource data based on the
+ * resource that has widest name and cbm.
+ */
+static __init void rdt_init_padding(void)
+{
+ struct rdt_resource *r;
+ int cl;
+
+ for_each_capable_rdt_resource(r) {
+ cl = strlen(r->name);
+ if (cl > max_name_width)
+ max_name_width = cl;
+
+ if (r->data_width > max_data_width)
+ max_data_width = r->data_width;
+ }
+}
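
These widths feed the "%*s:" prefix in show_doms() and the per-resource format strings ("%d=%0*x" for caches, "%d=%*d" for MB), which is what lines the schemata up in columns. A small demonstration of the same printf dynamic-width trick:

    #include <stdio.h>

    int main(void)
    {
        int max_name_width = 2;  /* longest of "L3", "MB" */
        int max_data_width = 5;  /* e.g. a 20-bit CBM needs 5 hex digits */

        /* "%*s" pads the resource name; "%0*x" zero-pads the control value */
        printf("%*s:%d=%0*x\n", max_name_width, "L3", 0, max_data_width, 0xfffff);
        printf("%*s:%d=%*d\n",  max_name_width, "MB", 0, max_data_width, 100);
        return 0;
    }
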
+
+static __init bool get_rdt_resources(void)
+{
+ bool ret = false;
+
+ if (cache_alloc_hsw_probe())
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
+ rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
+ }
+ ret = true;
+ }
+ if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+ /* CPUID 0x10.2 fields are same format as 0x10.1 */
+ rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ ret = true;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_MBA)) {
+ if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
+ ret = true;
+ }
+
+ return ret;
+}
+
static int __init intel_rdt_late_init(void)
{
struct rdt_resource *r;
@@ -382,6 +542,8 @@ static int __init intel_rdt_late_init(void)
if (!get_rdt_resources())
return -ENODEV;
+ rdt_init_padding();
+
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/rdt/cat:online:",
intel_rdt_online_cpu, intel_rdt_offline_cpu);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 9ac2a5cdd9c2..f5af0cc7eb0d 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -174,6 +174,13 @@ static struct kernfs_ops rdtgroup_kf_single_ops = {
.seq_show = rdtgroup_seqfile_show,
};
+static bool is_cpu_list(struct kernfs_open_file *of)
+{
+ struct rftype *rft = of->kn->priv;
+
+ return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+}
+
static int rdtgroup_cpus_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
{
@@ -182,10 +189,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask));
- else
+ if (rdtgrp) {
+ seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(&rdtgrp->cpu_mask));
+ } else {
ret = -ENOENT;
+ }
rdtgroup_kn_unlock(of->kn);
return ret;
@@ -252,7 +261,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
goto unlock;
}
- ret = cpumask_parse(buf, newmask);
+ if (is_cpu_list(of))
+ ret = cpulist_parse(buf, newmask);
+ else
+ ret = cpumask_parse(buf, newmask);
+
if (ret)
goto unlock;
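
The new cpus_list file speaks the human-readable range form ("0-3,8") while cpus keeps the hex bitmask form, mirroring the kernel's %*pbl vs %*pb. A user-space sketch that renders one mask both ways (the range printer is a simplified stand-in for the kernel's bitmap formatting):

    #include <stdint.h>
    #include <stdio.h>

    static void print_cpulist(uint64_t mask)
    {
        const char *sep = "";
        int first = -1;

        for (int cpu = 0; cpu <= 64; cpu++) {
            int set = cpu < 64 && (mask >> cpu) & 1;

            if (set && first < 0)
                first = cpu;               /* open a new range */
            if (!set && first >= 0) {      /* close the range */
                if (cpu - 1 == first)
                    printf("%s%d", sep, first);
                else
                    printf("%s%d-%d", sep, first, cpu - 1);
                sep = ",";
                first = -1;
            }
        }
        printf("\n");
    }

    int main(void)
    {
        uint64_t mask = 0x10f;  /* CPUs 0-3 and 8 */

        printf("cpus:      %llx\n", (unsigned long long)mask);
        printf("cpus_list: ");
        print_cpulist(mask);    /* prints 0-3,8 */
        return 0;
    }
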
@@ -473,6 +486,14 @@ static struct rftype rdtgroup_base_files[] = {
.seq_show = rdtgroup_cpus_show,
},
{
+ .name = "cpus_list",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .flags = RFTYPE_FLAGS_CPUS_LIST,
+ },
+ {
.name = "tasks",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -494,32 +515,56 @@ static int rdt_num_closids_show(struct kernfs_open_file *of,
struct rdt_resource *r = of->kn->parent->priv;
seq_printf(seq, "%d\n", r->num_closid);
+ return 0;
+}
+
+static int rdt_default_ctrl_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ seq_printf(seq, "%x\n", r->default_ctrl);
return 0;
}
-static int rdt_cbm_mask_show(struct kernfs_open_file *of,
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
struct rdt_resource *r = of->kn->parent->priv;
- seq_printf(seq, "%x\n", r->max_cbm);
+ seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
+ return 0;
+}
+
+static int rdt_min_bw_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ seq_printf(seq, "%u\n", r->membw.min_bw);
return 0;
}
-static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
struct rdt_resource *r = of->kn->parent->priv;
- seq_printf(seq, "%d\n", r->min_cbm_bits);
+ seq_printf(seq, "%u\n", r->membw.bw_gran);
+ return 0;
+}
+
+static int rdt_delay_linear_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ seq_printf(seq, "%u\n", r->membw.delay_linear);
return 0;
}
/* rdtgroup information files for one cache resource. */
-static struct rftype res_info_files[] = {
+static struct rftype res_cache_info_files[] = {
{
.name = "num_closids",
.mode = 0444,
@@ -530,7 +575,7 @@ static struct rftype res_info_files[] = {
.name = "cbm_mask",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_cbm_mask_show,
+ .seq_show = rdt_default_ctrl_show,
},
{
.name = "min_cbm_bits",
@@ -540,11 +585,52 @@ static struct rftype res_info_files[] = {
},
};
+/* rdtgroup information files for memory bandwidth. */
+static struct rftype res_mba_info_files[] = {
+ {
+ .name = "num_closids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_closids_show,
+ },
+ {
+ .name = "min_bandwidth",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_bw_show,
+ },
+ {
+ .name = "bandwidth_gran",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bw_gran_show,
+ },
+ {
+ .name = "delay_linear",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_delay_linear_show,
+ },
+};
+
+void rdt_get_mba_infofile(struct rdt_resource *r)
+{
+ r->info_files = res_mba_info_files;
+ r->nr_info_files = ARRAY_SIZE(res_mba_info_files);
+}
+
+void rdt_get_cache_infofile(struct rdt_resource *r)
+{
+ r->info_files = res_cache_info_files;
+ r->nr_info_files = ARRAY_SIZE(res_cache_info_files);
+}
+
static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
{
struct kernfs_node *kn_subdir;
+ struct rftype *res_info_files;
struct rdt_resource *r;
- int ret;
+ int ret, len;
/* create the directory */
kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
@@ -563,8 +649,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
ret = rdtgroup_kn_set_ugid(kn_subdir);
if (ret)
goto out_destroy;
- ret = rdtgroup_add_files(kn_subdir, res_info_files,
- ARRAY_SIZE(res_info_files));
+
+ res_info_files = r->info_files;
+ len = r->nr_info_files;
+
+ ret = rdtgroup_add_files(kn_subdir, res_info_files, len);
if (ret)
goto out_destroy;
kernfs_activate(kn_subdir);
@@ -780,7 +869,7 @@ out:
return dentry;
}
-static int reset_all_cbms(struct rdt_resource *r)
+static int reset_all_ctrls(struct rdt_resource *r)
{
struct msr_param msr_param;
cpumask_var_t cpu_mask;
@@ -803,14 +892,14 @@ static int reset_all_cbms(struct rdt_resource *r)
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
for (i = 0; i < r->num_closid; i++)
- d->cbm[i] = r->max_cbm;
+ d->ctrl_val[i] = r->default_ctrl;
}
cpu = get_cpu();
/* Update CBM on this cpu if it's in cpu_mask. */
if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_cbm_update(&msr_param);
+ rdt_ctrl_update(&msr_param);
/* Update CBM on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
put_cpu();
free_cpumask_var(cpu_mask);
@@ -896,7 +985,7 @@ static void rdt_kill_sb(struct super_block *sb)
/*Put everything back to default values. */
for_each_enabled_rdt_resource(r)
- reset_all_cbms(r);
+ reset_all_ctrls(r);
cdp_disable();
rmdir_all_sub();
static_branch_disable(&rdt_enable_key);
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
index badd2b31a560..406d7a6532f9 100644
--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -29,26 +29,77 @@
#include <asm/intel_rdt.h>
/*
+ * Check whether MBA bandwidth percentage value is correct. The value is
+ * checked against the minimum and max bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded to the next
+ * control step available on the hardware.
+ */
+static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+{
+ unsigned long bw;
+ int ret;
+
+ /*
+ * Only linear delay values are supported for current Intel SKUs.
+ */
+ if (!r->membw.delay_linear)
+ return false;
+
+ ret = kstrtoul(buf, 10, &bw);
+ if (ret)
+ return false;
+
+ if (bw < r->membw.min_bw || bw > r->default_ctrl)
+ return false;
+
+ *data = roundup(bw, (unsigned long)r->membw.bw_gran);
+ return true;
+}
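
bw_validate() snaps the request up to the next control step with roundup(). With assumed min_bw and bw_gran of 10, a request of 42% is programmed as 50%:

    #include <stdio.h>

    /* same semantics as the kernel's roundup() for positive integers */
    #define roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

    int main(void)
    {
        unsigned long min_bw = 10, bw_gran = 10, max_bw = 100;  /* assumed */
        unsigned long req = 42;

        if (req < min_bw || req > max_bw) {
            printf("rejected\n");
            return 1;
        }
        printf("requested %lu%% -> programmed %lu%%\n", req, roundup(req, bw_gran));
        return 0;
    }
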
+
+int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+{
+ unsigned long data;
+
+ if (d->have_new_ctrl)
+ return -EINVAL;
+
+ if (!bw_validate(buf, &data, r))
+ return -EINVAL;
+ d->new_ctrl = data;
+ d->have_new_ctrl = true;
+
+ return 0;
+}
+
+/*
* Check whether a cache bit mask is valid. The SDM says:
* Please note that all (and only) contiguous '1' combinations
* are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
* Additionally Haswell requires at least two bits set.
*/
-static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
{
- unsigned long first_bit, zero_bit;
+ unsigned long first_bit, zero_bit, val;
+ unsigned int cbm_len = r->cache.cbm_len;
+ int ret;
+
+ ret = kstrtoul(buf, 16, &val);
+ if (ret)
+ return false;
- if (var == 0 || var > r->max_cbm)
+ if (val == 0 || val > r->default_ctrl)
return false;
- first_bit = find_first_bit(&var, r->cbm_len);
- zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+ first_bit = find_first_bit(&val, cbm_len);
+ zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
- if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+ if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)
return false;
- if ((zero_bit - first_bit) < r->min_cbm_bits)
+ if ((zero_bit - first_bit) < r->cache.min_cbm_bits)
return false;
+
+ *data = val;
return true;
}
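
cbm_validate() accepts only a single contiguous run of set bits of at least min_cbm_bits. An equivalent standalone check, written without the kernel bitmap helpers: after stripping trailing zeros, a mask is contiguous iff val & (val + 1) == 0.

    #include <stdio.h>

    static int cbm_ok(unsigned long val, unsigned int cbm_len, unsigned int min_bits)
    {
        unsigned long max = (1UL << cbm_len) - 1;
        unsigned int bits = 0;

        if (val == 0 || val > max)
            return 0;
        while (!(val & 1))        /* strip trailing zeros */
            val >>= 1;
        if (val & (val + 1))      /* a hole remains: reject */
            return 0;
        for (unsigned long v = val; v; v >>= 1)
            bits++;               /* length of the contiguous run */
        return bits >= min_bits;
    }

    int main(void)
    {
        printf("0x00ff0 -> %d\n", cbm_ok(0x00ff0, 20, 1));  /* 1: contiguous */
        printf("0x00f0f -> %d\n", cbm_ok(0x00f0f, 20, 1));  /* 0: has a hole */
        return 0;
    }
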
@@ -56,17 +107,17 @@ static bool cbm_validate(unsigned long var, struct rdt_resource *r)
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-static int parse_cbm(char *buf, struct rdt_resource *r)
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
{
unsigned long data;
- int ret;
- ret = kstrtoul(buf, 16, &data);
- if (ret)
- return ret;
- if (!cbm_validate(data, r))
+ if (d->have_new_ctrl)
return -EINVAL;
- r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+ if (!cbm_validate(buf, &data, r))
+ return -EINVAL;
+ d->new_ctrl = data;
+ d->have_new_ctrl = true;
return 0;
}
@@ -74,8 +125,8 @@ static int parse_cbm(char *buf, struct rdt_resource *r)
/*
* For each domain in this resource we expect to find a series of:
* id=mask
- * separated by ";". The "id" is in decimal, and must appear in the
- * right order.
+ * separated by ";". The "id" is in decimal, and must match one of
+ * the "id"s for this resource.
*/
static int parse_line(char *line, struct rdt_resource *r)
{
@@ -83,21 +134,22 @@ static int parse_line(char *line, struct rdt_resource *r)
struct rdt_domain *d;
unsigned long dom_id;
+next:
+ if (!line || line[0] == '\0')
+ return 0;
+ dom = strsep(&line, ";");
+ id = strsep(&dom, "=");
+ if (!dom || kstrtoul(id, 10, &dom_id))
+ return -EINVAL;
+ dom = strim(dom);
list_for_each_entry(d, &r->domains, list) {
- dom = strsep(&line, ";");
- if (!dom)
- return -EINVAL;
- id = strsep(&dom, "=");
- if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
- return -EINVAL;
- if (parse_cbm(dom, r))
- return -EINVAL;
+ if (d->id == dom_id) {
+ if (r->parse_ctrlval(dom, r, d))
+ return -EINVAL;
+ goto next;
+ }
}
-
- /* Any garbage at the end of the line? */
- if (line && line[0])
- return -EINVAL;
- return 0;
+ return -EINVAL;
}
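
parse_line() now tokenizes "id=mask;id=mask" with strsep() and looks each id up among the domains instead of requiring strict order. A user-space sketch of the same tokenizing, with the domain lookup replaced by a printf:

    #define _DEFAULT_SOURCE  /* for strsep() on glibc */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        char line[] = "0=fffff;1=3c0";
        char *rest = line, *dom, *id;

        while (rest && rest[0] != '\0') {
            dom = strsep(&rest, ";");  /* "0=fffff", then "1=3c0" */
            id  = strsep(&dom, "=");   /* id="0", dom="fffff" */
            if (!dom) {
                fprintf(stderr, "malformed token\n");
                return 1;
            }
            printf("domain %lu gets mask %s\n", strtoul(id, NULL, 10), dom);
        }
        return 0;
    }
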
static int update_domains(struct rdt_resource *r, int closid)
@@ -105,7 +157,7 @@ static int update_domains(struct rdt_resource *r, int closid)
struct msr_param msr_param;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
- int cpu, idx = 0;
+ int cpu;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
@@ -115,30 +167,46 @@ static int update_domains(struct rdt_resource *r, int closid)
msr_param.res = r;
list_for_each_entry(d, &r->domains, list) {
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
- d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+ if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ d->ctrl_val[closid] = d->new_ctrl;
+ }
}
+ if (cpumask_empty(cpu_mask))
+ goto done;
cpu = get_cpu();
/* Update CBM on this cpu if it's in cpu_mask. */
if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_cbm_update(&msr_param);
+ rdt_ctrl_update(&msr_param);
/* Update CBM on other cpus. */
- smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
put_cpu();
+done:
free_cpumask_var(cpu_mask);
return 0;
}
+static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
+{
+ struct rdt_resource *r;
+
+ for_each_enabled_rdt_resource(r) {
+ if (!strcmp(resname, r->name) && closid < r->num_closid)
+ return parse_line(tok, r);
+ }
+ return -EINVAL;
+}
+
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct rdtgroup *rdtgrp;
+ struct rdt_domain *dom;
struct rdt_resource *r;
char *tok, *resname;
int closid, ret = 0;
- u32 *l3_cbms = NULL;
/* Valid input requires a trailing newline */
if (nbytes == 0 || buf[nbytes - 1] != '\n')
@@ -153,44 +221,20 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
closid = rdtgrp->closid;
- /* get scratch space to save all the masks while we validate input */
for_each_enabled_rdt_resource(r) {
- r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
- GFP_KERNEL);
- if (!r->tmp_cbms) {
- ret = -ENOMEM;
- goto out;
- }
- r->num_tmp_cbms = 0;
+ list_for_each_entry(dom, &r->domains, list)
+ dom->have_new_ctrl = false;
}
while ((tok = strsep(&buf, "\n")) != NULL) {
- resname = strsep(&tok, ":");
+ resname = strim(strsep(&tok, ":"));
if (!tok) {
ret = -EINVAL;
goto out;
}
- for_each_enabled_rdt_resource(r) {
- if (!strcmp(resname, r->name) &&
- closid < r->num_closid) {
- ret = parse_line(tok, r);
- if (ret)
- goto out;
- break;
- }
- }
- if (!r->name) {
- ret = -EINVAL;
- goto out;
- }
- }
-
- /* Did the parser find all the masks we need? */
- for_each_enabled_rdt_resource(r) {
- if (r->num_tmp_cbms != r->num_domains) {
- ret = -EINVAL;
+ ret = rdtgroup_parse_resource(resname, tok, closid);
+ if (ret)
goto out;
- }
}
for_each_enabled_rdt_resource(r) {
@@ -200,10 +244,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
}
out:
- for_each_enabled_rdt_resource(r) {
- kfree(r->tmp_cbms);
- r->tmp_cbms = NULL;
- }
rdtgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
@@ -213,11 +253,12 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
struct rdt_domain *dom;
bool sep = false;
- seq_printf(s, "%s:", r->name);
+ seq_printf(s, "%*s:", max_name_width, r->name);
list_for_each_entry(dom, &r->domains, list) {
if (sep)
seq_puts(s, ";");
- seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+ seq_printf(s, r->format_str, dom->id, max_data_width,
+ dom->ctrl_val[closid]);
sep = true;
}
seq_puts(s, "\n");
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index a3311c886194..43051f0777d4 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
obj-$(CONFIG_ACPI_APEI) += mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644
index 000000000000..9c632cb88546
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -0,0 +1,397 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+#define mce_log_get_idx_check(p) \
+({ \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&mce_chrdev_read_mutex), \
+ "suspicious mce_log_get_idx_check() usage"); \
+ smp_load_acquire(&(p)); \
+})
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+/* User mode helper program triggered by machine check event */
+extern char mce_helper[128];
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned int next, entry;
+
+ wmb();
+ for (;;) {
+ entry = mce_log_get_idx_check(mcelog.next);
+ for (;;) {
+
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW,
+ (unsigned long *)&mcelog.flags);
+ return NOTIFY_OK;
+ }
+ /* Old left over entry. Skip: */
+ if (mcelog.entry[entry].finished) {
+ entry++;
+ continue;
+ }
+ break;
+ }
+ smp_rmb();
+ next = entry + 1;
+ if (cmpxchg(&mcelog.next, entry, next) == entry)
+ break;
+ }
+ memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ wmb();
+ mcelog.entry[entry].finished = 1;
+ wmb();
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+ return NOTIFY_OK;
+}
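
The slot-claim loop above is lockless: read mcelog.next, validate the slot, then cmpxchg() the index forward and retry if another CPU raced in between. A sketch of that loop using compiler builtins in place of the kernel's cmpxchg():

    #include <stdio.h>

    #define LOG_LEN 32
    static unsigned int log_next;

    static int claim_slot(void)
    {
        unsigned int entry, next;

        for (;;) {
            entry = __atomic_load_n(&log_next, __ATOMIC_ACQUIRE);
            if (entry >= LOG_LEN)
                return -1;  /* buffer full: discard new entries */
            next = entry + 1;
            /* only one racing CPU moves log_next from entry to next */
            if (__sync_val_compare_and_swap(&log_next, entry, next) == entry)
                return entry;
        }
    }

    int main(void)
    {
        printf("claimed slot %d\n", claim_slot());  /* 0 */
        printf("claimed slot %d\n", claim_slot());  /* 1 */
        return 0;
    }
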
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strncpy(mce_helper, buf, sizeof(mce_helper));
+ mce_helper[sizeof(mce_helper)-1] = 0;
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static void collect_tscs(void *data)
+{
+ unsigned long *cpu_tsc = (unsigned long *)data;
+
+ cpu_tsc[smp_processor_id()] = rdtsc();
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * In fact, we should have cleared the record after that has
+ * been flushed to the disk or sent to network in
+ * /sbin/mcelog, but we have no interface to support that now,
+ * so just clear it to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned long *cpu_tsc;
+ unsigned prev, next;
+ int i, err;
+
+ cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+ if (!cpu_tsc)
+ return -ENOMEM;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
+ next = mce_log_get_idx_check(mcelog.next);
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ goto out;
+
+ err = 0;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+ struct mce *m = &mcelog.entry[i];
+
+ while (!m->finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(m, 0, sizeof(*m));
+ goto timeout;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+timeout:
+ ;
+ }
+
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+
+ synchronize_sched();
+
+ /*
+ * Collect entries that were still getting written before the
+ * synchronize.
+ */
+ on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+ for (i = next; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+
+ if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+ err |= copy_to_user(buf, m, sizeof(*m));
+ smp_rmb();
+ buf += sizeof(*m);
+ memset(m, 0, sizeof(*m));
+ }
+ }
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mce_chrdev_read_mutex);
+ kfree(cpu_tsc);
+
+ return err ? err : buf - ubuf;
+}
+
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (READ_ONCE(mcelog.next))
+ return POLLIN | POLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = mcelog.flags;
+ } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+ const char __user *ubuf,
+ size_t usize, loff_t *off))
+{
+ mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ if (mce_write)
+ return mce_write(filp, ubuf, usize, off);
+ else
+ return -EINVAL;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+ int err;
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err) {
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ return err;
+ }
+ mce_register_decode_chain(&dev_mcelog_nb);
+ return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 19592ba1a320..654ad0668d72 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
m1->addr != m2->addr ||
m1->misc != m2->misc;
}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+extern void mce_work_trigger(void);
+#else
+static inline void mce_work_trigger(void) { }
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index af44ebeb593f..5abd4bf73d6e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -35,6 +35,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
+#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -49,20 +50,11 @@
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/reboot.h>
#include "mce-internal.h"
-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-static int mce_chrdev_open_count; /* #times opened */
-
-#define mce_log_get_idx_check(p) \
-({ \
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
- !lockdep_is_held(&mce_chrdev_read_mutex), \
- "suspicious mce_log_get_idx_check() usage"); \
- smp_load_acquire(&(p)); \
-})
+static DEFINE_MUTEX(mce_log_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -87,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1
};
-/* User mode helper program triggered by machine check event */
-static unsigned long mce_need_notify;
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
static DEFINE_PER_CPU(struct mce, mces_seen);
-static int cpu_missing;
+static unsigned long mce_need_notify;
+static int cpu_missing;
/*
* MCA banks polled by the period polling timer for corrected events.
@@ -145,80 +131,36 @@ void mce_setup(struct mce *m)
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
-
-void mce_log(struct mce *mce)
+void mce_log(struct mce *m)
{
- unsigned next, entry;
-
- /* Emit the trace record: */
- trace_mce_record(mce);
-
- if (!mce_gen_pool_add(mce))
+ if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);
-
- wmb();
- for (;;) {
- entry = mce_log_get_idx_check(mcelog.next);
- for (;;) {
-
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
- set_bit(0, &mce_need_notify);
}
void mce_inject_log(struct mce *m)
{
- mutex_lock(&mce_chrdev_read_mutex);
+ mutex_lock(&mce_log_mutex);
mce_log(m);
- mutex_unlock(&mce_chrdev_read_mutex);
+ mutex_unlock(&mce_log_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
+/*
+ * We run the default notifier if we have only the SRAO, the first and the
+ * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS 3
static atomic_t num_notifiers;
void mce_register_decode_chain(struct notifier_block *nb)
{
- atomic_inc(&num_notifiers);
+ if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+ return;
- WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
+ atomic_inc(&num_notifiers);
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
@@ -510,7 +452,6 @@ static void mce_schedule_work(void)
static void mce_irq_work_cb(struct irq_work *entry)
{
- mce_notify_irq();
mce_schedule_work();
}
@@ -539,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs)
*/
static int mce_usable_address(struct mce *m)
{
- if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+ if (!(m->status & MCI_STATUS_ADDRV))
return 0;
/* Checks after this one are Intel-specific: */
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 1;
+ if (!(m->status & MCI_STATUS_MISCV))
+ return 0;
+
if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0;
+
if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0;
+
return 1;
}
+static bool memory_error(struct mce *m)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ /* ErrCodeExt[20:16] */
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ return (xec == 0x0 || xec == 0x8);
+ } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+ *
+ * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+ * indicating a memory error. Bit 8 is used for indicating a
+ * cache hierarchy error. The combination of bit 2 and bit 3
+ * is used for indicating a `generic' cache hierarchy error
+ * But we can't just blindly check the above bits, because if
+ * bit 11 is set, then it is a bus/interconnect error - and
+ * either way the above bits just give more detail on what
+ * bus/interconnect error happened. Note that bit 12 can be
+ * ignored, as it's the "filter" bit.
+ */
+ return (m->status & 0xef80) == BIT(7) ||
+ (m->status & 0xef00) == BIT(8) ||
+ (m->status & 0xeffc) == 0xc;
+ }
+
+ return false;
+}
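
The Intel branch matches the MCACOD compound error code as the comment describes. A standalone evaluation of those mask tests against two illustrative status values:

    #include <stdint.h>
    #include <stdio.h>

    #define BIT(n) (1ULL << (n))

    static int intel_memory_error(uint64_t status)
    {
        return (status & 0xef80) == BIT(7) ||  /* memory error */
               (status & 0xef00) == BIT(8) ||  /* cache hierarchy error */
               (status & 0xeffc) == 0xc;       /* generic cache hierarchy */
    }

    int main(void)
    {
        /* illustrative MCACOD values, not from a real machine check */
        printf("status 0x090 -> %d\n", intel_memory_error(0x090));  /* 1 */
        printf("status 0x400 -> %d\n", intel_memory_error(0x400));  /* 0 */
        return 0;
    }
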
+
+static bool cec_add_mce(struct mce *m)
+{
+ if (!m)
+ return false;
+
+ /* We eat only correctable DRAM errors with usable addresses. */
+ if (memory_error(m) &&
+ !(m->status & MCI_STATUS_UC) &&
+ mce_usable_address(m))
+ if (!cec_add_elem(m->addr >> PAGE_SHIFT))
+ return true;
+
+ return false;
+}
+
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ if (cec_add_mce(m))
+ return NOTIFY_STOP;
+
+ /* Emit the trace record: */
+ trace_mce_record(m);
+
+ set_bit(0, &mce_need_notify);
+
+ mce_notify_irq();
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block first_nb = {
+ .notifier_call = mce_first_notifier,
+ .priority = MCE_PRIO_FIRST,
+};
+
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -582,15 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
if (!m)
return NOTIFY_DONE;
- /*
- * Run the default notifier if we have only the SRAO
- * notifier and us registered.
- */
- if (atomic_read(&num_notifiers) > 2)
- return NOTIFY_DONE;
-
- /* Don't print when mcelog is running */
- if (mce_chrdev_open_count > 0)
+ if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
return NOTIFY_DONE;
__print_mce(m);
@@ -643,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i)
}
}
-static bool memory_error(struct mce *m)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86_vendor == X86_VENDOR_AMD) {
- /* ErrCodeExt[20:16] */
- u8 xec = (m->status >> 16) & 0x1f;
-
- return (xec == 0x0 || xec == 0x8);
- } else if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
- * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
- *
- * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
- * indicating a memory error. Bit 8 is used for indicating a
- * cache hierarchy error. The combination of bit 2 and bit 3
- * is used for indicating a `generic' cache hierarchy error
- * But we can't just blindly check the above bits, because if
- * bit 11 is set, then it is a bus/interconnect error - and
- * either way the above bits just gives more detail on what
- * bus/interconnect error happened. Note that bit 12 can be
- * ignored, as it's the "filter" bit.
- */
- return (m->status & 0xef80) == BIT(7) ||
- (m->status & 0xef00) == BIT(8) ||
- (m->status & 0xeffc) == 0xc;
- }
-
- return false;
-}
-
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
@@ -1122,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* on Intel.
*/
int lmce = 1;
+ int cpu = smp_processor_id();
- /* If this CPU is offline, just bail out. */
- if (cpu_is_offline(smp_processor_id())) {
+ /*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+ if (cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -1394,13 +1386,6 @@ static void mce_timer_delete_all(void)
del_timer_sync(&per_cpu(mce_timer, cpu));
}
-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
/*
* Notify the user(s) about new machine check events.
* Can be called from interrupt context, but not from machine check/NMI
@@ -1412,11 +1397,7 @@ int mce_notify_irq(void)
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
if (test_and_clear_bit(0, &mce_need_notify)) {
- /* wake processes polling /dev/mcelog */
- wake_up_interruptible(&mce_chrdev_wait);
-
- if (mce_helper[0])
- schedule_work(&mce_trigger_work);
+ mce_work_trigger();
if (__ratelimit(&ratelimit))
pr_info(HW_ERR "Machine check events logged\n");
@@ -1683,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
return 0;
}
-static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+/*
+ * Init basic CPU features needed for early decoding of MCEs.
+ */
+static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- mce_intel_feature_init(c);
- mce_adjust_timer = cmci_intel_adjust_timer;
- break;
-
- case X86_VENDOR_AMD: {
+ if (c->x86_vendor == X86_VENDOR_AMD) {
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
- /*
- * Install proper ops for Scalable MCA enabled processors
- */
if (mce_flags.smca) {
msr_ops.ctl = smca_ctl_reg;
msr_ops.status = smca_status_reg;
msr_ops.addr = smca_addr_reg;
msr_ops.misc = smca_misc_reg;
}
- mce_amd_feature_init(c);
+ }
+}
+
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ mce_adjust_timer = cmci_intel_adjust_timer;
+ break;
+ case X86_VENDOR_AMD: {
+ mce_amd_feature_init(c);
break;
}
@@ -1793,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
machine_check_vector = do_machine_check;
+ __mcheck_cpu_init_early(c);
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks();
@@ -1818,251 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
}
-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_exclu; /* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- if (mce_chrdev_open_exclu ||
- (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_chrdev_state_lock);
-
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- mce_chrdev_open_exclu = 1;
- mce_chrdev_open_count++;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- mce_chrdev_open_count--;
- mce_chrdev_open_exclu = 0;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- cpu_tsc[smp_processor_id()] = rdtsc();
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
- int rc;
- u64 record_id;
- struct mce m;
-
- if (usize < sizeof(struct mce))
- return -EINVAL;
-
- rc = apei_read_mce(&m, &record_id);
- /* Error or no more MCE record */
- if (rc <= 0) {
- mce_apei_read_done = 1;
- /*
- * When ERST is disabled, mce_chrdev_read() should return
- * "no record" instead of "no device."
- */
- if (rc == -ENODEV)
- return 0;
- return rc;
- }
- rc = -EFAULT;
- if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
- return rc;
- /*
- * In fact, we should have cleared the record after that has
- * been flushed to the disk or sent to network in
- * /sbin/mcelog, but we have no interface to support that now,
- * so just clear it to avoid duplication.
- */
- rc = apei_clear_mce(record_id);
- if (rc) {
- mce_apei_read_done = 1;
- return rc;
- }
- *ubuf += sizeof(struct mce);
-
- return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
- size_t usize, loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_chrdev_read_mutex);
-
- if (!mce_apei_read_done) {
- err = __mce_read_apei(&buf, usize);
- if (err || buf != ubuf)
- goto out;
- }
-
- next = mce_log_get_idx_check(mcelog.next);
-
- /* Only supports full reads right now */
- err = -EINVAL;
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
- goto out;
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
- struct mce *m = &mcelog.entry[i];
-
- while (!m->finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(m, 0, sizeof(*m));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, m, sizeof(*m));
- buf += sizeof(*m);
-timeout:
- ;
- }
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
-
- if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
- err |= copy_to_user(buf, m, sizeof(*m));
- smp_rmb();
- buf += sizeof(*m);
- memset(m, 0, sizeof(*m));
- }
- }
-
- if (err)
- err = -EFAULT;
-
-out:
- mutex_unlock(&mce_chrdev_read_mutex);
- kfree(cpu_tsc);
-
- return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_chrdev_wait, wait);
- if (READ_ONCE(mcelog.next))
- return POLLIN | POLLRDNORM;
- if (!mce_apei_read_done && apei_check_mce())
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
- unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off);
-
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off))
-{
- mce_write = fn;
-}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
-
-static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
-{
- if (mce_write)
- return mce_write(filp, ubuf, usize, off);
- else
- return -EINVAL;
-}
-
-static const struct file_operations mce_chrdev_ops = {
- .open = mce_chrdev_open,
- .release = mce_chrdev_release,
- .read = mce_chrdev_read,
- .write = mce_chrdev_write,
- .poll = mce_chrdev_poll,
- .unlocked_ioctl = mce_chrdev_ioctl,
- .llseek = no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
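The block removed above is the legacy /dev/mcelog character device, which this
series moves into mcheck/dev-mcelog.c under CONFIG_X86_MCELOG_LEGACY rather
than deleting it outright. For orientation, a minimal, hypothetical userspace
consumer of that interface could look like the sketch below; mcelog(8) is the
real client, and the buffer sizing mirrors the full-read requirement that
mce_chrdev_read() enforces:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <asm/mce.h>	/* uapi: struct mce, MCE_GET_* ioctls */

    int main(void)
    {
    	int fd, reclen = 0, loglen = 0;
    	char *buf;
    	ssize_t n;

    	fd = open("/dev/mcelog", O_RDONLY);
    	if (fd < 0)
    		return 1;

    	/* The driver reports record and log sizes via ioctl. */
    	if (ioctl(fd, MCE_GET_RECORD_LEN, &reclen) < 0 ||
    	    ioctl(fd, MCE_GET_LOG_LEN, &loglen) < 0)
    		return 1;

    	/* mce_chrdev_read() only accepts full reads of the whole log. */
    	buf = malloc((size_t)reclen * loglen);
    	if (!buf)
    		return 1;

    	n = read(fd, buf, (size_t)reclen * loglen);
    	if (n > 0)
    		printf("read %zd bytes (%zd records)\n", n, n / reclen);

    	free(buf);
    	close(fd);
    	return 0;
    }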
static void __mce_disable_bank(void *arg)
{
int bank = *((int *)arg);
@@ -2136,6 +1878,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mce_register_decode_chain(&first_nb);
mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
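mcheck_init() now registers an additional notifier, first_nb, on the MCE
decode chain next to the existing SRAO and default entries; every logged MCE
is handed to the chain's subscribers in priority order. A hedged sketch of
such a subscriber follows; the callback, its message, and its use of
MCE_PRIO_LOWEST are illustrative only, not part of this patch:

    #include <linux/kernel.h>
    #include <linux/notifier.h>
    #include <asm/mce.h>

    static int example_mce_notify(struct notifier_block *nb,
    			      unsigned long val, void *data)
    {
    	struct mce *m = data;

    	if (!m)
    		return NOTIFY_DONE;

    	pr_info("MCE: cpu %u bank %d status 0x%llx\n",
    		m->extcpu, m->bank, m->status);
    	return NOTIFY_DONE;
    }

    static struct notifier_block example_nb = {
    	.notifier_call	= example_mce_notify,
    	.priority	= MCE_PRIO_LOWEST,	/* run after the built-in handlers */
    };

    /* from some init path: mce_register_decode_chain(&example_nb); */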
@@ -2280,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return size;
}
-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
- strcpy(buf, mce_helper);
- strcat(buf, "\n");
- return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
- const char *buf, size_t siz)
-{
- char *p;
-
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
- p = strchr(mce_helper, '\n');
-
- if (p)
- *p = 0;
-
- return strlen(mce_helper) + !!p;
-}
-
static ssize_t set_ignore_ce(struct device *s,
struct device_attribute *attr,
const char *buf, size_t size)
@@ -2359,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s,
return ret;
}
-static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
@@ -2382,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr,
&dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
&dev_attr_trigger,
+#endif
&dev_attr_monarch_timeout.attr,
&dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr,
@@ -2556,7 +2277,6 @@ static __init void mce_init_banks(void)
static __init int mcheck_init_device(void)
{
- enum cpuhp_state hp_online;
int err;
if (!mce_available(&boot_cpu_data)) {
@@ -2584,21 +2304,11 @@ static __init int mcheck_init_device(void)
mce_cpu_online, mce_cpu_pre_down);
if (err < 0)
goto err_out_online;
- hp_online = err;
register_syscore_ops(&mce_syscore_ops);
- /* register character device /dev/mcelog */
- err = misc_register(&mce_chrdev_device);
- if (err)
- goto err_register;
-
return 0;
-err_register:
- unregister_syscore_ops(&mce_syscore_ops);
- cpuhp_remove_state(hp_online);
-
err_out_online:
cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
@@ -2606,7 +2316,7 @@ err_out_mem:
free_cpumask_var(mce_device_initialized);
err_out:
- pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ pr_err("Unable to init MCE device (rc: %d)\n", err);
return err;
}
@@ -2685,6 +2395,7 @@ static int __init mcheck_late_init(void)
static_branch_inc(&mcsafe_key);
mcheck_debugfs_init();
+ cec_init();
/*
* Flush out everything that has been logged during early boot, now that
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 190b3e6cef4d..e84db79ef272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_XEON_D:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_XEON_PHI_KNL:
+ case INTEL_FAM6_XEON_PHI_KNM:
+
if (rdmsrl_safe(MSR_PPIN_CTL, &val))
return;
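The two new case labels extend the PPIN allowlist to Knights Landing and
Knights Mill. The rdmsrl_safe() call in the surrounding context is the usual
pattern for probing an MSR that may not exist: it catches the #GP from an
unimplemented MSR and returns nonzero instead of oopsing. An illustrative
fragment, not part of the patch:

    #include <linux/types.h>
    #include <asm/msr.h>

    static bool have_ppin_ctl(void)
    {
    	u64 val;

    	/*
    	 * Nonzero return means the read faulted. intel_ppin_init() still
    	 * gates this behind known models, since another model could reuse
    	 * the MSR number for something unrelated.
    	 */
    	return !rdmsrl_safe(MSR_PPIN_CTL, &val);
    }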
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 3b442b64c72d..765afd599039 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -27,7 +27,7 @@
#include <linux/range.h>
#include <asm/processor.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
@@ -860,7 +860,7 @@ real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
trim_size <<= PAGE_SHIFT;
trim_size -= trim_start;
- return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED);
+ return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED);
}
/**
@@ -978,7 +978,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
WARN_ON(1);
pr_info("update e820 for mtrr\n");
- update_e820();
+ e820__update_table_print();
return 1;
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 24e87e74990d..2bce84d91c2b 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -48,7 +48,7 @@
#include <linux/syscore_ops.h>
#include <asm/cpufeature.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/pat.h>
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 18ca99f2798b..6df621ae62a7 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -31,14 +31,13 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
- "wp\t\t: %s\n",
+ "wp\t\t: yes\n",
static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
- c->cpuid_level,
- c->wp_works_ok ? "yes" : "no");
+ c->cpuid_level);
}
#else
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d9794060fe22..23c23508c012 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
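The new scattered-feature entry decodes as "CPUID leaf 0x10, subleaf 0, EBX
bit 3 => X86_FEATURE_MBA", wiring up detection of Intel Memory Bandwidth
Allocation alongside the existing CAT/CDP bits. Expanded by hand, here is a
sketch of what the generic table walk in init_scattered_cpuid_features() does
for this one entry; the helper name is made up, and the real code loops over
cpuid_bits[] instead of open-coding a single leaf:

    #include <linux/bitops.h>
    #include <asm/processor.h>

    static void detect_mba(struct cpuinfo_x86 *c)
    {
    	u32 eax, ebx, ecx, edx;

    	/* Leaf 0x10 (RDT allocation), subleaf 0, EBX bit 3 => MBA */
    	cpuid_count(0x00000010, 0, &eax, &ebx, &ecx, &edx);
    	if (ebx & BIT(3))
    		set_cpu_cap(c, X86_FEATURE_MBA);
    }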