diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 23 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 60 |
3 files changed, 74 insertions, 13 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 09edd0b65fef..10b46906767f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -3,6 +3,8 @@ enum severity_level { MCE_NO_SEVERITY, + MCE_DEFERRED_SEVERITY, + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, MCE_KEEP_SEVERITY, MCE_SOME_SEVERITY, MCE_AO_SEVERITY, @@ -21,7 +23,7 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; -int mce_severity(struct mce *a, int tolerant, char **msg); +int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c370e1c4468b..8bb433043a7f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -31,6 +31,7 @@ enum context { IN_KERNEL = 1, IN_USER = 2 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; +enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; static struct severity { u64 mask; @@ -40,6 +41,7 @@ static struct severity { unsigned char mcgres; unsigned char ser; unsigned char context; + unsigned char excp; unsigned char covered; char *msg; } severities[] = { @@ -48,6 +50,8 @@ static struct severity { #define USER .context = IN_USER #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER +#define EXCP .excp = EXCP_CONTEXT +#define NOEXCP .excp = NO_EXCP #define BITCLR(x) .mask = x, .result = 0 #define BITSET(x) .mask = x, .result = x #define MCGMASK(x, y) .mcgmask = x, .mcgres = y @@ -62,7 +66,7 @@ static struct severity { ), MCESEV( NO, "Not enabled", - BITCLR(MCI_STATUS_EN) + EXCP, BITCLR(MCI_STATUS_EN) ), MCESEV( PANIC, "Processor context corrupt", @@ -71,16 +75,20 @@ static struct severity { /* When MCIP is not set something is very confused */ MCESEV( PANIC, "MCIP not set in MCA handler", - MCGMASK(MCG_STATUS_MCIP, 0) + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) ), /* Neither return not error IP -- no chance to recover -> PANIC */ MCESEV( PANIC, "Neither restart nor error IP", - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) ), MCESEV( PANIC, "In kernel and no restart IP", - KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + DEFERRED, "Deferred error", + NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), MCESEV( KEEP, "Corrected error", @@ -89,7 +97,7 @@ static struct severity { /* ignore OVER for UCNA */ MCESEV( - KEEP, "Uncorrected no action required", + UCNA, "Uncorrected no action required", SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) ), MCESEV( @@ -178,8 +186,9 @@ static int error_context(struct mce *m) return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *m, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) { + enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); struct severity *s; @@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg) continue; if (s->context && ctx != s->context) continue; + if (s->excp && excp != s->excp) + continue; if (msg) *msg = s->msg; s->covered = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 61a9668cebfd..cfb16f631d52 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i) } } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* + * coming soon + */ + return false; + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; + int severity; int i; this_cpu_inc(mce_poll_count); @@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; + + severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + + /* + * In the cases where we don't have a valid address after all, + * do not add it into the ring buffer. + */ + if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { + if (m.status & MCI_STATUS_ADDRV) { + mce_ring_add(m.addr >> PAGE_SHIFT); + mce_schedule_work(); + } + } + /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -668,7 +714,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) + if (mce_severity(m, mca_cfg.tolerant, msg, true) >= + MCE_PANIC_SEVERITY) ret = 1; } return ret; @@ -754,7 +801,7 @@ static void mce_reign(void) for_each_possible_cpu(cpu) { int severity = mce_severity(&per_cpu(mces_seen, cpu), mca_cfg.tolerant, - &nmsg); + &nmsg, true); if (severity > global_worst) { msg = nmsg; global_worst = severity; @@ -1095,13 +1142,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(&m, cfg->tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL, true); /* - * When machine check was for corrected handler don't touch, - * unless we're panicing. + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicing. */ - if (severity == MCE_KEEP_SEVERITY && !no_way_out) + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) continue; __set_bit(i, toclear); if (severity == MCE_NO_SEVERITY) { |