diff options
-rw-r--r-- | arch/powerpc/include/asm/mce.h | 1 | ||||
-rw-r--r-- | arch/powerpc/kernel/mce.c | 27 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal.c | 43 |
3 files changed, 70 insertions, 1 deletions
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 3276b409299c..a2b8c7b35fba 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -193,5 +193,6 @@ extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_process_queued_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt); +extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 1c6d15701c56..c0c52ec1fca7 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -316,3 +316,30 @@ void machine_check_print_event_info(struct machine_check_event *evt) break; } } + +uint64_t get_mce_fault_addr(struct machine_check_event *evt) +{ + switch (evt->error_type) { + case MCE_ERROR_TYPE_UE: + if (evt->u.ue_error.effective_address_provided) + return evt->u.ue_error.effective_address; + break; + case MCE_ERROR_TYPE_SLB: + if (evt->u.slb_error.effective_address_provided) + return evt->u.slb_error.effective_address; + break; + case MCE_ERROR_TYPE_ERAT: + if (evt->u.erat_error.effective_address_provided) + return evt->u.erat_error.effective_address; + break; + case MCE_ERROR_TYPE_TLB: + if (evt->u.tlb_error.effective_address_provided) + return evt->u.tlb_error.effective_address; + break; + default: + case MCE_ERROR_TYPE_UNKNOWN: + break; + } + return 0; +} +EXPORT_SYMBOL(get_mce_fault_addr); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index f348bd49487c..01e74cbc67e9 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/slab.h> +#include <linux/sched.h> #include <linux/kobject.h> #include <asm/opal.h> #include <asm/firmware.h> @@ -251,6 +252,44 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) return written; } +static int opal_recover_mce(struct pt_regs *regs, + struct machine_check_event *evt) +{ + int recovered = 0; + uint64_t ea = get_mce_fault_addr(evt); + + if (!(regs->msr & MSR_RI)) { + /* If MSR_RI isn't set, we cannot recover */ + recovered = 0; + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { + /* Platform corrected itself */ + recovered = 1; + } else if (ea && !is_kernel_addr(ea)) { + /* + * Faulting address is not in kernel text. We should be fine. + * We need to find which process uses this address. + * For now, kill the task if we have received exception when + * in userspace. + * + * TODO: Queue up this address for hwpoisioning later. + */ + if (user_mode(regs) && !is_global_init(current)) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else + recovered = 0; + } else if (user_mode(regs) && !is_global_init(current) && + evt->severity == MCE_SEV_ERROR_SYNC) { + /* + * If we have received a synchronous error when in userspace + * kill the task. + */ + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } + return recovered; +} + int opal_machine_check(struct pt_regs *regs) { struct machine_check_event evt; @@ -266,7 +305,9 @@ int opal_machine_check(struct pt_regs *regs) } machine_check_print_event_info(&evt); - return evt.severity == MCE_SEV_FATAL ? 0 : 1; + if (opal_recover_mce(regs, &evt)) + return 1; + return 0; } static irqreturn_t opal_interrupt(int irq, void *data) |