diff options
author | Nicholas Piggin <npiggin@gmail.com> | 2021-05-18 00:03:55 +1000 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2021-06-21 21:13:19 +1000 |
commit | 3729e0ec59a20825bd4c8c70996b2df63915e1dd (patch) | |
tree | 32167c2bd0d15b40db4515e974deec8358dd2bb7 /arch | |
parent | 99cd49bb39516d1beb1c38ae629b15ccb923198c (diff) |
powerpc/powernv: Fix machine check reporting of async store errors
POWER9 and POWER10 asynchronous machine checks due to stores have their
cause reported in SRR1 but SRR1[42] is set, which in other cases
indicates DSISR cause.
Check for these cases and clear SRR1[42], so the cause matching uses
the i-side (SRR1) table.
Fixes: 7b9f71f974a1 ("powerpc/64s: POWER9 machine check handler")
Fixes: 201220bb0e8c ("powerpc/powernv: Machine check handler for POWER10")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210517140355.2325406-1-npiggin@gmail.com
Diffstat (limited to 'arch')
-rw-r--r-- | arch/powerpc/kernel/mce_power.c | 48 |
1 files changed, 40 insertions, 8 deletions
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 7b76a9bf324f..c2f55fe7092d 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -481,12 +481,11 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, return -1; } -static int mce_handle_ierror(struct pt_regs *regs, +static int mce_handle_ierror(struct pt_regs *regs, unsigned long srr1, const struct mce_ierror_table table[], struct mce_error_info *mce_err, uint64_t *addr, uint64_t *phys_addr) { - uint64_t srr1 = regs->msr; int handled = 0; int i; @@ -695,19 +694,19 @@ static long mce_handle_ue_error(struct pt_regs *regs, } static long mce_handle_error(struct pt_regs *regs, + unsigned long srr1, const struct mce_derror_table dtable[], const struct mce_ierror_table itable[]) { struct mce_error_info mce_err = { 0 }; uint64_t addr, phys_addr = ULONG_MAX; - uint64_t srr1 = regs->msr; long handled; if (SRR1_MC_LOADSTORE(srr1)) handled = mce_handle_derror(regs, dtable, &mce_err, &addr, &phys_addr); else - handled = mce_handle_ierror(regs, itable, &mce_err, &addr, + handled = mce_handle_ierror(regs, srr1, itable, &mce_err, &addr, &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) @@ -723,16 +722,20 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs) /* P7 DD1 leaves top bits of DSISR undefined */ regs->dsisr &= 0x0000ffff; - return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table); + return mce_handle_error(regs, regs->msr, + mce_p7_derror_table, mce_p7_ierror_table); } long __machine_check_early_realmode_p8(struct pt_regs *regs) { - return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table); + return mce_handle_error(regs, regs->msr, + mce_p8_derror_table, mce_p8_ierror_table); } long __machine_check_early_realmode_p9(struct pt_regs *regs) { + unsigned long srr1 = regs->msr; + /* * On POWER9 DD2.1 and below, it's possible to get a machine check * caused by a paste instruction where only DSISR bit 25 is set. This @@ -746,10 +749,39 @@ long __machine_check_early_realmode_p9(struct pt_regs *regs) if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000) return 1; - return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); + /* + * Async machine check due to bad real address from store or foreign + * link time out comes with the load/store bit (PPC bit 42) set in + * SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're + * directed to the ierror table so it will find the cause (which + * describes it correctly as a store error). + */ + if (SRR1_MC_LOADSTORE(srr1) && + ((srr1 & 0x081c0000) == 0x08140000 || + (srr1 & 0x081c0000) == 0x08180000)) { + srr1 &= ~PPC_BIT(42); + } + + return mce_handle_error(regs, srr1, + mce_p9_derror_table, mce_p9_ierror_table); } long __machine_check_early_realmode_p10(struct pt_regs *regs) { - return mce_handle_error(regs, mce_p10_derror_table, mce_p10_ierror_table); + unsigned long srr1 = regs->msr; + + /* + * Async machine check due to bad real address from store comes with + * the load/store bit (PPC bit 42) set in SRR1, but the cause comes in + * SRR1 not DSISR. Clear bit 42 so we're directed to the ierror table + * so it will find the cause (which describes it correctly as a store + * error). + */ + if (SRR1_MC_LOADSTORE(srr1) && + (srr1 & 0x081c0000) == 0x08140000) { + srr1 &= ~PPC_BIT(42); + } + + return mce_handle_error(regs, srr1, + mce_p10_derror_table, mce_p10_ierror_table); } |