From 039d7af651ba414b468bbb22d5d1a76169f81c0d Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 31 Jul 2017 02:06:51 +0800 Subject: EDAC, sb_edac: Classify memory mirroring modes Basically, there are full memory mirroring and address range partial memory mirroring (supported by Haswell EX and Broadwell EX) modes. a) In full memory mirroring, the memory behind each memory controller is mirrored, i.e. the memory is split into two identical mirrors (primary and secondary), half of the memory is reserved for redundancy. b) In address range partial memory mirroring, the memory size (range) of primary and secondary behind each memory controller can be user defined by the TAD0 register. The rest of memory ranges defined by TAD1/TAD2/... in that memory controller are non-mirrored. For more detail on memory mirroring, see the following link written by Tony Luck: https://01.org/lkp/blogs/tonyluck/2016/address-range-partial-memory-mirroring-linux Currently the sb_edac driver only supports address decoding in full memory mirroring and non-mirroring modes. In address range partial memory mirroring mode, it may fail to decode an address that falls in a non-mirroring area (the following was one of this kind of failed logs). mce: Uncorrected hardware memory error in user-access at 566d53a400 Memory failure: 0x566d53a: Killing einj_mem_uc:4647 due to hardware memory corruption Memory failure: 0x566d53a: recovery action for dirty LRU page: Recovered mce: [Hardware Error]: Machine check events logged EDAC sbridge MC1: HANDLING MCE MEMORY ERROR EDAC sbridge MC1: CPU 48: Machine Check Event: 0 Bank 7: ec00000000010090 EDAC sbridge MC1: TSC 4b914aa5a99dab EDAC sbridge MC1: ADDR 566d53a400 EDAC sbridge MC1: MISC 1443a0c86 EDAC sbridge MC1: PROCESSOR 0:406f1 TIME 1499712764 SOCKET 2 APIC 80 EDAC MC1: 0 UE Can't discover the memory rank for ch addr 0x7fb54e900 on any memory ( page:0x0 offset:0x0 grain:32) mce: [Hardware Error]: Machine check events logged Therefore, classify memory mirroring modes and make the address decoding in address range partial memory mode correct. Signed-off-by: Qiuxu Zhuo Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20170730180651.30060-1-qiuxu.zhuo@intel.com Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 63 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 687d0f23b9cc..dc0591654011 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -300,6 +300,12 @@ enum domain { SOCK, }; +enum mirroring_mode { + NON_MIRRORING, + ADDR_RANGE_MIRRORING, + FULL_MIRRORING, +}; + struct sbridge_pvt; struct sbridge_info { enum type type; @@ -377,8 +383,9 @@ struct sbridge_pvt { struct sbridge_channel channel[NUM_CHANNELS]; /* Memory type detection */ - bool is_mirrored, is_lockstep, is_close_pg; + bool is_cur_addr_mirrored, is_lockstep, is_close_pg; bool is_chan_hash; + enum mirroring_mode mirror_mode; /* Memory description */ u64 tolm, tohm; @@ -1648,10 +1655,6 @@ static int get_dimm_config(struct mem_ctl_info *mci) enum edac_type mode; u32 reg; - if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) { - pci_read_config_dword(pvt->pci_ha, HASWELL_HASYSDEFEATURE2, ®); - pvt->is_chan_hash = GET_BITFIELD(reg, 21, 21); - } pvt->sbridge_dev->node_id = pvt->info.get_node_id(pvt); edac_dbg(0, "mc#%d: Node ID: %d, source ID: %d\n", pvt->sbridge_dev->mc, @@ -1663,22 +1666,45 @@ static int get_dimm_config(struct mem_ctl_info *mci) */ if (pvt->info.type == KNIGHTS_LANDING) { mode = EDAC_S4ECD4ED; - pvt->is_mirrored = false; + pvt->mirror_mode = NON_MIRRORING; + pvt->is_cur_addr_mirrored = false; if (knl_get_dimm_capacity(pvt, knl_mc_sizes) != 0) return -1; - pci_read_config_dword(pvt->pci_ta, KNL_MCMTR, &pvt->info.mcmtr); + if (pci_read_config_dword(pvt->pci_ta, KNL_MCMTR, &pvt->info.mcmtr)) { + edac_dbg(0, "Failed to read KNL_MCMTR register\n"); + return -ENODEV; + } } else { - pci_read_config_dword(pvt->pci_ras, RASENABLES, ®); + if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) { + if (pci_read_config_dword(pvt->pci_ha, HASWELL_HASYSDEFEATURE2, ®)) { + edac_dbg(0, "Failed to read HASWELL_HASYSDEFEATURE2 register\n"); + return -ENODEV; + } + pvt->is_chan_hash = GET_BITFIELD(reg, 21, 21); + if (GET_BITFIELD(reg, 28, 28)) { + pvt->mirror_mode = ADDR_RANGE_MIRRORING; + edac_dbg(0, "Address range partial memory mirroring is enabled\n"); + goto next; + } + } + if (pci_read_config_dword(pvt->pci_ras, RASENABLES, ®)) { + edac_dbg(0, "Failed to read RASENABLES register\n"); + return -ENODEV; + } if (IS_MIRROR_ENABLED(reg)) { - edac_dbg(0, "Memory mirror is enabled\n"); - pvt->is_mirrored = true; + pvt->mirror_mode = FULL_MIRRORING; + edac_dbg(0, "Full memory mirroring is enabled\n"); } else { - edac_dbg(0, "Memory mirror is disabled\n"); - pvt->is_mirrored = false; + pvt->mirror_mode = NON_MIRRORING; + edac_dbg(0, "Memory mirroring is disabled\n"); } - pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr); +next: + if (pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr)) { + edac_dbg(0, "Failed to read MCMTR register\n"); + return -ENODEV; + } if (IS_LOCKSTEP_ENABLED(pvt->info.mcmtr)) { edac_dbg(0, "Lockstep is enabled\n"); mode = EDAC_S8ECD8ED; @@ -2092,7 +2118,8 @@ static int get_memory_error_data(struct mem_ctl_info *mci, pci_read_config_dword(pvt->pci_tad[base_ch], tad_ch_nilv_offset[n_tads], &tad_offset); - if (pvt->is_mirrored) { + if (pvt->mirror_mode == FULL_MIRRORING || + (pvt->mirror_mode == ADDR_RANGE_MIRRORING && n_tads == 0)) { *channel_mask |= 1 << ((base_ch + 2) % 4); switch(ch_way) { case 2: @@ -2103,8 +2130,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci, sprintf(msg, "Invalid mirror set. Can't decode addr"); return -EINVAL; } - } else + + pvt->is_cur_addr_mirrored = true; + } else { sck_xch = (1 << sck_way) * ch_way; + pvt->is_cur_addr_mirrored = false; + } if (pvt->is_lockstep) *channel_mask |= 1 << ((base_ch + 1) % 4); @@ -2967,7 +2998,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, * EDAC core should be handling the channel mask, in order to point * to the group of dimm's where the error may be happening. */ - if (!pvt->is_lockstep && !pvt->is_mirrored && !pvt->is_close_pg) + if (!pvt->is_lockstep && !pvt->is_cur_addr_mirrored && !pvt->is_close_pg) channel = first_channel; snprintf(msg, sizeof(msg), -- cgit v1.2.3