Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--  arch/powerpc/platforms/powernv/Makefile              |   2
-rw-r--r--  arch/powerpc/platforms/powernv/eeh-ioda.c            | 264
-rw-r--r--  arch/powerpc/platforms/powernv/eeh-powernv.c         |   4
-rw-r--r--  arch/powerpc/platforms/powernv/opal-flash.c          |  50
-rw-r--r--  arch/powerpc/platforms/powernv/opal-lpc.c            | 151
-rw-r--r--  arch/powerpc/platforms/powernv/opal-memory-errors.c  |   8
-rw-r--r--  arch/powerpc/platforms/powernv/opal-takeover.S       |   2
-rw-r--r--  arch/powerpc/platforms/powernv/opal-wrappers.S       |   5
-rw-r--r--  arch/powerpc/platforms/powernv/opal.c                |  22
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c            |  26
-rw-r--r--  arch/powerpc/platforms/powernv/pci.c                 | 202
-rw-r--r--  arch/powerpc/platforms/powernv/pci.h                 |  11
-rw-r--r--  arch/powerpc/platforms/powernv/powernv.h             |   2
-rw-r--r--  arch/powerpc/platforms/powernv/setup.c               |  56
-rw-r--r--  arch/powerpc/platforms/powernv/smp.c                 |  25
-rw-r--r--  arch/powerpc/platforms/powernv/subcore-asm.S         |  95
-rw-r--r--  arch/powerpc/platforms/powernv/subcore.c             | 392
-rw-r--r--  arch/powerpc/platforms/powernv/subcore.h             |  18
18 files changed, 1078 insertions, 257 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 63cebb9b4d45..4ad0d345bc96 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,7 +1,7 @@
 obj-y			+= setup.o opal-takeover.o opal-wrappers.o opal.o opal-async.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
-obj-y			+= opal-msglog.o
+obj-y			+= opal-msglog.o subcore.o subcore-asm.o
 
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 5b51079f3e3b..753f08e36dfa 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -42,11 +42,19 @@ static int ioda_eeh_event(struct notifier_block *nb,
 {
 	uint64_t changed_evts = (uint64_t)change;
 
-	/* We simply send special EEH event */
-	if ((changed_evts & OPAL_EVENT_PCI_ERROR) &&
-	    (events & OPAL_EVENT_PCI_ERROR) &&
-	    eeh_enabled())
+	/*
+	 * We simply send special EEH event if EEH has
+	 * been enabled, or clear pending events in
+	 * case that we enable EEH soon
+	 */
+	if (!(changed_evts & OPAL_EVENT_PCI_ERROR) ||
+	    !(events & OPAL_EVENT_PCI_ERROR))
+		return 0;
+
+	if (eeh_enabled())
 		eeh_send_failure_event(NULL);
+	else
+		opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
 	return 0;
 }
@@ -141,7 +149,9 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 	}
 
 #ifdef CONFIG_DEBUG_FS
-	if (phb->dbgfs) {
+	if (!phb->has_dbgfs && phb->dbgfs) {
+		phb->has_dbgfs = 1;
+
 		debugfs_create_file("err_injct_outbound", 0600,
 				    phb->dbgfs, hose,
 				    &ioda_eeh_outb_dbgfs_ops);
@@ -154,7 +164,14 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 	}
 #endif
 
-	phb->eeh_state |= PNV_EEH_STATE_ENABLED;
+	/* If EEH is enabled, we're going to rely on that.
+	 * Otherwise, we restore to conventional mechanism
+	 * to clear frozen PE during PCI config access.
+	 */
+	if (eeh_enabled())
+		phb->flags |= PNV_PHB_FLAG_EEH;
+	else
+		phb->flags &= ~PNV_PHB_FLAG_EEH;
 
 	return 0;
 }
@@ -268,6 +285,21 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
 		return EEH_STATE_NOT_SUPPORT;
 	}
 
+	/*
+	 * If we're in middle of PE reset, return normal
+	 * state to keep EEH core going. For PHB reset, we
+	 * still expect to have fenced PHB cleared with
+	 * PHB reset.
+	 */
+	if (!(pe->type & EEH_PE_PHB) &&
+	    (pe->state & EEH_PE_RESET)) {
+		result = (EEH_STATE_MMIO_ACTIVE |
+			  EEH_STATE_DMA_ACTIVE |
+			  EEH_STATE_MMIO_ENABLED |
+			  EEH_STATE_DMA_ENABLED);
+		return result;
+	}
+
 	/* Retrieve PE status through OPAL */
 	pe_no = pe->addr;
 	ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
@@ -347,52 +379,6 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
 	return result;
 }
 
-static int ioda_eeh_pe_clear(struct eeh_pe *pe)
-{
-	struct pci_controller *hose;
-	struct pnv_phb *phb;
-	u32 pe_no;
-	u8 fstate;
-	u16 pcierr;
-	s64 ret;
-
-	pe_no = pe->addr;
-	hose = pe->phb;
-	phb = pe->phb->private_data;
-
-	/* Clear the EEH error on the PE */
-	ret = opal_pci_eeh_freeze_clear(phb->opal_id,
-			pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-	if (ret) {
-		pr_err("%s: Failed to clear EEH error for "
-		       "PHB#%x-PE#%x, err=%lld\n",
-		       __func__, hose->global_number, pe_no, ret);
-		return -EIO;
-	}
-
-	/*
-	 * Read the PE state back and verify that the frozen
-	 * state has been removed.
-	 */
-	ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
-			&fstate, &pcierr, NULL);
-	if (ret) {
-		pr_err("%s: Failed to get EEH status on "
-		       "PHB#%x-PE#%x\n, err=%lld\n",
-		       __func__, hose->global_number, pe_no, ret);
-		return -EIO;
-	}
-
-	if (fstate != OPAL_EEH_STOPPED_NOT_FROZEN) {
-		pr_err("%s: Frozen state not cleared on "
-		       "PHB#%x-PE#%x, sts=%x\n",
-		       __func__, hose->global_number, pe_no, fstate);
-		return -EIO;
-	}
-
-	return 0;
-}
-
 static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
 {
 	s64 rc = OPAL_HARDWARE;
@@ -402,13 +388,16 @@ static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
 		if (rc <= 0)
 			break;
 
-		msleep(rc);
+		if (system_state < SYSTEM_RUNNING)
+			udelay(1000 * rc);
+		else
+			msleep(rc);
 	}
 
 	return rc;
 }
 
-static int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
+int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
 {
 	struct pnv_phb *phb = hose->private_data;
 	s64 rc = OPAL_HARDWARE;
@@ -431,9 +420,17 @@ static int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
 
 	/*
 	 * Poll state of the PHB until the request is done
-	 * successfully.
+	 * successfully. The PHB reset is usually PHB complete
+	 * reset followed by hot reset on root bus. So we also
+	 * need the PCI bus settlement delay.
 	 */
 	rc = ioda_eeh_phb_poll(phb);
+	if (option == EEH_RESET_DEACTIVATE) {
+		if (system_state < SYSTEM_RUNNING)
+			udelay(1000 * EEH_PE_RST_SETTLE_TIME);
+		else
+			msleep(EEH_PE_RST_SETTLE_TIME);
+	}
 out:
 	if (rc != OPAL_SUCCESS)
 		return -EIO;
@@ -471,6 +468,8 @@ static int ioda_eeh_root_reset(struct pci_controller *hose, int option)
 
 	/* Poll state of the PHB until the request is done */
 	rc = ioda_eeh_phb_poll(phb);
+	if (option == EEH_RESET_DEACTIVATE)
+		msleep(EEH_PE_RST_SETTLE_TIME);
 out:
 	if (rc != OPAL_SUCCESS)
 		return -EIO;
@@ -478,32 +477,71 @@ out:
 	return 0;
 }
 
-static int ioda_eeh_bridge_reset(struct pci_controller *hose,
-		struct pci_dev *dev, int option)
+static int ioda_eeh_bridge_reset(struct pci_dev *dev, int option)
 {
-	u16 ctrl;
+	struct device_node *dn = pci_device_to_OF_node(dev);
+	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+	int aer = edev ? edev->aer_cap : 0;
+	u32 ctrl;
 
-	pr_debug("%s: Reset device %04x:%02x:%02x.%01x with option %d\n",
-		 __func__, hose->global_number, dev->bus->number,
-		 PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), option);
+	pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+		 __func__, pci_domain_nr(dev->bus),
+		 dev->bus->number, option);
 
 	switch (option) {
 	case EEH_RESET_FUNDAMENTAL:
 	case EEH_RESET_HOT:
-		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+		/* Don't report linkDown event */
+		if (aer) {
+			eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK,
+					     4, &ctrl);
+			ctrl |= PCI_ERR_UNC_SURPDN;
+			eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK,
+					      4, ctrl);
+		}
+
+		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl);
 		ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
-		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+		eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl);
+		msleep(EEH_PE_RST_HOLD_TIME);
+
 		break;
 	case EEH_RESET_DEACTIVATE:
-		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl);
 		ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
-		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+		eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl);
+		msleep(EEH_PE_RST_SETTLE_TIME);
+
+		/* Continue reporting linkDown event */
+		if (aer) {
+			eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK,
+					     4, &ctrl);
+			ctrl &= ~PCI_ERR_UNC_SURPDN;
+			eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK,
+					      4, ctrl);
+		}
+
 		break;
 	}
 
 	return 0;
 }
 
+void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
+{
+	struct pci_controller *hose;
+
+	if (pci_is_root_bus(dev->bus)) {
+		hose = pci_bus_to_host(dev->bus);
+		ioda_eeh_root_reset(hose, EEH_RESET_HOT);
+		ioda_eeh_root_reset(hose, EEH_RESET_DEACTIVATE);
+	} else {
+		ioda_eeh_bridge_reset(dev, EEH_RESET_HOT);
+		ioda_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE);
+	}
+}
+
 /**
  * ioda_eeh_reset - Reset the indicated PE
  * @pe: EEH PE
@@ -523,27 +561,18 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 	int ret;
 
 	/*
-	 * Anyway, we have to clear the problematic state for the
-	 * corresponding PE. However, we needn't do it if the PE
-	 * is PHB associated. That means the PHB is having fatal
-	 * errors and it needs reset. Further more, the AIB interface
-	 * isn't reliable any more.
-	 */
-	if (!(pe->type & EEH_PE_PHB) &&
-	    (option == EEH_RESET_HOT ||
-	     option == EEH_RESET_FUNDAMENTAL)) {
-		ret = ioda_eeh_pe_clear(pe);
-		if (ret)
-			return -EIO;
-	}
-
-	/*
-	 * The rules applied to reset, either fundamental or hot reset:
+	 * For PHB reset, we always have complete reset. For those PEs whose
+	 * primary bus derived from root complex (root bus) or root port
+	 * (usually bus#1), we apply hot or fundamental reset on the root port.
+	 * For other PEs, we always have hot reset on the PE primary bus.
 	 *
-	 * We always reset the direct upstream bridge of the PE. If the
-	 * direct upstream bridge isn't root bridge, we always take hot
-	 * reset no matter what option (fundamental or hot) is. Otherwise,
-	 * we should do the reset according to the required option.
+	 * Here, we have different design to pHyp, which always clear the
+	 * frozen state during PE reset. However, the good idea here from
+	 * benh is to keep frozen state before we get PE reset done completely
+	 * (until BAR restore). With the frozen state, HW drops illegal IO
+	 * or MMIO access, which can incur recursive frozen PE during PE
+	 * reset. The side effect is that EEH core has to clear the frozen
+	 * state explicitly after BAR restore.
 	 */
 	if (pe->type & EEH_PE_PHB) {
 		ret = ioda_eeh_phb_reset(hose, option);
@@ -553,7 +582,7 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 		    pci_is_root_bus(bus->parent))
 			ret = ioda_eeh_root_reset(hose, option);
 		else
-			ret = ioda_eeh_bridge_reset(hose, bus->self, option);
+			ret = ioda_eeh_bridge_reset(bus->self, option);
 	}
 
 	return ret;
@@ -640,22 +669,6 @@ static void ioda_eeh_hub_diag(struct pci_controller *hose)
 	}
 }
 
-static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
-			       struct eeh_pe **pe)
-{
-	struct eeh_pe *phb_pe;
-
-	phb_pe = eeh_phb_pe_get(hose);
-	if (!phb_pe) {
-		pr_warning("%s Can't find PE for PHB#%d\n",
-			   __func__, hose->global_number);
-		return -EEXIST;
-	}
-
-	*pe = phb_pe;
-	return 0;
-}
-
 static int ioda_eeh_get_pe(struct pci_controller *hose,
 			   u16 pe_no, struct eeh_pe **pe)
 {
@@ -663,7 +676,8 @@ static int ioda_eeh_get_pe(struct pci_controller *hose,
 	struct eeh_dev dev;
 
 	/* Find the PHB PE */
-	if (ioda_eeh_get_phb_pe(hose, &phb_pe))
+	phb_pe = eeh_phb_pe_get(hose);
+	if (!phb_pe)
 		return -EEXIST;
 
 	/* Find the PE according to PE# */
@@ -691,6 +705,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
+	struct eeh_pe *phb_pe;
 	u64 frozen_pe_no;
 	u16 err_type, severity;
 	long rc;
@@ -707,10 +722,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 	list_for_each_entry(hose, &hose_list, list_node) {
 		/*
 		 * If the subordinate PCI buses of the PHB has been
-		 * removed, we needn't take care of it any more.
+		 * removed or is exactly under error recovery, we
+		 * needn't take care of it any more.
 		 */
 		phb = hose->private_data;
-		if (phb->eeh_state & PNV_EEH_STATE_REMOVED)
+		phb_pe = eeh_phb_pe_get(hose);
+		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
 			continue;
 
 		rc = opal_pci_next_error(phb->opal_id,
@@ -743,12 +760,6 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		switch (err_type) {
 		case OPAL_EEH_IOC_ERROR:
 			if (severity == OPAL_EEH_SEV_IOC_DEAD) {
-				list_for_each_entry(hose, &hose_list,
-						    list_node) {
-					phb = hose->private_data;
-					phb->eeh_state |= PNV_EEH_STATE_REMOVED;
-				}
-
 				pr_err("EEH: dead IOC detected\n");
 				ret = EEH_NEXT_ERR_DEAD_IOC;
 			} else if (severity == OPAL_EEH_SEV_INF) {
@@ -761,17 +772,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			break;
 		case OPAL_EEH_PHB_ERROR:
 			if (severity == OPAL_EEH_SEV_PHB_DEAD) {
-				if (ioda_eeh_get_phb_pe(hose, pe))
-					break;
-
+				*pe = phb_pe;
 				pr_err("EEH: dead PHB#%x detected\n",
 				       hose->global_number);
-				phb->eeh_state |= PNV_EEH_STATE_REMOVED;
 				ret = EEH_NEXT_ERR_DEAD_PHB;
 			} else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
-				if (ioda_eeh_get_phb_pe(hose, pe))
-					break;
-
+				*pe = phb_pe;
 				pr_err("EEH: fenced PHB#%x detected\n",
 				       hose->global_number);
 				ret = EEH_NEXT_ERR_FENCED_PHB;
@@ -789,17 +795,21 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			 * If we can't find the corresponding PE, the
 			 * PEEV / PEST would be messy. So we force an
 			 * fenced PHB so that it can be recovered.
+			 *
+			 * If the PE has been marked as isolated, that
+			 * should have been removed permanently or in
+			 * progress with recovery. We needn't report
+			 * it again.
 			 */
 			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) {
-				if (!ioda_eeh_get_phb_pe(hose, pe)) {
-					pr_err("EEH: Escalated fenced PHB#%x "
-					       "detected for PE#%llx\n",
-					       hose->global_number,
-					       frozen_pe_no);
-					ret = EEH_NEXT_ERR_FENCED_PHB;
-				} else {
-					ret = EEH_NEXT_ERR_NONE;
-				}
+				*pe = phb_pe;
+				pr_err("EEH: Escalated fenced PHB#%x "
+				       "detected for PE#%llx\n",
+				       hose->global_number,
+				       frozen_pe_no);
+				ret = EEH_NEXT_ERR_FENCED_PHB;
+			} else if ((*pe)->state & EEH_PE_ISOLATED) {
+				ret = EEH_NEXT_ERR_NONE;
 			} else {
 				pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
 				       (*pe)->addr, (*pe)->phb->global_number);
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index a59788e83b8b..56a206f32f77 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -126,6 +126,7 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag)
 	edev->mode	&= 0xFFFFFF00;
 	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
 		edev->mode |= EEH_DEV_BRIDGE;
+	edev->pcix_cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
 	if (pci_is_pcie(dev)) {
 		edev->pcie_cap = pci_pcie_cap(dev);
 
@@ -133,6 +134,9 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag)
 			edev->mode |= EEH_DEV_ROOT_PORT;
 		else if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
 			edev->mode |= EEH_DEV_DS_PORT;
+
+		edev->aer_cap = pci_find_ext_capability(dev,
+							PCI_EXT_CAP_ID_ERR);
 	}
 
 	edev->config_addr	= ((dev->bus->number << 8) | dev->devfn);
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
index dc487ff04704..5c21d9c07f45 100644
--- a/arch/powerpc/platforms/powernv/opal-flash.c
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
+#include <linux/delay.h>
 
 #include <asm/opal.h>
 
@@ -130,7 +131,8 @@ static inline void opal_flash_validate(void)
 {
 	long ret;
 	void *buf = validate_flash_data.buf;
-	__be32 size, result;
+	__be32 size = cpu_to_be32(validate_flash_data.buf_size);
+	__be32 result;
 
 	ret = opal_validate_flash(__pa(buf), &size, &result);
 
@@ -290,11 +292,6 @@ static int opal_flash_update(int op)
 	/* First entry address */
 	addr = __pa(list);
 
-	pr_alert("FLASH: Image is %u bytes\n", image_data.size);
-	pr_alert("FLASH: Image update requested\n");
-	pr_alert("FLASH: Image will be updated during system reboot\n");
-	pr_alert("FLASH: This will take several minutes. Do not power off!\n");
-
 flash:
 	rc = opal_update_flash(addr);
 
@@ -302,6 +299,47 @@ invalid_img:
 	return rc;
 }
 
+/* Return CPUs to OPAL before starting FW update */
+static void flash_return_cpu(void *info)
+{
+	int cpu = smp_processor_id();
+
+	if (!cpu_online(cpu))
+		return;
+
+	/* Disable IRQ */
+	hard_irq_disable();
+
+	/* Return the CPU to OPAL */
+	opal_return_cpu();
+}
+
+/* This gets called just before system reboots */
+void opal_flash_term_callback(void)
+{
+	struct cpumask mask;
+
+	if (update_flash_data.status != FLASH_IMG_READY)
+		return;
+
+	pr_alert("FLASH: Flashing new firmware\n");
+	pr_alert("FLASH: Image is %u bytes\n", image_data.size);
+	pr_alert("FLASH: Performing flash and reboot/shutdown\n");
+	pr_alert("FLASH: This will take several minutes. Do not power off!\n");
+
+	/* Small delay to help getting the above message out */
+	msleep(500);
+
+	/* Return secondary CPUs to firmware */
+	cpumask_copy(&mask, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), &mask);
+	if (!cpumask_empty(&mask))
+		smp_call_function_many(&mask,
+				       flash_return_cpu, NULL, false);
+	/* Hard disable interrupts */
+	hard_irq_disable();
+}
+
 /*
  * Show candidate image status
  */
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 79d83cad3d67..f04b4d8aca5a 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -12,12 +12,17 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/xics.h>
 #include <asm/opal.h>
 #include <asm/prom.h>
+#include <asm/uaccess.h>
+#include <asm/debug.h>
 
 static int opal_lpc_chip_id = -1;
 
@@ -176,6 +181,152 @@ static const struct ppc_pci_io opal_lpc_io = {
 	.outsl	= opal_lpc_outsl,
 };
 
+#ifdef CONFIG_DEBUG_FS
+struct lpc_debugfs_entry {
+	enum OpalLPCAddressType lpc_type;
+};
+
+static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf,
+			      size_t count, loff_t *ppos)
+{
+	struct lpc_debugfs_entry *lpc = filp->private_data;
+	u32 data, pos, len, todo;
+	int rc;
+
+	if (!access_ok(VERIFY_WRITE, ubuf, count))
+		return -EFAULT;
+
+	todo = count;
+	while (todo) {
+		pos = *ppos;
+
+		/*
+		 * Select access size based on count and alignment and
+		 * access type. IO and MEM only support byte accesses,
+		 * FW supports all 3.
+		 */
+		len = 1;
+		if (lpc->lpc_type == OPAL_LPC_FW) {
+			if (todo > 3 && (pos & 3) == 0)
+				len = 4;
+			else if (todo > 1 && (pos & 1) == 0)
+				len = 2;
+		}
+		rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos,
+				   &data, len);
+		if (rc)
+			return -ENXIO;
+		switch (len) {
+		case 4:
+			rc = __put_user((u32)data, (u32 __user *)ubuf);
+			break;
+		case 2:
+			rc = __put_user((u16)data, (u16 __user *)ubuf);
+			break;
+		default:
+			rc = __put_user((u8)data, (u8 __user *)ubuf);
+			break;
+		}
+		if (rc)
+			return -EFAULT;
+		*ppos += len;
+		ubuf += len;
+		todo -= len;
+	}
+
+	return count;
+}
+
+static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf,
+			       size_t count, loff_t *ppos)
+{
+	struct lpc_debugfs_entry *lpc = filp->private_data;
+	u32 data, pos, len, todo;
+	int rc;
+
+	if (!access_ok(VERIFY_READ, ubuf, count))
+		return -EFAULT;
+
+	todo = count;
+	while (todo) {
+		pos = *ppos;
+
+		/*
+		 * Select access size based on count and alignment and
+		 * access type. IO and MEM only support byte accesses,
+		 * FW supports all 3.
+		 */
+		len = 1;
+		if (lpc->lpc_type == OPAL_LPC_FW) {
+			if (todo > 3 && (pos & 3) == 0)
+				len = 4;
+			else if (todo > 1 && (pos & 1) == 0)
+				len = 2;
+		}
+		switch (len) {
+		case 4:
+			rc = __get_user(data, (u32 __user *)ubuf);
+			break;
+		case 2:
+			rc = __get_user(data, (u16 __user *)ubuf);
+			break;
+		default:
+			rc = __get_user(data, (u8 __user *)ubuf);
+			break;
+		}
+		if (rc)
+			return -EFAULT;
+
+		rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos,
+				    data, len);
+		if (rc)
+			return -ENXIO;
+		*ppos += len;
+		ubuf += len;
+		todo -= len;
+	}
+
+	return count;
+}
+
+static const struct file_operations lpc_fops = {
+	.read	= lpc_debug_read,
+	.write	= lpc_debug_write,
+	.open	= simple_open,
+	.llseek	= default_llseek,
+};
+
+static int opal_lpc_debugfs_create_type(struct dentry *folder,
+					const char *fname,
+					enum OpalLPCAddressType type)
+{
+	struct lpc_debugfs_entry *entry;
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	entry->lpc_type = type;
+	debugfs_create_file(fname, 0600, folder, entry, &lpc_fops);
+	return 0;
+}
+
+static int opal_lpc_init_debugfs(void)
+{
+	struct dentry *root;
+	int rc = 0;
+
+	if (opal_lpc_chip_id < 0)
+		return -ENODEV;
+
+	root = debugfs_create_dir("lpc", powerpc_debugfs_root);
+
+	rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO);
+	rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM);
+	rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW);
+	return rc;
+}
+device_initcall(opal_lpc_init_debugfs);
+#endif  /* CONFIG_DEBUG_FS */
+
 void opal_lpc_init(void)
 {
 	struct device_node *np;
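[Editor's note: the access-size selection above picks the widest naturally aligned access (4, then 2, then 1 bytes) that still fits in the remaining byte count, and only the FW address space supports widths above one byte. The following standalone user-space C sketch (illustrative only, not kernel code) mirrors that logic so it can be experimented with directly:]

    #include <stdio.h>
    #include <stdint.h>

    /* Mirror of the len selection in lpc_debug_read()/lpc_debug_write():
     * largest aligned width that fits, byte-only for IO/MEM spaces. */
    static unsigned int lpc_pick_len(uint32_t pos, uint32_t todo, int is_fw)
    {
            if (is_fw) {
                    if (todo > 3 && (pos & 3) == 0)
                            return 4;
                    if (todo > 1 && (pos & 1) == 0)
                            return 2;
            }
            return 1;
    }

    int main(void)
    {
            /* A 7-byte FW-space transfer at offset 0 splits as 4 + 2 + 1. */
            uint32_t pos = 0, todo = 7;

            while (todo) {
                    unsigned int len = lpc_pick_len(pos, todo, 1);

                    printf("access %u byte(s) at 0x%x\n", len, pos);
                    pos += len;
                    todo -= len;
            }
            return 0;
    }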
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
index ec4132239cdf..b17a34b695ef 100644
--- a/arch/powerpc/platforms/powernv/opal-memory-errors.c
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -47,12 +47,12 @@ static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt)
 		    __func__, merr_evt->type);
 	switch (merr_evt->type) {
 	case OPAL_MEM_ERR_TYPE_RESILIENCE:
-		paddr_start = merr_evt->u.resilience.physical_address_start;
-		paddr_end = merr_evt->u.resilience.physical_address_end;
+		paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start);
+		paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end);
 		break;
 	case OPAL_MEM_ERR_TYPE_DYN_DALLOC:
-		paddr_start = merr_evt->u.dyn_dealloc.physical_address_start;
-		paddr_end = merr_evt->u.dyn_dealloc.physical_address_end;
+		paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start);
+		paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end);
 		break;
 	default:
 		return;
diff --git a/arch/powerpc/platforms/powernv/opal-takeover.S b/arch/powerpc/platforms/powernv/opal-takeover.S
index 3cd262897c27..11a3169ee583 100644
--- a/arch/powerpc/platforms/powernv/opal-takeover.S
+++ b/arch/powerpc/platforms/powernv/opal-takeover.S
@@ -21,11 +21,13 @@
 _GLOBAL(opal_query_takeover)
 	mfcr	r0
 	stw	r0,8(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
 	std	r3,STK_PARAM(R3)(r1)
 	std	r4,STK_PARAM(R4)(r1)
 	li	r3,H_HAL_TAKEOVER
 	li	r4,H_HAL_TAKEOVER_QUERY_MAGIC
 	HVSC
+	addi	r1,r1,STACKFRAMESIZE
 	ld	r10,STK_PARAM(R3)(r1)
 	std	r4,0(r10)
 	ld	r10,STK_PARAM(R4)(r1)
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index f531ffe35b3e..4abbff22a61f 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -32,7 +32,7 @@
 	std	r12,PACASAVEDMSR(r13);	\
 	andc	r12,r12,r0;		\
 	mtmsrd	r12,1;			\
-	LOAD_REG_ADDR(r0,.opal_return);	\
+	LOAD_REG_ADDR(r0,opal_return);	\
 	mtlr	r0;			\
 	li	r0,MSR_DR|MSR_IR|MSR_LE;\
 	andc	r12,r12,r0;		\
@@ -44,7 +44,7 @@
 	mtspr	SPRN_HSRR0,r12;		\
 	hrfid
 
-_STATIC(opal_return)
+opal_return:
 	/*
 	 * Fixup endian on OPAL return... we should be able to simplify
 	 * this by instead converting the below trampoline to a set of
@@ -124,6 +124,7 @@ OPAL_CALL(opal_xscom_write,			OPAL_XSCOM_WRITE);
 OPAL_CALL(opal_lpc_read,			OPAL_LPC_READ);
 OPAL_CALL(opal_lpc_write,			OPAL_LPC_WRITE);
 OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
+OPAL_CALL(opal_reinit_cpus,			OPAL_REINIT_CPUS);
 OPAL_CALL(opal_read_elog,			OPAL_ELOG_READ);
 OPAL_CALL(opal_send_ack_elog,			OPAL_ELOG_ACK);
 OPAL_CALL(opal_get_elog_size,			OPAL_ELOG_SIZE);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index f343183add07..199975613fe9 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -57,6 +57,21 @@ static DEFINE_SPINLOCK(opal_notifier_lock);
 static uint64_t last_notified_mask = 0x0ul;
 static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
 
+static void opal_reinit_cores(void)
+{
+	/* Do the actual re-init. This will clobber all FPRs, VRs, etc...
+	 *
+	 * It will preserve non volatile GPRs and HSPRG0/1. It will
+	 * also restore HIDs and other SPRs to their original value
+	 * but it might clobber a bunch.
+	 */
+#ifdef __BIG_ENDIAN__
+	opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE);
+#else
+	opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE);
+#endif
+}
+
 int __init early_init_dt_scan_opal(unsigned long node,
 				   const char *uname, int depth, void *data)
 {
@@ -96,6 +111,13 @@ int __init early_init_dt_scan_opal(unsigned long node,
 		printk("OPAL V1 detected !\n");
 	}
 
+	/* Reinit all cores with the right endian */
+	opal_reinit_cores();
+
+	/* Restore some bits */
+	if (cur_cpu_spec->cpu_restore)
+		cur_cpu_spec->cpu_restore();
+
 	return 1;
 }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 98824aa99173..de19edeaa7a7 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/pci.h>
+#include <linux/crash_dump.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/string.h>
@@ -663,15 +664,15 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
		 * errors, and on the first pass the data will be a relative
		 * bus number, print that out instead.
		 */
-		tbl->it_busno = 0;
 		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
 		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
				8);
-		tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE |
-			       TCE_PCI_SWINV_PAIR;
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE |
+				 TCE_PCI_SWINV_FREE |
+				 TCE_PCI_SWINV_PAIR);
 	}
 	iommu_init_table(tbl, phb->hose->node);
-	iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number);
+	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
 
 	if (pe->pdev)
 		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
@@ -793,14 +794,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
		 * errors, and on the first pass the data will be a relative
		 * bus number, print that out instead.
		 */
-		tbl->it_busno = 0;
 		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
 		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
				8);
-		tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE;
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 	}
 	iommu_init_table(tbl, phb->hose->node);
-	iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number);
+	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
 
 	if (pe->pdev)
 		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
@@ -1386,12 +1386,24 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
 	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
 	ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
+	ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
 	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
 
 	/* Reset IODA tables to a clean state */
 	rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET);
 	if (rc)
 		pr_warning("  OPAL Error %ld performing IODA table reset !\n", rc);
+
+	/* If we're running in kdump kernel, the previous kernel never
+	 * shutdown PCI devices correctly. We already got IODA table
+	 * cleaned out. So we have to issue PHB reset to stop all PCI
+	 * transactions from previous kernel.
+	 */
+	if (is_kdump_kernel()) {
+		pr_info("  Issue PHB reset ...\n");
+		ioda_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
+		ioda_eeh_phb_reset(hose, OPAL_DEASSERT_RESET);
+	}
 }
 
 void __init pnv_pci_init_ioda2_phb(struct device_node *np)
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 8518817dcdfd..eefbfcc3fd8c 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -131,65 +131,60 @@ static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
 	int i;
 
 	data = (struct OpalIoP7IOCPhbErrorData *)common;
-	pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n\n",
+	pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n",
 		hose->global_number, common->version);
 
 	if (data->brdgCtl)
-		pr_info("  brdgCtl: %08x\n",
+		pr_info("brdgCtl: %08x\n",
 			data->brdgCtl);
 	if (data->portStatusReg || data->rootCmplxStatus ||
 	    data->busAgentStatus)
-		pr_info("  UtlSts: %08x %08x %08x\n",
+		pr_info("UtlSts: %08x %08x %08x\n",
 			data->portStatusReg, data->rootCmplxStatus,
 			data->busAgentStatus);
 	if (data->deviceStatus || data->slotStatus ||
 	    data->linkStatus || data->devCmdStatus ||
 	    data->devSecStatus)
-		pr_info("  RootSts: %08x %08x %08x %08x %08x\n",
+		pr_info("RootSts: %08x %08x %08x %08x %08x\n",
 			data->deviceStatus, data->slotStatus,
 			data->linkStatus, data->devCmdStatus,
 			data->devSecStatus);
 	if (data->rootErrorStatus || data->uncorrErrorStatus ||
 	    data->corrErrorStatus)
-		pr_info("  RootErrSts: %08x %08x %08x\n",
+		pr_info("RootErrSts: %08x %08x %08x\n",
 			data->rootErrorStatus, data->uncorrErrorStatus,
 			data->corrErrorStatus);
 	if (data->tlpHdr1 || data->tlpHdr2 ||
 	    data->tlpHdr3 || data->tlpHdr4)
-		pr_info("  RootErrLog: %08x %08x %08x %08x\n",
+		pr_info("RootErrLog: %08x %08x %08x %08x\n",
 			data->tlpHdr1, data->tlpHdr2,
 			data->tlpHdr3, data->tlpHdr4);
 	if (data->sourceId || data->errorClass ||
 	    data->correlator)
-		pr_info("  RootErrLog1: %08x %016llx %016llx\n",
+		pr_info("RootErrLog1: %08x %016llx %016llx\n",
 			data->sourceId, data->errorClass,
 			data->correlator);
 	if (data->p7iocPlssr || data->p7iocCsr)
-		pr_info("  PhbSts: %016llx %016llx\n",
+		pr_info("PhbSts: %016llx %016llx\n",
 			data->p7iocPlssr, data->p7iocCsr);
-	if (data->lemFir || data->lemErrorMask ||
-	    data->lemWOF)
-		pr_info("  Lem: %016llx %016llx %016llx\n",
+	if (data->lemFir)
+		pr_info("Lem: %016llx %016llx %016llx\n",
 			data->lemFir, data->lemErrorMask,
 			data->lemWOF);
-	if (data->phbErrorStatus || data->phbFirstErrorStatus ||
-	    data->phbErrorLog0 || data->phbErrorLog1)
-		pr_info("  PhbErr: %016llx %016llx %016llx %016llx\n",
+	if (data->phbErrorStatus)
+		pr_info("PhbErr: %016llx %016llx %016llx %016llx\n",
 			data->phbErrorStatus, data->phbFirstErrorStatus,
 			data->phbErrorLog0, data->phbErrorLog1);
-	if (data->mmioErrorStatus || data->mmioFirstErrorStatus ||
-	    data->mmioErrorLog0 || data->mmioErrorLog1)
-		pr_info("  OutErr: %016llx %016llx %016llx %016llx\n",
+	if (data->mmioErrorStatus)
+		pr_info("OutErr: %016llx %016llx %016llx %016llx\n",
			data->mmioErrorStatus, data->mmioFirstErrorStatus,
 			data->mmioErrorLog0, data->mmioErrorLog1);
-	if (data->dma0ErrorStatus || data->dma0FirstErrorStatus ||
-	    data->dma0ErrorLog0 || data->dma0ErrorLog1)
-		pr_info("  InAErr: %016llx %016llx %016llx %016llx\n",
+	if (data->dma0ErrorStatus)
+		pr_info("InAErr: %016llx %016llx %016llx %016llx\n",
 			data->dma0ErrorStatus, data->dma0FirstErrorStatus,
 			data->dma0ErrorLog0, data->dma0ErrorLog1);
-	if (data->dma1ErrorStatus || data->dma1FirstErrorStatus ||
-	    data->dma1ErrorLog0 || data->dma1ErrorLog1)
-		pr_info("  InBErr: %016llx %016llx %016llx %016llx\n",
+	if (data->dma1ErrorStatus)
+		pr_info("InBErr: %016llx %016llx %016llx %016llx\n",
 			data->dma1ErrorStatus, data->dma1FirstErrorStatus,
 			data->dma1ErrorLog0, data->dma1ErrorLog1);
 
@@ -198,7 +193,7 @@ static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
 		    (data->pestB[i] >> 63) == 0)
 			continue;
 
-		pr_info("  PE[%3d] A/B: %016llx %016llx\n",
+		pr_info("PE[%3d] A/B: %016llx %016llx\n",
 			i, data->pestA[i], data->pestB[i]);
 	}
 }
@@ -210,69 +205,63 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
 	int i;
 
 	data = (struct OpalIoPhb3ErrorData*)common;
-	pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n\n",
+	pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n",
 		hose->global_number, common->version);
 	if (data->brdgCtl)
-		pr_info("  brdgCtl: %08x\n",
+		pr_info("brdgCtl: %08x\n",
 			data->brdgCtl);
 	if (data->portStatusReg || data->rootCmplxStatus ||
 	    data->busAgentStatus)
-		pr_info("  UtlSts: %08x %08x %08x\n",
+		pr_info("UtlSts: %08x %08x %08x\n",
 			data->portStatusReg, data->rootCmplxStatus,
 			data->busAgentStatus);
 	if (data->deviceStatus || data->slotStatus ||
 	    data->linkStatus || data->devCmdStatus ||
 	    data->devSecStatus)
-		pr_info("  RootSts: %08x %08x %08x %08x %08x\n",
+		pr_info("RootSts: %08x %08x %08x %08x %08x\n",
 			data->deviceStatus, data->slotStatus,
 			data->linkStatus, data->devCmdStatus,
 			data->devSecStatus);
 	if (data->rootErrorStatus || data->uncorrErrorStatus ||
 	    data->corrErrorStatus)
-		pr_info("  RootErrSts: %08x %08x %08x\n",
+		pr_info("RootErrSts: %08x %08x %08x\n",
 			data->rootErrorStatus, data->uncorrErrorStatus,
 			data->corrErrorStatus);
 	if (data->tlpHdr1 || data->tlpHdr2 ||
 	    data->tlpHdr3 || data->tlpHdr4)
-		pr_info("  RootErrLog: %08x %08x %08x %08x\n",
+		pr_info("RootErrLog: %08x %08x %08x %08x\n",
 			data->tlpHdr1, data->tlpHdr2,
 			data->tlpHdr3, data->tlpHdr4);
 	if (data->sourceId || data->errorClass ||
 	    data->correlator)
-		pr_info("  RootErrLog1: %08x %016llx %016llx\n",
+		pr_info("RootErrLog1: %08x %016llx %016llx\n",
 			data->sourceId, data->errorClass,
 			data->correlator);
-	if (data->nFir || data->nFirMask ||
-	    data->nFirWOF)
-		pr_info("  nFir: %016llx %016llx %016llx\n",
+	if (data->nFir)
+		pr_info("nFir: %016llx %016llx %016llx\n",
 			data->nFir, data->nFirMask,
 			data->nFirWOF);
 	if (data->phbPlssr || data->phbCsr)
-		pr_info("  PhbSts: %016llx %016llx\n",
+		pr_info("PhbSts: %016llx %016llx\n",
 			data->phbPlssr, data->phbCsr);
-	if (data->lemFir || data->lemErrorMask ||
-	    data->lemWOF)
-		pr_info("  Lem: %016llx %016llx %016llx\n",
+	if (data->lemFir)
+		pr_info("Lem: %016llx %016llx %016llx\n",
 			data->lemFir, data->lemErrorMask,
 			data->lemWOF);
-	if (data->phbErrorStatus || data->phbFirstErrorStatus ||
-	    data->phbErrorLog0 || data->phbErrorLog1)
-		pr_info("  PhbErr: %016llx %016llx %016llx %016llx\n",
+	if (data->phbErrorStatus)
+		pr_info("PhbErr: %016llx %016llx %016llx %016llx\n",
 			data->phbErrorStatus, data->phbFirstErrorStatus,
 			data->phbErrorLog0, data->phbErrorLog1);
-	if (data->mmioErrorStatus || data->mmioFirstErrorStatus ||
-	    data->mmioErrorLog0 || data->mmioErrorLog1)
-		pr_info("  OutErr: %016llx %016llx %016llx %016llx\n",
+	if (data->mmioErrorStatus)
+		pr_info("OutErr: %016llx %016llx %016llx %016llx\n",
 			data->mmioErrorStatus, data->mmioFirstErrorStatus,
 			data->mmioErrorLog0, data->mmioErrorLog1);
-	if (data->dma0ErrorStatus || data->dma0FirstErrorStatus ||
-	    data->dma0ErrorLog0 || data->dma0ErrorLog1)
-		pr_info("  InAErr: %016llx %016llx %016llx %016llx\n",
+	if (data->dma0ErrorStatus)
+		pr_info("InAErr: %016llx %016llx %016llx %016llx\n",
 			data->dma0ErrorStatus, data->dma0FirstErrorStatus,
 			data->dma0ErrorLog0, data->dma0ErrorLog1);
-	if (data->dma1ErrorStatus || data->dma1FirstErrorStatus ||
-	    data->dma1ErrorLog0 || data->dma1ErrorLog1)
-		pr_info("  InBErr: %016llx %016llx %016llx %016llx\n",
+	if (data->dma1ErrorStatus)
+		pr_info("InBErr: %016llx %016llx %016llx %016llx\n",
 			data->dma1ErrorStatus, data->dma1FirstErrorStatus,
 			data->dma1ErrorLog0, data->dma1ErrorLog1);
 
@@ -281,7 +270,7 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
 		    (data->pestB[i] >> 63) == 0)
 			continue;
 
-		pr_info("  PE[%3d] A/B: %016llx %016llx\n",
+		pr_info("PE[%3d] A/B: %016llx %016llx\n",
 			i, data->pestA[i], data->pestB[i]);
 	}
 }
@@ -384,9 +373,6 @@ int pnv_pci_cfg_read(struct device_node *dn,
 	struct pci_dn *pdn = PCI_DN(dn);
 	struct pnv_phb *phb = pdn->phb->private_data;
 	u32 bdfn = (pdn->busno << 8) | pdn->devfn;
-#ifdef CONFIG_EEH
-	struct eeh_pe *phb_pe = NULL;
-#endif
 	s64 rc;
 
 	switch (size) {
@@ -412,31 +398,9 @@ int pnv_pci_cfg_read(struct device_node *dn,
 	default:
 		return PCIBIOS_FUNC_NOT_SUPPORTED;
 	}
+
 	cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
 		__func__, pdn->busno, pdn->devfn, where, size, *val);
-
-	/*
-	 * Check if the specified PE has been put into frozen
-	 * state. On the other hand, we needn't do that while
-	 * the PHB has been put into frozen state because of
-	 * PHB-fatal errors.
-	 */
-#ifdef CONFIG_EEH
-	phb_pe = eeh_phb_pe_get(pdn->phb);
-	if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
-		return PCIBIOS_SUCCESSFUL;
-
-	if (phb->eeh_state & PNV_EEH_STATE_ENABLED) {
-		if (*val == EEH_IO_ERROR_VALUE(size) &&
-		    eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
-			return PCIBIOS_DEVICE_NOT_FOUND;
-	} else {
-		pnv_pci_config_check_eeh(phb, dn);
-	}
-#else
-	pnv_pci_config_check_eeh(phb, dn);
-#endif
-
 	return PCIBIOS_SUCCESSFUL;
 }
 
@@ -463,33 +427,74 @@ int pnv_pci_cfg_write(struct device_node *dn,
 		return PCIBIOS_FUNC_NOT_SUPPORTED;
 	}
 
-	/* Check if the PHB got frozen due to an error (no response) */
-#ifdef CONFIG_EEH
-	if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED))
-		pnv_pci_config_check_eeh(phb, dn);
-#else
-	pnv_pci_config_check_eeh(phb, dn);
-#endif
-
 	return PCIBIOS_SUCCESSFUL;
 }
 
+#ifdef CONFIG_EEH
+static bool pnv_pci_cfg_check(struct pci_controller *hose,
+			      struct device_node *dn)
+{
+	struct eeh_dev *edev = NULL;
+	struct pnv_phb *phb = hose->private_data;
+
+	/* EEH not enabled ? */
+	if (!(phb->flags & PNV_PHB_FLAG_EEH))
+		return true;
+
+	/* PE reset or device removed ? */
+	edev = of_node_to_eeh_dev(dn);
+	if (edev) {
+		if (edev->pe &&
+		    (edev->pe->state & EEH_PE_RESET))
+			return false;
+
+		if (edev->mode & EEH_DEV_REMOVED)
+			return false;
+	}
+
+	return true;
+}
+#else
+static inline bool pnv_pci_cfg_check(struct pci_controller *hose,
+				     struct device_node *dn)
+{
+	return true;
+}
+#endif /* CONFIG_EEH */
+
 static int pnv_pci_read_config(struct pci_bus *bus,
			       unsigned int devfn,
 			       int where, int size, u32 *val)
 {
 	struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
 	struct pci_dn *pdn;
+	struct pnv_phb *phb;
+	bool found = false;
+	int ret;
 
+	*val = 0xFFFFFFFF;
 	for (dn = busdn->child; dn; dn = dn->sibling) {
 		pdn = PCI_DN(dn);
-		if (pdn && pdn->devfn == devfn)
-			return pnv_pci_cfg_read(dn, where, size, val);
+		if (pdn && pdn->devfn == devfn) {
+			phb = pdn->phb->private_data;
+			found = true;
+			break;
+		}
 	}
 
-	*val = 0xFFFFFFFF;
-	return PCIBIOS_DEVICE_NOT_FOUND;
+	if (!found || !pnv_pci_cfg_check(pdn->phb, dn))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	ret = pnv_pci_cfg_read(dn, where, size, val);
+	if (phb->flags & PNV_PHB_FLAG_EEH) {
+		if (*val == EEH_IO_ERROR_VALUE(size) &&
+		    eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
+			return PCIBIOS_DEVICE_NOT_FOUND;
+	} else {
+		pnv_pci_config_check_eeh(phb, dn);
+	}
+
+	return ret;
 }
 
 static int pnv_pci_write_config(struct pci_bus *bus,
@@ -498,14 +503,27 @@ static int pnv_pci_write_config(struct pci_bus *bus,
 {
 	struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
 	struct pci_dn *pdn;
+	struct pnv_phb *phb;
+	bool found = false;
+	int ret;
 
 	for (dn = busdn->child; dn; dn = dn->sibling) {
 		pdn = PCI_DN(dn);
-		if (pdn && pdn->devfn == devfn)
-			return pnv_pci_cfg_write(dn, where, size, val);
+		if (pdn && pdn->devfn == devfn) {
+			phb = pdn->phb->private_data;
+			found = true;
+			break;
+		}
 	}
 
-	return PCIBIOS_DEVICE_NOT_FOUND;
+	if (!found || !pnv_pci_cfg_check(pdn->phb, dn))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	ret = pnv_pci_cfg_write(dn, where, size, val);
+	if (!(phb->flags & PNV_PHB_FLAG_EEH))
+		pnv_pci_config_check_eeh(phb, dn);
+
+	return ret;
 }
 
 struct pci_ops pnv_pci_ops = {
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index cde169442775..676232c34328 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -81,28 +81,27 @@ struct pnv_eeh_ops {
 	int (*configure_bridge)(struct eeh_pe *pe);
 	int (*next_error)(struct eeh_pe **pe);
 };
-
-#define PNV_EEH_STATE_ENABLED	(1 << 0)	/* EEH enabled	*/
-#define PNV_EEH_STATE_REMOVED	(1 << 1)	/* PHB removed	*/
-
 #endif /* CONFIG_EEH */
 
+#define PNV_PHB_FLAG_EEH	(1 << 0)
+
 struct pnv_phb {
 	struct pci_controller	*hose;
 	enum pnv_phb_type	type;
 	enum pnv_phb_model	model;
 	u64			hub_id;
 	u64			opal_id;
+	int			flags;
 	void __iomem		*regs;
 	int			initialized;
 	spinlock_t		lock;
 
 #ifdef CONFIG_EEH
 	struct pnv_eeh_ops	*eeh_ops;
-	int			eeh_state;
 #endif
 
 #ifdef CONFIG_DEBUG_FS
+	int			has_dbgfs;
 	struct dentry		*dbgfs;
 #endif
@@ -205,5 +204,7 @@ extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
 extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
					__be64 *startp, __be64 *endp, bool rm);
+extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
+extern int ioda_eeh_phb_reset(struct pci_controller *hose, int option);
 
 #endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 0051e108ef0f..75501bfede7f 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -25,4 +25,6 @@ static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 
 extern void pnv_lpc_init(void);
 
+bool cpu_core_split_required(void);
+
 #endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 8723d32632f5..8c16a5f96728 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -27,6 +27,7 @@
 #include <linux/interrupt.h>
 #include <linux/bug.h>
 #include <linux/pci.h>
+#include <linux/cpufreq.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
@@ -98,11 +99,32 @@ static void pnv_show_cpuinfo(struct seq_file *m)
 	of_node_put(root);
 }
 
+static void pnv_prepare_going_down(void)
+{
+	/*
+	 * Disable all notifiers from OPAL, we can't
+	 * service interrupts anymore anyway
+	 */
+	opal_notifier_disable();
+
+	/* Soft disable interrupts */
+	local_irq_disable();
+
+	/*
+	 * Return secondary CPUs to firmware if a flash update
+	 * is pending otherwise we will get all sorts of error
+	 * messages about CPUs being stuck etc.. This will also
+	 * have the side effect of hard disabling interrupts so
+	 * past this point, the kernel is effectively dead.
+	 */
+	opal_flash_term_callback();
+}
+
 static void  __noreturn pnv_restart(char *cmd)
 {
 	long rc = OPAL_BUSY;
 
-	opal_notifier_disable();
+	pnv_prepare_going_down();
 
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_cec_reboot();
@@ -119,7 +141,7 @@ static void __noreturn pnv_power_off(void)
 {
 	long rc = OPAL_BUSY;
 
-	opal_notifier_disable();
+	pnv_prepare_going_down();
 
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_cec_power_down(0);
@@ -222,6 +244,13 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 }
 #endif /* CONFIG_KEXEC */
 
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static unsigned long pnv_memory_block_size(void)
+{
+	return 256UL * 1024 * 1024;
+}
+#endif
+
 static void __init pnv_setup_machdep_opal(void)
 {
 	ppc_md.get_boot_time = opal_get_boot_time;
@@ -269,6 +298,25 @@ static int __init pnv_probe(void)
 	return 1;
 }
 
+/*
+ * Returns the cpu frequency for 'cpu' in Hz. This is used by
+ * /proc/cpuinfo
+ */
+unsigned long pnv_get_proc_freq(unsigned int cpu)
+{
+	unsigned long ret_freq;
+
+	ret_freq = cpufreq_quick_get(cpu) * 1000ul;
+
+	/*
+	 * If the backend cpufreq driver does not exist,
+	 * then fallback to old way of reporting the clockrate.
+	 */
+	if (!ret_freq)
+		ret_freq = ppc_proc_freq;
+	return ret_freq;
+}
+
 define_machine(powernv) {
 	.name			= "PowerNV",
 	.probe			= pnv_probe,
@@ -276,6 +324,7 @@ define_machine(powernv) {
 	.setup_arch		= pnv_setup_arch,
 	.init_IRQ		= pnv_init_IRQ,
 	.show_cpuinfo		= pnv_show_cpuinfo,
+	.get_proc_freq		= pnv_get_proc_freq,
 	.progress		= pnv_progress,
 	.machine_shutdown	= pnv_shutdown,
 	.power_save		= power7_idle,
@@ -284,4 +333,7 @@ define_machine(powernv) {
 #ifdef CONFIG_KEXEC
 	.kexec_cpu_down		= pnv_kexec_cpu_down,
 #endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+	.memory_block_size	= pnv_memory_block_size,
+#endif
 };
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index bf5fcd452168..0062a43a2e0d 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -31,6 +31,7 @@
 #include <asm/xics.h>
 #include <asm/opal.h>
 #include <asm/runlatch.h>
+#include <asm/code-patching.h>
 
 #include "powernv.h"
 
@@ -50,8 +51,8 @@ static void pnv_smp_setup_cpu(int cpu)
 int pnv_smp_kick_cpu(int nr)
 {
 	unsigned int pcpu = get_hard_smp_processor_id(nr);
-	unsigned long start_here = __pa(*((unsigned long *)
-					  generic_secondary_smp_init));
+	unsigned long start_here =
+			__pa(ppc_function_entry(generic_secondary_smp_init));
 	long rc;
 
 	BUG_ON(nr < 0 || nr >= NR_CPUS);
@@ -158,17 +159,19 @@ static void pnv_smp_cpu_kill_self(void)
 	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1);
 	while (!generic_check_cpu_restart(cpu)) {
 		ppc64_runlatch_off();
-		power7_nap();
+		power7_nap(1);
 		ppc64_runlatch_on();
-		if (!generic_check_cpu_restart(cpu)) {
+
+		/* Reenable IRQs briefly to clear the IPI that woke us */
+		local_irq_enable();
+		local_irq_disable();
+		mb();
+
+		if (cpu_core_split_required())
+			continue;
+
+		if (!generic_check_cpu_restart(cpu))
 			DBG("CPU%d Unexpected exit while offline !\n", cpu);
-			/* We may be getting an IPI, so we re-enable
-			 * interrupts to process it, it will be ignored
-			 * since we aren't online (hopefully)
-			 */
-			local_irq_enable();
-			local_irq_disable();
-		}
 	}
 	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1);
 	DBG("CPU%d coming online...\n", cpu);
diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S
new file mode 100644
index 000000000000..39bb24aa8f34
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore-asm.S
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/ppc_asm.h>
+#include <asm/reg.h>
+
+#include "subcore.h"
+
+
+_GLOBAL(split_core_secondary_loop)
+	/*
+	 * r3 = u8 *state, used throughout the routine
+	 * r4 = temp
+	 * r5 = temp
+	 * ..
+	 * r12 = MSR
+	 */
+	mfmsr	r12
+
+	/* Disable interrupts so SRR0/1 don't get trashed */
+	li	r4,0
+	ori	r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI
+	andc	r4,r12,r4
+	sync
+	mtmsrd	r4
+
+	/* Switch to real mode and leave interrupts off */
+	li	r5, MSR_IR|MSR_DR
+	andc	r5, r4, r5
+
+	LOAD_REG_ADDR(r4, real_mode)
+
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r5
+	rfid
+	b	.	/* prevent speculative execution */
+
+real_mode:
+	/* Grab values from unsplit SPRs */
+	mfspr	r6,  SPRN_LDBAR
+	mfspr	r7,  SPRN_PMMAR
+	mfspr	r8,  SPRN_PMCR
+	mfspr	r9,  SPRN_RPR
+	mfspr	r10, SPRN_SDR1
+
+	/* Order reading the SPRs vs telling the primary we are ready to split */
+	sync
+
+	/* Tell thread 0 we are in real mode */
+	li	r4, SYNC_STEP_REAL_MODE
+	stb	r4, 0(r3)
+
+	li	r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest
+	sldi	r5, r5, 48
+
+	/* Loop until we see the split happen in HID0 */
+1:	mfspr	r4, SPRN_HID0
+	and.	r4, r4, r5
+	beq	1b
+
+	/*
+	 * We only need to initialise the below regs once for each subcore,
+	 * but it's simpler and harmless to do it on each thread.
+	 */
+
+	/* Make sure various SPRS have sane values */
+	li	r4, 0
+	mtspr	SPRN_LPID, r4
+	mtspr	SPRN_PCR,  r4
+	mtspr	SPRN_HDEC, r4
+
+	/* Restore SPR values now we are split */
+	mtspr	SPRN_LDBAR, r6
+	mtspr	SPRN_PMMAR, r7
+	mtspr	SPRN_PMCR,  r8
+	mtspr	SPRN_RPR,   r9
+	mtspr	SPRN_SDR1,  r10
+
+	LOAD_REG_ADDR(r5, virtual_mode)
+
+	/* Get out of real mode */
+	mtspr	SPRN_SRR0,r5
+	mtspr	SPRN_SRR1,r12
+	rfid
+	b	.	/* prevent speculative execution */
+
+virtual_mode:
+	blr
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
new file mode 100644
index 000000000000..894ecb3eb596
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt)	"powernv: " fmt
+
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+#include <linux/stop_machine.h>
+
+#include <asm/cputhreads.h>
+#include <asm/kvm_ppc.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
+
+#include "subcore.h"
+
+
+/*
+ * Split/unsplit procedure:
+ *
+ * A core can be in one of three states, unsplit, 2-way split, and 4-way split.
+ *
+ * The mapping to subcores_per_core is simple:
+ *
+ *  State       | subcores_per_core
+ *  ------------|------------------
+ *  Unsplit     | 1
+ *  2-way split | 2
+ *  4-way split | 4
+ *
+ * The core is split along thread boundaries, the mapping between subcores and
+ * threads is as follows:
+ *
+ *  Unsplit:
+ *          ----------------------------
+ *  Subcore |            0             |
+ *          ----------------------------
+ *  Thread  |  0  1  2  3  4  5  6  7  |
+ *          ----------------------------
+ *
+ *  2-way split:
+ *          -------------------------------------
+ *  Subcore |      0        |        1          |
+ *          -------------------------------------
+ *  Thread  |  0  1  2  3   |  4  5  6  7       |
+ *          -------------------------------------
+ *
+ *  4-way split:
+ *          -----------------------------------------
+ *  Subcore |    0    |    1    |    2    |    3    |
+ *          -----------------------------------------
+ *  Thread  |  0 1    |  2 3    |  4 5    |  6 7    |
+ *          -----------------------------------------
+ *
+ *
+ * Transitions
+ * -----------
+ *
+ * It is not possible to transition between either of the split states, the
+ * core must first be unsplit. The legal transitions are:
+ *
+ *  -----------          ---------------
+ *  |         |  <---->  | 2-way split |
+ *  |         |          ---------------
+ *  | Unsplit |
+ *  |         |          ---------------
+ *  |         |  <---->  | 4-way split |
+ *  -----------          ---------------
+ *
+ * Unsplitting
+ * -----------
+ *
+ * Unsplitting is the simpler procedure. It requires thread 0 to request the
+ * unsplit while all other threads NAP.
+ *
+ * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells
+ * the hardware that if all threads except 0 are napping, the hardware should
+ * unsplit the core.
+ *
+ * Non-zero threads are sent to a NAP loop, they don't exit the loop until they
+ * see the core unsplit.
+ *
+ * Core 0 spins waiting for the hardware to see all the other threads napping
+ * and perform the unsplit.
+ *
+ * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them
+ * out of NAP. They will then see the core unsplit and exit the NAP loop.
+ *
+ * Splitting
+ * ---------
+ *
+ * The basic splitting procedure is fairly straightforward. However it is
+ * complicated by the fact that after the split occurs, the newly created
+ * subcores are not in a fully initialised state.
+ *
+ * Most notably the subcores do not have the correct value for SDR1, which
+ * means they must not be running in virtual mode when the split occurs. The
+ * subcores have separate timebase SPRs but these are pre-synchronised by
+ * OPAL.
+ *
+ * To begin with secondary threads are sent to an assembly routine. There they
+ * switch to real mode, so they are immune to the uninitialised SDR1 value.
+ * Once in real mode they indicate that they are in real mode, and spin waiting
+ * to see the core split.
+ *
+ * Thread 0 waits to see that all secondaries are in real mode, and then begins
+ * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which
+ * prevents the hardware from unsplitting. Then it sets the appropriate HID bit
+ * to request the split, and spins waiting to see that the split has happened.
+ *
+ * Concurrently the secondaries will notice the split. When they do they set up
+ * their SPRs, notably SDR1, and then they can return to virtual mode and exit
+ * the procedure.
+ */
+
+/* Initialised at boot by subcore_init() */
+static int subcores_per_core;
+
+/*
+ * Used to communicate to offline cpus that we want them to pop out of the
+ * offline loop and do a split or unsplit.
+ *
+ * 0 - no split happening
+ * 1 - unsplit in progress
+ * 2 - split to 2 in progress
+ * 4 - split to 4 in progress
+ */
+static int new_split_mode;
+
+static cpumask_var_t cpu_offline_mask;
+
+struct split_state {
+	u8 step;
+	u8 master;
+};
+
+static DEFINE_PER_CPU(struct split_state, split_state);
+
+static void wait_for_sync_step(int step)
+{
+	int i, cpu = smp_processor_id();
+
+	for (i = cpu + 1; i < cpu + threads_per_core; i++)
+		while (per_cpu(split_state, i).step < step)
+			barrier();
+
+	/* Order the wait loop vs any subsequent loads/stores. */
+	mb();
+}
+
+static void unsplit_core(void)
+{
+	u64 hid0, mask;
+	int i, cpu;
+
+	mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+
+	cpu = smp_processor_id();
+	if (cpu_thread_in_core(cpu) != 0) {
+		while (mfspr(SPRN_HID0) & mask)
+			power7_nap(0);
+
+		per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
+		return;
+	}
+
+	hid0 = mfspr(SPRN_HID0);
+	hid0 &= ~HID0_POWER8_DYNLPARDIS;
+	mtspr(SPRN_HID0, hid0);
+
+	while (mfspr(SPRN_HID0) & mask)
+		cpu_relax();
+
+	/* Wake secondaries out of NAP */
+	for (i = cpu + 1; i < cpu + threads_per_core; i++)
+		smp_send_reschedule(i);
+
+	wait_for_sync_step(SYNC_STEP_UNSPLIT);
+}
+
+static void split_core(int new_mode)
+{
+	struct {  u64 value; u64 mask; } split_parms[2] = {
+		{ HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE },
+		{ HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE }
+	};
+	int i, cpu;
+	u64 hid0;
+
+	/* Convert new_mode (2 or 4) into an index into our parms array */
+	i = (new_mode >> 1) - 1;
+	BUG_ON(i < 0 || i > 1);
+
+	cpu = smp_processor_id();
+	if (cpu_thread_in_core(cpu) != 0) {
+		split_core_secondary_loop(&per_cpu(split_state, cpu).step);
+		return;
+	}
+
+	wait_for_sync_step(SYNC_STEP_REAL_MODE);
+
+	/* Write new mode */
+	hid0  = mfspr(SPRN_HID0);
+	hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value;
+	mtspr(SPRN_HID0, hid0);
+
+	/* Wait for it to happen */
+	while (!(mfspr(SPRN_HID0) & split_parms[i].mask))
+		cpu_relax();
+}
+
+static void cpu_do_split(int new_mode)
+{
+	/*
+	 * At boot subcores_per_core will be 0, so we will always unsplit at
+	 * boot. In the usual case where the core is already unsplit it's a
+	 * nop, and this just ensures the kernel's notion of the mode is
+	 * consistent with the hardware.
+	 */
+	if (subcores_per_core != 1)
+		unsplit_core();
+
+	if (new_mode != 1)
+		split_core(new_mode);
+
+	mb();
+	per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED;
+}
+
+bool cpu_core_split_required(void)
+{
+	smp_rmb();
+
+	if (!new_split_mode)
+		return false;
+
+	cpu_do_split(new_split_mode);
+
+	return true;
+}
+
+static int cpu_update_split_mode(void *data)
+{
+	int cpu, new_mode = *(int *)data;
+
+	if (this_cpu_ptr(&split_state)->master) {
+		new_split_mode = new_mode;
+		smp_wmb();
+
+		cpumask_andnot(cpu_offline_mask, cpu_present_mask,
+			       cpu_online_mask);
+
+		/* This should work even though the cpu is offline */
+		for_each_cpu(cpu, cpu_offline_mask)
+			smp_send_reschedule(cpu);
+	}
+
+	cpu_do_split(new_mode);
+
+	if (this_cpu_ptr(&split_state)->master) {
+		/* Wait for all cpus to finish before we touch subcores_per_core */
+		for_each_present_cpu(cpu) {
+			if (cpu >= setup_max_cpus)
+				break;
+
+			while (per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED)
+				barrier();
+		}
+
+		new_split_mode = 0;
+
+		/* Make the new mode public */
+		subcores_per_core = new_mode;
+		threads_per_subcore = threads_per_core / subcores_per_core;
+
+		/* Make sure the new mode is written before we exit */
+		mb();
+	}
+
+	return 0;
+}
+
+static int set_subcores_per_core(int new_mode)
+{
+	struct split_state *state;
+	int cpu;
+
+	if (kvm_hv_mode_active()) {
+		pr_err("Unable to change split core mode while KVM active.\n");
+		return -EBUSY;
+	}
+
+	/*
+	 * We are only called at boot, or from the sysfs write. If that ever
+	 * changes we'll need a lock here.
+	 */
+	BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3);
+
+	for_each_present_cpu(cpu) {
+		state = &per_cpu(split_state, cpu);
+		state->step = SYNC_STEP_INITIAL;
+		state->master = 0;
+	}
+
+	get_online_cpus();
+
+	/* This cpu will update the globals before exiting stop machine */
+	this_cpu_ptr(&split_state)->master = 1;
+
+	/* Ensure state is consistent before we call the other cpus */
+	mb();
+
+	stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask);
+
+	put_online_cpus();
+
+	return 0;
+}
+
+static ssize_t __used store_subcores_per_core(struct device *dev,
+		struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	unsigned long val;
+	int rc;
+
+	/* We are serialised by the attribute lock */
+
+	rc = sscanf(buf, "%lx", &val);
+	if (rc != 1)
+		return -EINVAL;
+
+	switch (val) {
+	case 1:
+	case 2:
+	case 4:
+		if (subcores_per_core == val)
+			/* Nothing to do */
+			goto out;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	rc = set_subcores_per_core(val);
+	if (rc)
+		return rc;
+
+out:
+	return count;
+}
+
+static ssize_t show_subcores_per_core(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%x\n", subcores_per_core);
+}
+
+static DEVICE_ATTR(subcores_per_core, 0644,
+		show_subcores_per_core, store_subcores_per_core);
+
+static int subcore_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return 0;
+
+	/*
+	 * We need all threads in a core to be present to split/unsplit so
+	 * continue only if max_cpus are aligned to threads_per_core.
+	 */
+	if (setup_max_cpus % threads_per_core)
+		return 0;
+
+	BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL));
+
+	set_subcores_per_core(1);
+
+	return device_create_file(cpu_subsys.dev_root,
+				  &dev_attr_subcores_per_core);
+}
+machine_device_initcall(powernv, subcore_init);
diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h
new file mode 100644
index 000000000000..148abc91debf
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* These are ordered and tested with <= */
+#define SYNC_STEP_INITIAL	0
+#define SYNC_STEP_UNSPLIT	1	/* Set by secondary when it sees unsplit */
+#define SYNC_STEP_REAL_MODE	2	/* Set by secondary when in real mode */
+#define SYNC_STEP_FINISHED	3	/* Set by secondary when split/unsplit is done */
+
+#ifndef __ASSEMBLY__
+void split_core_secondary_loop(u8 *state);
+#endif
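[Editor's note: subcore_init() registers the attribute via device_create_file() on cpu_subsys.dev_root, so it appears as /sys/devices/system/cpu/subcores_per_core. A hedged usage sketch in C (a shell `echo 2 > ...` as root does the same thing); per set_subcores_per_core() it fails with EBUSY while KVM HV is active, and only 1, 2 or 4 are accepted:]

    #include <stdio.h>

    int main(void)
    {
            const char *path = "/sys/devices/system/cpu/subcores_per_core";
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return 1;
            }
            fputs("2", f);   /* request 2-way split; valid values: 1, 2, 4 */
            if (fclose(f) != 0) {
                    perror("write");   /* e.g. EBUSY while KVM HV is active */
                    return 1;
            }
            return 0;
    }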