diff options
Diffstat (limited to 'arch/x86/kernel')
25 files changed, 518 insertions, 150 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 488be461a380..60cc4058ed5f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled); int acpi_noirq; /* skip ACPI IRQ initialization */ int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); -int acpi_ht __initdata = 1; /* enable HT */ int acpi_lapic; int acpi_ioapic; @@ -1501,9 +1500,8 @@ void __init acpi_boot_table_init(void) /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return; /* @@ -1534,9 +1532,8 @@ int __init early_acpi_boot_init(void) { /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return 1; /* @@ -1554,9 +1551,8 @@ int __init acpi_boot_init(void) /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return 1; acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1591,21 +1587,12 @@ static int __init parse_acpi(char *arg) /* acpi=force to over-ride black-list */ else if (strcmp(arg, "force") == 0) { acpi_force = 1; - acpi_ht = 1; acpi_disabled = 0; } /* acpi=strict disables out-of-spec workarounds */ else if (strcmp(arg, "strict") == 0) { acpi_strict = 1; } - /* Limit ACPI just to boot-time to enable HT */ - else if (strcmp(arg, "ht") == 0) { - if (!acpi_force) { - printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n"); - disable_acpi(); - } - acpi_ht = 1; - } /* acpi=rsdt use RSDT instead of XSDT */ else if (strcmp(arg, "rsdt") == 0) { acpi_rsdt_forced = 1; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index f9961034e557..82e508677b91 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -162,8 +162,6 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); - if (strncmp(str, "sci_force_enable", 16) == 0) - acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e5a4a1e01618..c02cc692985c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -51,6 +51,7 @@ #include <asm/smp.h> #include <asm/mce.h> #include <asm/kvm_para.h> +#include <asm/tsc.h> unsigned int num_processors; @@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void) */ void __cpuinit setup_local_APIC(void) { - unsigned int value; - int i, j; + unsigned int value, queued; + int i, j, acked = 0; + unsigned long long tsc = 0, ntsc; + long long max_loops = cpu_khz; + + if (cpu_has_tsc) + rdtscll(tsc); if (disable_apic) { arch_disable_smp_support(); @@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void) * the interrupt. Hence a vector might get locked. It was noticed * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<<j)) - ack_APIC_irq(); + do { + queued = 0; + for (i = APIC_ISR_NR - 1; i >= 0; i--) + queued |= apic_read(APIC_IRR + i*0x10); + + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1<<j)) { + ack_APIC_irq(); + acked++; + } + } } - } + if (acked > 256) { + printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n", + acked); + break; + } + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } while (queued && max_loops > 0); + WARN_ON(max_loops <= 0); /* * Now that we are all set up, enable the APIC diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1c00d0b1692..68e4a6f2211e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1084,6 +1084,20 @@ static void clear_all_debug_regs(void) } } +#ifdef CONFIG_KGDB +/* + * Restore debug regs if using kgdbwait and you have a kernel debugger + * connection established. + */ +static void dbg_restore_debug_regs(void) +{ + if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break)) + arch_kgdb_ops.correct_hw_break(); +} +#else /* ! CONFIG_KGDB */ +#define dbg_restore_debug_regs() +#endif /* ! CONFIG_KGDB */ + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -1107,9 +1121,9 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && percpu_read(node_number) == 0 && - cpu_to_node(cpu) != NUMA_NO_NODE) - percpu_write(node_number, cpu_to_node(cpu)); + if (cpu != 0 && percpu_read(numa_node) == 0 && + early_cpu_to_node(cpu) != NUMA_NO_NODE) + set_numa_node(early_cpu_to_node(cpu)); #endif me = current; @@ -1174,18 +1188,8 @@ void __cpuinit cpu_init(void) load_TR_desc(); load_LDT(&init_mm.context); -#ifdef CONFIG_KGDB - /* - * If the kgdb is connected no debug regs should be altered. This - * is only applicable when KGDB and a KGDB I/O module are built - * into the kernel and you are using early debugging with - * kgdbwait. KGDB will control the kernel HW breakpoint registers. - */ - if (kgdb_connected && arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - else -#endif - clear_all_debug_regs(); + clear_all_debug_regs(); + dbg_restore_debug_regs(); fpu_init(); @@ -1239,6 +1243,7 @@ void __cpuinit cpu_init(void) #endif clear_all_debug_regs(); + dbg_restore_debug_regs(); /* * Force FPU initialization: diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 6f3dc8fbbfdc..7ec2123838e6 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1497,8 +1497,8 @@ static struct cpufreq_driver cpufreq_amd64_driver = { * simply keep the boost-disable flag in sync with the current global * state. */ -static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) +static int cpb_notify(struct notifier_block *nb, unsigned long action, + void *hcpu) { unsigned cpu = (long)hcpu; u32 lo, hi; @@ -1528,7 +1528,7 @@ static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cpb_nb = { +static struct notifier_block cpb_nb = { .notifier_call = cpb_notify, }; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 4ac6d48fe11b..bb34b03af252 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o + +obj-$(CONFIG_ACPI_APEI) += mce-apei.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c new file mode 100644 index 000000000000..745b54f9be89 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -0,0 +1,138 @@ +/* + * Bridge between MCE and APEI + * + * On some machine, corrected memory errors are reported via APEI + * generic hardware error source (GHES) instead of corrected Machine + * Check. These corrected memory errors can be reported to user space + * through /dev/mcelog via faking a corrected Machine Check, so that + * the error memory page can be offlined by /sbin/mcelog if the error + * count for one page is beyond the threshold. + * + * For fatal MCE, save MCE record into persistent storage via ERST, so + * that the MCE record can be logged after reboot via ERST. + * + * Copyright 2010 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/cper.h> +#include <acpi/apei.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +{ + struct mce m; + + /* Only corrected MC is reported */ + if (!corrected) + return; + + mce_setup(&m); + m.bank = 1; + /* Fake a memory read corrected error with unknown channel */ + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + m.addr = mem_err->physical_addr; + mce_log(&m); + mce_notify_irq(); +} +EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); + +#define CPER_CREATOR_MCE \ + UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) +#define CPER_SECTION_TYPE_MCE \ + UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) + +/* + * CPER specification (in UEFI specification 2.3 appendix N) requires + * byte-packed. + */ +struct cper_mce_record { + struct cper_record_header hdr; + struct cper_section_descriptor sec_hdr; + struct mce mce; +} __packed; + +int apei_write_mce(struct mce *m) +{ + struct cper_mce_record rcd; + + memset(&rcd, 0, sizeof(rcd)); + memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + rcd.hdr.revision = CPER_RECORD_REV; + rcd.hdr.signature_end = CPER_SIG_END; + rcd.hdr.section_count = 1; + rcd.hdr.error_severity = CPER_SER_FATAL; + /* timestamp, platform_id, partition_id are all invalid */ + rcd.hdr.validation_bits = 0; + rcd.hdr.record_length = sizeof(rcd); + rcd.hdr.creator_id = CPER_CREATOR_MCE; + rcd.hdr.notification_type = CPER_NOTIFY_MCE; + rcd.hdr.record_id = cper_next_record_id(); + rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; + + rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; + rcd.sec_hdr.section_length = sizeof(rcd.mce); + rcd.sec_hdr.revision = CPER_SEC_REV; + /* fru_id and fru_text is invalid */ + rcd.sec_hdr.validation_bits = 0; + rcd.sec_hdr.flags = CPER_SEC_PRIMARY; + rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; + rcd.sec_hdr.section_severity = CPER_SER_FATAL; + + memcpy(&rcd.mce, m, sizeof(*m)); + + return erst_write(&rcd.hdr); +} + +ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + struct cper_mce_record rcd; + ssize_t len; + + len = erst_read_next(&rcd.hdr, sizeof(rcd)); + if (len <= 0) + return len; + /* Can not skip other records in storage via ERST unless clear them */ + else if (len != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { + if (printk_ratelimit()) + pr_warning( + "MCE-APEI: Can not skip the unknown record in ERST"); + return -EIO; + } + + memcpy(m, &rcd.mce, sizeof(*m)); + *record_id = rcd.hdr.record_id; + + return sizeof(*m); +} + +/* Check whether there is record in ERST */ +int apei_check_mce(void) +{ + return erst_get_record_count(); +} + +int apei_clear_mce(u64 record_id) +{ + return erst_clear(record_id); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 32996f9fab67..fefcc69ee8b5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,3 +28,26 @@ extern int mce_ser; extern struct mce_bank *mce_banks; +#ifdef CONFIG_ACPI_APEI +int apei_write_mce(struct mce *m); +ssize_t apei_read_mce(struct mce *m, u64 *record_id); +int apei_check_mce(void); +int apei_clear_mce(u64 record_id); +#else +static inline int apei_write_mce(struct mce *m) +{ + return -EINVAL; +} +static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + return 0; +} +static inline int apei_check_mce(void) +{ + return 0; +} +static inline int apei_clear_mce(u64 record_id) +{ + return -EINVAL; +} +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7a355ddcc64b..707165dbc203 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -264,7 +264,7 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { - int i; + int i, apei_err = 0; if (!fake_panic) { /* @@ -287,8 +287,11 @@ static void mce_panic(char *msg, struct mce *final, char *exp) struct mce *m = &mcelog.entry[i]; if (!(m->status & MCI_STATUS_VAL)) continue; - if (!(m->status & MCI_STATUS_UC)) + if (!(m->status & MCI_STATUS_UC)) { print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -297,11 +300,17 @@ static void mce_panic(char *msg, struct mce *final, char *exp) continue; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || memcmp(m, final, sizeof(struct mce))) + if (!final || memcmp(m, final, sizeof(struct mce))) { print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } } - if (final) + if (final) { print_mce(final); + if (!apei_err) + apei_err = apei_write_mce(final); + } if (cpu_missing) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); print_mce_tail(); @@ -1493,6 +1502,43 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { @@ -1506,15 +1552,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, return -ENOMEM; mutex_lock(&mce_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + next = rcu_dereference_check_mce(mcelog.next); /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - - return -EINVAL; - } + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; err = 0; prev = 0; @@ -1562,10 +1612,15 @@ timeout: memset(&mcelog.entry[i], 0, sizeof(struct mce)); } } + + if (err) + err = -EFAULT; + +out: mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); - return err ? -EFAULT : buf - ubuf; + return err ? err : buf - ubuf; } static unsigned int mce_poll(struct file *file, poll_table *wait) @@ -1573,6 +1628,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait) poll_wait(file, &mce_wait, wait); if (rcu_dereference_check_mce(mcelog.next)) return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 81c499eceb21..e1a0a3bf9716 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -190,7 +190,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, mutex_unlock(&therm_cpu_lock); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fd4db0db3708..c77586061bcb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1717,7 +1717,11 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski */ regs->bp = rewind_frame_pointer(skip + 1); regs->cs = __KERNEL_CS; - local_save_flags(regs->flags); + /* + * We abuse bit 3 to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. + */ + regs->flags = 0; } unsigned long perf_instruction_pointer(struct pt_regs *regs) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 424fc8de68e4..ae85d69644d1 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -465,15 +465,21 @@ out: return rc; } -static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) +static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) { - unsigned long dummy; + int overflow = 0; + u32 low, high; - rdmsrl(hwc->config_base + hwc->idx, dummy); - if (dummy & P4_CCCR_OVF) { + rdmsr(hwc->config_base + hwc->idx, low, high); + + /* we need to check high bit for unflagged overflows */ + if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { + overflow = 1; (void)checking_wrmsrl(hwc->config_base + hwc->idx, - ((u64)dummy) & ~P4_CCCR_OVF); + ((u64)low) & ~P4_CCCR_OVF); } + + return overflow; } static inline void p4_pmu_disable_event(struct perf_event *event) @@ -584,21 +590,15 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) WARN_ON_ONCE(hwc->idx != idx); - /* - * FIXME: Redundant call, actually not needed - * but just to check if we're screwed - */ - p4_pmu_clear_cccr_ovf(hwc); + /* it might be unflagged overflow */ + handled = p4_pmu_clear_cccr_ovf(hwc); val = x86_perf_event_update(event); - if (val & (1ULL << (x86_pmu.cntval_bits - 1))) + if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) continue; - /* - * event overflow - */ - handled = 1; - data.period = event->hw.last_period; + /* event overflow for sure */ + data.period = event->hw.last_period; if (!x86_perf_event_set_period(event)) continue; @@ -670,7 +670,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) /* * ESCR address hashing is tricky, ESCRs are not sequential - * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and + * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and * the metric between any ESCRs is laid in range [0xa0,0xe1] * * so we make ~70% filled hashtable @@ -735,8 +735,9 @@ static int p4_get_escr_idx(unsigned int addr) { unsigned int idx = P4_ESCR_MSR_IDX(addr); - if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || - !p4_escr_table[idx])) { + if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || + !p4_escr_table[idx] || + p4_escr_table[idx] != addr)) { WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); return -1; } @@ -762,7 +763,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); struct hw_perf_event *hwc; struct p4_event_bind *bind; unsigned int i, thread, num; diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 8b862d5900fe..1b7b31ab7d86 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, cpuid_device_destroy(cpu); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __refdata cpuid_class_cpu_notifier = diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index b9c830c12b4a..fa99bae75ace 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -41,6 +41,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n) writew(0x720, VGABASE + 2*(max_xpos*j + i)); current_ypos = max_ypos-1; } +#ifdef CONFIG_KGDB_KDB + if (c == '\b') { + if (current_xpos > 0) + current_xpos--; + } else if (c == '\r') { + current_xpos = 0; + } else +#endif if (c == '\n') { current_xpos = 0; current_ypos++; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b2258ca91003..4f4af75b9482 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -47,20 +47,8 @@ #include <asm/debugreg.h> #include <asm/apicdef.h> #include <asm/system.h> - #include <asm/apic.h> -/* - * Put the error code here just in case the user cares: - */ -static int gdb_x86errcode; - -/* - * Likewise, the vector number here (since GDB only gets the signal - * number through the usual means, and that's not very specific): - */ -static int gdb_x86vector = -1; - /** * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs * @gdb_regs: A pointer to hold the registers in the order GDB wants. @@ -211,6 +199,8 @@ static struct hw_breakpoint { struct perf_event **pev; } breakinfo[4]; +static unsigned long early_dr7; + static void kgdb_correct_hw_break(void) { int breakno; @@ -222,6 +212,14 @@ static void kgdb_correct_hw_break(void) int cpu = raw_smp_processor_id(); if (!breakinfo[breakno].enabled) continue; + if (dbg_is_early) { + set_debugreg(breakinfo[breakno].addr, breakno); + early_dr7 |= encode_dr7(breakno, + breakinfo[breakno].len, + breakinfo[breakno].type); + set_debugreg(early_dr7, 7); + continue; + } bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); info = counter_arch_bp(bp); if (bp->attr.disabled != 1) @@ -236,7 +234,8 @@ static void kgdb_correct_hw_break(void) if (!val) bp->attr.disabled = 0; } - hw_breakpoint_restore(); + if (!dbg_is_early) + hw_breakpoint_restore(); } static int hw_break_reserve_slot(int breakno) @@ -245,6 +244,9 @@ static int hw_break_reserve_slot(int breakno) int cnt = 0; struct perf_event **pevent; + if (dbg_is_early) + return 0; + for_each_online_cpu(cpu) { cnt++; pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); @@ -270,6 +272,9 @@ static int hw_break_release_slot(int breakno) struct perf_event **pevent; int cpu; + if (dbg_is_early) + return 0; + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); if (dbg_release_bp_slot(*pevent)) @@ -314,7 +319,11 @@ static void kgdb_remove_all_hw_break(void) bp = *per_cpu_ptr(breakinfo[i].pev, cpu); if (bp->attr.disabled == 1) continue; - arch_uninstall_hw_breakpoint(bp); + if (dbg_is_early) + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + else + arch_uninstall_hw_breakpoint(bp); bp->attr.disabled = 1; } } @@ -391,6 +400,11 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) for (i = 0; i < 4; i++) { if (!breakinfo[i].enabled) continue; + if (dbg_is_early) { + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + continue; + } bp = *per_cpu_ptr(breakinfo[i].pev, cpu); if (bp->attr.disabled == 1) continue; @@ -399,23 +413,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) } } -/** - * kgdb_post_primary_code - Save error vector/code numbers. - * @regs: Original pt_regs. - * @e_vector: Original error vector. - * @err_code: Original error code. - * - * This is needed on architectures which support SMP and KGDB. - * This function is called after all the slave cpus have been put - * to a know spin state and the primary CPU has control over KGDB. - */ -void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - /* primary processor is completely in the debugger */ - gdb_x86vector = e_vector; - gdb_x86errcode = err_code; -} - #ifdef CONFIG_SMP /** * kgdb_roundup_cpus - Get other CPUs into a holding pattern @@ -567,7 +564,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_DONE; } - if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs)) + if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs)) return NOTIFY_DONE; /* Must touch watchdog before return to normal operation */ @@ -575,6 +572,26 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_STOP; } +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP +int kgdb_ll_trap(int cmd, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + if (!kgdb_io_module_registered) + return NOTIFY_DONE; + + return __kgdb_notify(&args, cmd); +} +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ + static int kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) { @@ -605,14 +622,15 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { + return register_die_notifier(&kgdb_notifier); +} + +void kgdb_arch_late(void) +{ int i, cpu; - int ret; struct perf_event_attr attr; struct perf_event **pevent; - ret = register_die_notifier(&kgdb_notifier); - if (ret != 0) - return ret; /* * Pre-allocate the hw breakpoint structions in the non-atomic * portion of kgdb because this operation requires mutexs to @@ -624,12 +642,15 @@ int kgdb_arch_init(void) attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) + continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); if (IS_ERR(breakinfo[i].pev)) { - printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + printk(KERN_ERR "kgdb: Could not allocate hw" + "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; kgdb_arch_exit(); - return -1; + return; } for_each_online_cpu(cpu) { pevent = per_cpu_ptr(breakinfo[i].pev, cpu); @@ -640,7 +661,6 @@ int kgdb_arch_init(void) } } } - return ret; } /** @@ -690,6 +710,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) return instruction_pointer(regs); } +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + struct kgdb_arch arch_kgdb_ops = { /* Breakpoint instruction: */ .gdb_bpt_instr = { 0xcc }, diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index feaeb0d3aa4f..eb9b76c716c2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -29,6 +29,8 @@ #define KVM_SCALE 22 static int kvmclock = 1; +static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static int parse_no_kvmclock(char *arg) { @@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void) low = (int)__pa_symbol(&wall_clock); high = ((u64)__pa_symbol(&wall_clock) >> 32); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); + + native_write_msr(msr_kvm_wall_clock, low, high); vcpu_time = &get_cpu_var(hv_clock); pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); @@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt) high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); + + return native_write_msr_safe(msr_kvm_system_time, low, high); } #ifdef CONFIG_X86_LOCAL_APIC @@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void) #ifdef CONFIG_KEXEC static void kvm_crash_shutdown(struct pt_regs *regs) { - native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_write_msr(msr_kvm_system_time, 0, 0); native_machine_crash_shutdown(regs); } #endif static void kvm_shutdown(void) { - native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_write_msr(msr_kvm_system_time, 0, 0); native_machine_shutdown(); } @@ -181,27 +185,37 @@ void __init kvmclock_init(void) if (!kvm_para_available()) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { - if (kvm_register_clock("boot clock")) - return; - pv_time_ops.sched_clock = kvm_clock_read; - x86_platform.calibrate_tsc = kvm_get_tsc_khz; - x86_platform.get_wallclock = kvm_get_wallclock; - x86_platform.set_wallclock = kvm_set_wallclock; + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) + return; + + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + + if (kvm_register_clock("boot clock")) + return; + pv_time_ops.sched_clock = kvm_clock_read; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = - kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP - smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; #endif - machine_ops.shutdown = kvm_shutdown; + machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC - machine_ops.crash_shutdown = kvm_crash_shutdown; + machine_ops.crash_shutdown = kvm_crash_shutdown; #endif - kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); - pv_info.paravirt_enabled = 1; - pv_info.name = "KVM"; - } + kvm_get_preset_lpj(); + clocksource_register(&kvm_clock); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 2cd8c544e41a..fa6551d36c10 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -260,6 +260,7 @@ static void microcode_dev_exit(void) } MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +MODULE_ALIAS("devname:cpu/microcode"); #else #define microcode_dev_init() 0 #define microcode_dev_exit() do { } while (0) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4d4468e9f47c..7bf2dc4c8f70 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, msr_device_destroy(cpu); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __refdata msr_class_cpu_notifier = { diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 7d2829dde20e..a5bc528d4328 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = { .free_coherent = swiotlb_free_coherent, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, - .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, .map_sg = swiotlb_map_sg_attrs, diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 03801f2f761f..239427ca02af 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -31,8 +31,16 @@ struct pvclock_shadow_time { u32 tsc_to_nsec_mul; int tsc_shift; u32 version; + u8 flags; }; +static u8 valid_flags __read_mostly = 0; + +void pvclock_set_flags(u8 flags) +{ + valid_flags = flags; +} + /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. @@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, dst->system_timestamp = src->system_time; dst->tsc_to_nsec_mul = src->tsc_to_system_mul; dst->tsc_shift = src->tsc_shift; + dst->flags = src->flags; rmb(); /* test version after fetching data */ } while ((src->version & 1) || (dst->version != src->version)); @@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) return pv_tsc_khz; } +static atomic64_t last_value = ATOMIC64_INIT(0); + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; unsigned version; cycle_t ret, offset; + u64 last; do { version = pvclock_get_time_values(&shadow, src); @@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) barrier(); } while (version != src->version); + if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && + (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) + return ret; + + /* + * Assumption here is that last_value, a global accumulator, always goes + * forward. If we are less than that, we should not be much smaller. + * We assume there is an error marging we're inside, and then the correction + * does not sacrifice accuracy. + * + * For reads: global may have changed between test and return, + * but this means someone else updated poked the clock at a later time. + * We just need to make sure we are not seeing a backwards event. + * + * For updates: last_value = ret is not enough, since two vcpus could be + * updating at the same time, and one of them could be slightly behind, + * making the assumption that last_value always go forward fail to hold. + */ + last = atomic64_read(&last_value); + do { + if (ret < last) + return last; + last = atomic64_cmpxchg(&last_value, last, ret); + } while (unlikely(last != ret)); + return ret; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4851eff57b3..b4ae4acbd031 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -676,6 +676,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), }, }, + /* + * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so + * match on the product name. + */ + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), + }, + }, #endif {} }; @@ -725,6 +736,7 @@ void __init setup_arch(char **cmdline_p) /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); + early_trap_init(); early_cpu_init(); early_ioremap_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef6370b00e70..a867940a6dfc 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -265,10 +265,10 @@ void __init setup_per_cpu_areas(void) #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) /* - * make sure boot cpu node_number is right, when boot cpu is on the + * make sure boot cpu numa_node is right, when boot cpu is on the * node that doesn't have mem installed */ - per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); + set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id)); #endif /* Setup node to cpumask map */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 763d815e27a0..37462f1ddba5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1215,9 +1215,17 @@ __init void prefill_possible_map(void) if (!num_processors) num_processors = 1; - if (setup_possible_cpus == -1) - possible = num_processors + disabled_cpus; - else + i = setup_max_cpus ?: 1; + if (setup_possible_cpus == -1) { + possible = num_processors; +#ifdef CONFIG_HOTPLUG_CPU + if (setup_max_cpus) + possible += disabled_cpus; +#else + if (possible > i) + possible = i; +#endif + } else possible = setup_possible_cpus; total_cpus = max_t(int, possible, num_processors + disabled_cpus); @@ -1230,11 +1238,23 @@ __init void prefill_possible_map(void) possible = nr_cpu_ids; } +#ifdef CONFIG_HOTPLUG_CPU + if (!setup_max_cpus) +#endif + if (possible > i) { + printk(KERN_WARNING + "%d Processors exceeds max_cpus limit of %u\n", + possible, setup_max_cpus); + possible = i; + } + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); for (i = 0; i < possible; i++) set_cpu_possible(i, true); + for (; i < NR_CPUS; i++) + set_cpu_possible(i, false); nr_cpu_ids = possible; } diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index cc2c60474fd0..c2f1b26141e2 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -46,6 +46,7 @@ /* Global pointer to shared data; NULL means no measured launch. */ struct tboot *tboot __read_mostly; +EXPORT_SYMBOL(tboot); /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 02cfb9b8f5b1..142d70c74b02 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -15,6 +15,7 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> +#include <linux/kgdb.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/ptrace.h> @@ -451,6 +452,11 @@ void restart_nmi(void) /* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP + if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) @@ -802,6 +808,16 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) } #endif +/* Set of traps needed for early debugging. */ +void __init early_trap_init(void) +{ + set_intr_gate_ist(1, &debug, DEBUG_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); + set_intr_gate(14, &page_fault); + load_idt(&idt_descr); +} + void __init trap_init(void) { int i; @@ -815,10 +831,7 @@ void __init trap_init(void) #endif set_intr_gate(0, ÷_error); - set_intr_gate_ist(1, &debug, DEBUG_STACK); set_intr_gate_ist(2, &nmi, NMI_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); /* int4 can be called from all */ set_system_intr_gate(4, &overflow); set_intr_gate(5, &bounds); @@ -834,7 +847,6 @@ void __init trap_init(void) set_intr_gate(11, &segment_not_present); set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); set_intr_gate(13, &general_protection); - set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); set_intr_gate(16, &coprocessor_error); set_intr_gate(17, &alignment_check); |