Diffstat (limited to 'arch/x86/kernel')
39 files changed, 566 insertions, 245 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a5408b965c9d..9b0a34e2cd79 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,6 +36,8 @@ obj-y += tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 
+obj-$(CONFIG_PREEMPT)	+= preempt.o
+
 obj-y			+= process.o
 obj-y			+= i387.o xsave.o
 obj-y			+= ptrace.o
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index c9876efecafb..af5b08ab3b71 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -40,7 +40,7 @@
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 #include <asm/time.h>
 
 #define APBT_CLOCKEVENT_RATING		110
@@ -157,13 +157,13 @@ static int __init apbt_clockevent_register(void)
 	adev->num = smp_processor_id();
 	adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
-		mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+		intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
 		APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
 		adev_virt_addr(adev), 0, apbt_freq);
 	/* Firmware does EOI handling for us. */
 	adev->timer->eoi = NULL;
 
-	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
 		global_clock_event = &adev->timer->ced;
 		printk(KERN_DEBUG "%s clockevent registered as global\n",
 		       global_clock_event->name);
@@ -253,7 +253,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 static __init int apbt_late_init(void)
 {
-	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
 		!apb_timer_block_enabled)
 		return 0;
 	/* This notifier should be called after workqueue is ready */
@@ -340,7 +340,7 @@ void __init apbt_time_init(void)
 	}
 #ifdef CONFIG_SMP
 	/* kernel cmdline disable apb timer, so we will use lapic timers */
-	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
 		printk(KERN_INFO "apbt: disabled per cpu timer\n");
 		return;
 	}
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 28610822fb3c..9f6b9341950f 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,6 @@ void common(void) {
 	OFFSET(TI_flags, thread_info, flags);
 	OFFSET(TI_status, thread_info, status);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
-	OFFSET(TI_preempt_count, thread_info, preempt_count);
 
 	BLANK();
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 903a264af981..3daece79a142 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -823,8 +823,8 @@ static const struct cpu_dev amd_cpu_dev = {
 	.c_vendor	= "AMD",
 	.c_ident	= { "AuthenticAMD" },
 #ifdef CONFIG_X86_32
-	.c_models = {
-		{ .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
+	.legacy_models = {
+		{ .family = 4, .model_names =
 		  {
 			[3] = "486 DX/2",
 			[7] = "486 DX/2-WB",
@@ -835,7 +835,7 @@ static const struct cpu_dev amd_cpu_dev = {
 		  }
 		},
 	},
-	.c_size_cache	= amd_size_cache,
+	.legacy_cache_size = amd_size_cache,
 #endif
 	.c_early_init	= early_init_amd,
 	.c_detect_tlb	= cpu_detect_tlb_amd,
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index fbf6c3bc2400..8d5652dc99dd 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -468,10 +468,10 @@ static void init_centaur(struct cpuinfo_x86 *c)
 #endif
 }
 
+#ifdef CONFIG_X86_32
 static unsigned int
 centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 {
-#ifdef CONFIG_X86_32
 	/* VIA C3 CPUs (670-68F) need further shifting. */
 	if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
 		size >>= 8;
@@ -484,16 +484,18 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 	if ((c->x86 == 6) && (c->x86_model == 9) &&
 	    (c->x86_mask == 1) && (size == 65))
 		size -= 1;
-#endif
 	return size;
 }
+#endif
 
 static const struct cpu_dev centaur_cpu_dev = {
 	.c_vendor	= "Centaur",
 	.c_ident	= { "CentaurHauls" },
 	.c_early_init	= early_init_centaur,
 	.c_init		= init_centaur,
-	.c_size_cache	= centaur_size_cache,
+#ifdef CONFIG_X86_32
+	.legacy_cache_size = centaur_size_cache,
+#endif
 	.c_x86_vendor	= X86_VENDOR_CENTAUR,
 };
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2793d1f095a2..6abc172b8258 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -346,7 +346,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
 /* Look up CPU names by table lookup. */
 static const char *table_lookup_model(struct cpuinfo_x86 *c)
 {
-	const struct cpu_model_info *info;
+#ifdef CONFIG_X86_32
+	const struct legacy_cpu_model_info *info;
 
 	if (c->x86_model >= 16)
 		return NULL;	/* Range check */
@@ -354,13 +355,14 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
 	if (!this_cpu)
 		return NULL;
 
-	info = this_cpu->c_models;
+	info = this_cpu->legacy_models;
 
-	while (info && info->family) {
+	while (info->family) {
 		if (info->family == c->x86)
 			return info->model_names[c->x86_model];
 		info++;
 	}
+#endif
 	return NULL;		/* Not found */
 }
@@ -450,8 +452,8 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
 	c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
 #else
 	/* do processor-specific cache resizing */
-	if (this_cpu->c_size_cache)
-		l2size = this_cpu->c_size_cache(c, l2size);
+	if (this_cpu->legacy_cache_size)
+		l2size = this_cpu->legacy_cache_size(c, l2size);
 
 	/* Allow user to override all this if necessary. */
 	if (cachesize_override != -1)
@@ -1095,6 +1097,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 /*
@@ -1169,6 +1174,8 @@ void debug_stack_reset(void)
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4041c24ae7db..c37dc37e8317 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,12 +1,6 @@
 #ifndef ARCH_X86_CPU_H
 #define ARCH_X86_CPU_H
 
-struct cpu_model_info {
-	int		vendor;
-	int		family;
-	const char	*model_names[16];
-};
-
 /* attempt to consolidate cpu attributes */
 struct cpu_dev {
 	const char	*c_vendor;
@@ -14,15 +8,23 @@ struct cpu_dev {
 	/* some have two possibilities for cpuid string */
 	const char	*c_ident[2];
 
-	struct cpu_model_info c_models[4];
-
 	void		(*c_early_init)(struct cpuinfo_x86 *);
 	void		(*c_bsp_init)(struct cpuinfo_x86 *);
 	void		(*c_init)(struct cpuinfo_x86 *);
 	void		(*c_identify)(struct cpuinfo_x86 *);
 	void		(*c_detect_tlb)(struct cpuinfo_x86 *);
-	unsigned int	(*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
 	int		c_x86_vendor;
+#ifdef CONFIG_X86_32
+	/* Optional vendor specific routine to obtain the cache size. */
+	unsigned int	(*legacy_cache_size)(struct cpuinfo_x86 *,
+					     unsigned int);
+
+	/* Family/stepping-based lookup table for model names. */
+	struct legacy_cpu_model_info {
+		int		family;
+		const char	*model_names[16];
+	}		legacy_models[5];
+#endif
 };
 
 struct _tlb_table {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ec7299566f79..dc1ec0dff939 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -665,8 +665,8 @@ static const struct cpu_dev intel_cpu_dev = {
 	.c_vendor	= "Intel",
 	.c_ident	= { "GenuineIntel" },
 #ifdef CONFIG_X86_32
-	.c_models = {
-		{ .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
+	.legacy_models = {
+		{ .family = 4, .model_names =
 		  {
 			[0] = "486 DX-25/33",
 			[1] = "486 DX-50",
@@ -679,7 +679,7 @@ static const struct cpu_dev intel_cpu_dev = {
 			[9] = "486 DX/4-WB"
 		  }
 		},
-		{ .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
+		{ .family = 5, .model_names =
 		  {
 			[0] = "Pentium 60/66 A-step",
 			[1] = "Pentium 60/66",
@@ -690,7 +690,7 @@ static const struct cpu_dev intel_cpu_dev = {
 			[8] = "Mobile Pentium MMX"
 		  }
 		},
-		{ .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
+		{ .family = 6, .model_names =
 		  {
 			[0] = "Pentium Pro A-step",
 			[1] = "Pentium Pro",
@@ -704,7 +704,7 @@ static const struct cpu_dev intel_cpu_dev = {
 			[11] = "Pentium III (Tualatin)",
 		  }
 		},
-		{ .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
+		{ .family = 15, .model_names =
 		  {
 			[0] = "Pentium 4 (Unknown)",
 			[1] = "Pentium 4 (Willamette)",
@@ -714,7 +714,7 @@ static const struct cpu_dev intel_cpu_dev = {
 		  }
 		},
 	},
-	.c_size_cache	= intel_size_cache,
+	.legacy_cache_size = intel_size_cache,
 #endif
 	.c_detect_tlb	= intel_detect_tlb,
 	.c_early_init	= early_init_intel,
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 71a39f3621ba..9f7ca266864a 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -15,6 +15,7 @@
 #include <linux/clocksource.h>
 #include <linux/module.h>
 #include <linux/hardirq.h>
+#include <linux/efi.h>
 #include <linux/interrupt.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
@@ -23,6 +24,8 @@
 #include <asm/desc.h>
 #include <asm/idle.h>
 #include <asm/irq_regs.h>
+#include <asm/i8259.h>
+#include <asm/apic.h>
 
 struct ms_hyperv_info ms_hyperv;
 EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -76,6 +79,30 @@ static void __init ms_hyperv_init_platform(void)
 	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
 	       ms_hyperv.features, ms_hyperv.hints);
 
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
+		/*
+		 * Get the APIC frequency.
+		 */
+		u64	hv_lapic_frequency;
+
+		rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+		hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
+		lapic_timer_frequency = hv_lapic_frequency;
+		printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
+				lapic_timer_frequency);
+
+		/*
+		 * On Hyper-V, when we are booting off an EFI firmware stack,
+		 * we do not have many legacy devices including PIC, PIT etc.
+		 */
+		if (efi_enabled(EFI_BOOT)) {
+			printk(KERN_INFO "HyperV: Using null_legacy_pic\n");
+			legacy_pic = &null_legacy_pic;
+		}
+	}
+#endif
+
 	if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
 		clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
 }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d8449158cf9..8e132931614d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1276,16 +1276,16 @@ void perf_events_lapic_init(void)
 static int __kprobes
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	int ret;
 	u64 start_clock;
 	u64 finish_clock;
+	int ret;
 
 	if (!atomic_read(&active_events))
 		return NMI_DONE;
 
-	start_clock = local_clock();
+	start_clock = sched_clock();
 	ret = x86_pmu.handle_irq(regs);
-	finish_clock = local_clock();
+	finish_clock = sched_clock();
 
 	perf_sample_event_took(finish_clock - start_clock);
@@ -1989,7 +1989,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		frame.return_address = 0;
 
 		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-		if (bytes != sizeof(frame))
+		if (bytes != 0)
 			break;
 
 		if (!valid_user_frame(fp, sizeof(frame)))
@@ -2041,7 +2041,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		frame.return_address = 0;
 
 		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-		if (bytes != sizeof(frame))
+		if (bytes != 0)
 			break;
 
 		if (!valid_user_frame(fp, sizeof(frame)))
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index cc16faae0538..fd00bb29425d 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -164,6 +164,11 @@ struct cpu_hw_events {
 	struct perf_guest_switch_msr	guest_switch_msrs[X86_PMC_IDX_MAX];
 
 	/*
+	 * Intel checkpoint mask
+	 */
+	u64				intel_cp_status;
+
+	/*
 	 * manage shared (per-core, per-cpu) registers
 	 * used on Intel NHM/WSM/SNB
 	 */
@@ -440,6 +445,7 @@ struct x86_pmu {
 	int		lbr_nr;			   /* hardware stack size */
 	u64		lbr_sel_mask;		   /* LBR_SELECT valid bits */
 	const int	*lbr_sel_map;		   /* lbr_select mappings */
+	bool		lbr_double_abort;	   /* duplicated lbr aborts */
 
 	/*
 	 * Extra registers for events
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f31a1655d1ff..0fa4f242f050 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -190,9 +190,9 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
 	EVENT_EXTRA_END
 };
 
-EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
-EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
+EVENT_ATTR_STR(mem-loads,	mem_ld_nhm,	"event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2");
 
 struct attribute *nhm_events_attrs[] = {
 	EVENT_PTR(mem_ld_nhm),
@@ -1184,6 +1184,11 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 	wrmsrl(hwc->config_base, ctrl_val);
 }
 
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+	return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
 static void intel_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -1197,6 +1202,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 
 	cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
 	cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+	cpuc->intel_cp_status &= ~(1ull << hwc->idx);
 
 	/*
 	 * must disable before any actual event
@@ -1271,6 +1277,9 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	if (event->attr.exclude_guest)
 		cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
 
+	if (unlikely(event_is_checkpointed(event)))
+		cpuc->intel_cp_status |= (1ull << hwc->idx);
+
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
 		intel_pmu_enable_fixed(hwc);
 		return;
@@ -1289,6 +1298,17 @@ static void intel_pmu_enable_event(struct perf_event *event)
 int intel_pmu_save_and_restart(struct perf_event *event)
 {
 	x86_perf_event_update(event);
+	/*
+	 * For a checkpointed counter always reset back to 0.  This
+	 * avoids a situation where the counter overflows, aborts the
+	 * transaction and is then set back to shortly before the
+	 * overflow, and overflows and aborts again.
+	 */
+	if (unlikely(event_is_checkpointed(event))) {
+		/* No race with NMIs because the counter should not be armed */
+		wrmsrl(event->hw.event_base, 0);
+		local64_set(&event->hw.prev_count, 0);
+	}
 	return x86_perf_event_set_period(event);
 }
@@ -1372,6 +1392,13 @@ again:
 		x86_pmu.drain_pebs(regs);
 	}
 
+	/*
+	 * Checkpointed counters can lead to 'spurious' PMIs because the
+	 * rollback caused by the PMI will have cleared the overflow status
+	 * bit. Therefore always force probe these counters.
+	 */
+	status |= cpuc->intel_cp_status;
+
 	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_event *event = cpuc->events[bit];
 
@@ -1837,6 +1864,20 @@ static int hsw_hw_config(struct perf_event *event)
 	      event->attr.precise_ip > 0))
 		return -EOPNOTSUPP;
 
+	if (event_is_checkpointed(event)) {
+		/*
+		 * Sampling of checkpointed events can cause situations where
+		 * the CPU constantly aborts because of a overflow, which is
+		 * then checkpointed back and ignored. Forbid checkpointing
+		 * for sampling.
+		 *
+		 * But still allow a long sampling period, so that perf stat
+		 * from KVM works.
+		 */
+		if (event->attr.sample_period > 0 &&
+		    event->attr.sample_period < 0x7fffffff)
+			return -EOPNOTSUPP;
+	}
 	return 0;
 }
@@ -2182,10 +2223,36 @@ static __init void intel_nehalem_quirk(void)
 	}
 }
 
-EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
+EVENT_ATTR_STR(mem-loads,	mem_ld_hsw,	"event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,	mem_st_hsw,	"event=0xd0,umask=0x82")
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start,	tx_start,	"event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit,	tx_commit,	"event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort,	tx_abort,	"event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity,	tx_capacity,	"event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict,	tx_conflict,	"event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start,	el_start,	"event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit,	el_commit,	"event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort,	el_abort,	"event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity,	el_capacity,	"event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict,	el_conflict,	"event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t,	cycles_t,	"event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct,	cycles_ct,	"event=0x3c,in_tx=1,in_tx_cp=1");
 
 static struct attribute *hsw_events_attrs[] = {
+	EVENT_PTR(tx_start),
+	EVENT_PTR(tx_commit),
+	EVENT_PTR(tx_abort),
+	EVENT_PTR(tx_capacity),
+	EVENT_PTR(tx_conflict),
+	EVENT_PTR(el_start),
+	EVENT_PTR(el_commit),
+	EVENT_PTR(el_abort),
+	EVENT_PTR(el_capacity),
+	EVENT_PTR(el_conflict),
+	EVENT_PTR(cycles_t),
+	EVENT_PTR(cycles_ct),
 	EVENT_PTR(mem_ld_hsw),
 	EVENT_PTR(mem_st_hsw),
 	NULL
@@ -2452,6 +2519,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.lbr_double_abort = true;
 		pr_cont("Haswell events, ");
 		break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ab3ba1c1b7dd..ae96cfa5eddd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -12,6 +12,7 @@
 
 #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
 #define PEBS_BUFFER_SIZE	PAGE_SIZE
+#define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 /*
  * pebs_record_32 for p4 and core not supported
@@ -182,18 +183,32 @@ struct pebs_record_nhm {
  * Same as pebs_record_nhm, with two additional fields.
  */
 struct pebs_record_hsw {
-	struct pebs_record_nhm nhm;
-	/*
-	 * Real IP of the event. In the Intel documentation this
-	 * is called eventingrip.
-	 */
-	u64 real_ip;
-	/*
-	 * TSX tuning information field: abort cycles and abort flags.
-	 */
-	u64 tsx_tuning;
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+	u64 status, dla, dse, lat;
+	u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+	struct {
+		u32 cycles_last_block	: 32,
+		    hle_abort		: 1,
+		    rtm_abort		: 1,
+		    instruction_abort	: 1,
+		    non_instruction_abort : 1,
+		    retry		: 1,
+		    data_conflict	: 1,
+		    capacity_writes	: 1,
+		    capacity_reads	: 1;
+	};
+	u64	    value;
 };
 
+#define PEBS_HSW_TSX_FLAGS	0xff00000000ULL
+
 void init_debug_store_on_cpu(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -214,12 +229,14 @@ void fini_debug_store_on_cpu(int cpu)
 	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 }
 
+static DEFINE_PER_CPU(void *, insn_buffer);
+
 static int alloc_pebs_buffer(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 	int node = cpu_to_node(cpu);
 	int max, thresh = 1; /* always use a single PEBS record */
-	void *buffer;
+	void *buffer, *ibuffer;
 
 	if (!x86_pmu.pebs)
 		return 0;
@@ -228,6 +245,19 @@ static int alloc_pebs_buffer(int cpu)
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
+	/*
+	 * HSW+ already provides us the eventing ip; no need to allocate this
+	 * buffer then.
+	 */
+	if (x86_pmu.intel_cap.pebs_format < 2) {
+		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+		if (!ibuffer) {
+			kfree(buffer);
+			return -ENOMEM;
+		}
+		per_cpu(insn_buffer, cpu) = ibuffer;
+	}
+
 	max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
 
 	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
@@ -248,6 +278,9 @@ static void release_pebs_buffer(int cpu)
 	if (!ds || !x86_pmu.pebs)
 		return;
 
+	kfree(per_cpu(insn_buffer, cpu));
+	per_cpu(insn_buffer, cpu) = NULL;
+
 	kfree((void *)(unsigned long)ds->pebs_buffer_base);
 	ds->pebs_buffer_base = 0;
 }
@@ -715,6 +748,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 	int is_64bit = 0;
+	void *kaddr;
 
 	/*
 	 * We don't need to fixup if the PEBS assist is fault like
@@ -738,7 +772,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	 * unsigned math, either ip is before the start (impossible) or
	 * the basic block is larger than 1 page (sanity)
 	 */
-	if ((ip - to) > PAGE_SIZE)
+	if ((ip - to) > PEBS_FIXUP_SIZE)
 		return 0;
 
 	/*
@@ -749,29 +783,33 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		return 1;
 	}
 
+	if (!kernel_ip(ip)) {
+		int size, bytes;
+		u8 *buf = this_cpu_read(insn_buffer);
+
+		size = ip - to; /* Must fit our buffer, see above */
+		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+		if (bytes != 0)
+			return 0;
+
+		kaddr = buf;
+	} else {
+		kaddr = (void *)to;
+	}
+
 	do {
 		struct insn insn;
-		u8 buf[MAX_INSN_SIZE];
-		void *kaddr;
 
 		old_to = to;
-		if (!kernel_ip(ip)) {
-			int bytes, size = MAX_INSN_SIZE;
-
-			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-			if (bytes != size)
-				return 0;
-
-			kaddr = buf;
-		} else
-			kaddr = (void *)to;
 
 #ifdef CONFIG_X86_64
 		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
 #endif
 		insn_init(&insn, kaddr, is_64bit);
 		insn_get_length(&insn);
+
+		to += insn.length;
+		kaddr += insn.length;
 	} while (to < ip);
 
 	if (to == ip) {
@@ -786,16 +824,34 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	return 0;
 }
 
+static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+{
+	if (pebs->tsx_tuning) {
+		union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+		return tsx.cycles_last_block;
+	}
+	return 0;
+}
+
+static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+{
+	u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+	/* For RTM XABORTs also log the abort code from AX */
+	if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
+		txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+	return txn;
+}
+
 static void __intel_pmu_pebs_event(struct perf_event *event,
 				   struct pt_regs *iregs, void *__pebs)
 {
 	/*
-	 * We cast to pebs_record_nhm to get the load latency data
-	 * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used
+	 * We cast to the biggest pebs_record but are careful not to
+	 * unconditionally access the 'extra' entries.
 	 */
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct pebs_record_nhm *pebs = __pebs;
-	struct pebs_record_hsw *pebs_hsw = __pebs;
+	struct pebs_record_hsw *pebs = __pebs;
 	struct perf_sample_data data;
 	struct pt_regs regs;
 	u64 sample_type;
@@ -854,7 +910,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	regs.sp = pebs->sp;
 
 	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
-		regs.ip = pebs_hsw->real_ip;
+		regs.ip = pebs->real_ip;
 		regs.flags |= PERF_EFLAGS_EXACT;
 	} else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
 		regs.flags |= PERF_EFLAGS_EXACT;
@@ -862,9 +918,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
 	if ((event->attr.sample_type & PERF_SAMPLE_ADDR) &&
-		x86_pmu.intel_cap.pebs_format >= 1)
+	    x86_pmu.intel_cap.pebs_format >= 1)
 		data.addr = pebs->dla;
 
+	if (x86_pmu.intel_cap.pebs_format >= 2) {
+		/* Only set the TSX weight when no memory weight. */
+		if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+			data.weight = intel_hsw_weight(pebs);
+
+		if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION)
+			data.txn = intel_hsw_transaction(pebs);
+	}
+
 	if (has_branch_stack(event))
 		data.br_stack = &cpuc->lbr_stack;
 
@@ -913,17 +978,34 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	__intel_pmu_pebs_event(event, iregs, at);
 }
 
-static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,
-					void *top)
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
 	struct perf_event *event = NULL;
+	void *at, *top;
 	u64 status = 0;
 	int bit;
 
+	if (!x86_pmu.pebs_active)
+		return;
+
+	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
 	ds->pebs_index = ds->pebs_buffer_base;
 
+	if (unlikely(at > top))
+		return;
+
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
+		  "Unexpected number of pebs records %ld\n",
+		  (long)(top - at) / x86_pmu.pebs_record_size);
+
 	for (; at < top; at += x86_pmu.pebs_record_size) {
 		struct pebs_record_nhm *p = at;
 
@@ -951,61 +1033,6 @@ static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,
 	}
 }
 
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct debug_store *ds = cpuc->ds;
-	struct pebs_record_nhm *at, *top;
-	int n;
-
-	if (!x86_pmu.pebs_active)
-		return;
-
-	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
-	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
-
-	ds->pebs_index = ds->pebs_buffer_base;
-
-	n = top - at;
-	if (n <= 0)
-		return;
-
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ONCE(n > x86_pmu.max_pebs_events,
-		  "Unexpected number of pebs records %d\n", n);
-
-	return __intel_pmu_drain_pebs_nhm(iregs, at, top);
-}
-
-static void intel_pmu_drain_pebs_hsw(struct pt_regs *iregs)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct debug_store *ds = cpuc->ds;
-	struct pebs_record_hsw *at, *top;
-	int n;
-
-	if (!x86_pmu.pebs_active)
-		return;
-
-	at  = (struct pebs_record_hsw *)(unsigned long)ds->pebs_buffer_base;
-	top = (struct pebs_record_hsw *)(unsigned long)ds->pebs_index;
-
-	n = top - at;
-	if (n <= 0)
-		return;
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ONCE(n > x86_pmu.max_pebs_events,
-		  "Unexpected number of pebs records %d\n", n);
-
-	return __intel_pmu_drain_pebs_nhm(iregs, at, top);
-}
-
 /*
  * BTS, PEBS probe and setup
  */
@@ -1040,7 +1067,7 @@ void intel_ds_init(void)
 		case 2:
 			pr_cont("PEBS fmt2%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
-			x86_pmu.drain_pebs = intel_pmu_drain_pebs_hsw;
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
 			break;
 
 		default:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d5be06a5005e..d82d155aca8c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -284,6 +284,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	int lbr_format = x86_pmu.intel_cap.lbr_format;
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
+	int out = 0;
 
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
@@ -306,15 +307,27 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		}
 		from = (u64)((((s64)from) << skip) >> skip);
 
-		cpuc->lbr_entries[i].from	= from;
-		cpuc->lbr_entries[i].to		= to;
-		cpuc->lbr_entries[i].mispred	= mis;
-		cpuc->lbr_entries[i].predicted	= pred;
-		cpuc->lbr_entries[i].in_tx	= in_tx;
-		cpuc->lbr_entries[i].abort	= abort;
-		cpuc->lbr_entries[i].reserved	= 0;
+		/*
+		 * Some CPUs report duplicated abort records,
+		 * with the second entry not having an abort bit set.
+		 * Skip them here. This loop runs backwards,
+		 * so we need to undo the previous record.
+		 * If the abort just happened outside the window
+		 * the extra entry cannot be removed.
+		 */
+		if (abort && x86_pmu.lbr_double_abort && out > 0)
+			out--;
+
+		cpuc->lbr_entries[out].from	 = from;
+		cpuc->lbr_entries[out].to	 = to;
+		cpuc->lbr_entries[out].mispred	 = mis;
+		cpuc->lbr_entries[out].predicted = pred;
+		cpuc->lbr_entries[out].in_tx	 = in_tx;
+		cpuc->lbr_entries[out].abort	 = abort;
+		cpuc->lbr_entries[out].reserved	 = 0;
+		out++;
 	}
-	cpuc->lbr_stack.nr = i;
+	cpuc->lbr_stack.nr = out;
 }
 
 void intel_pmu_lbr_read(void)
@@ -478,7 +491,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 
 		/* may fail if text not present */
 		bytes = copy_from_user_nmi(buf, (void __user *)from, size);
-		if (bytes != size)
+		if (bytes != 0)
 			return X86_BR_NONE;
 
 		addr = buf;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 4118f9f68315..29c248799ced 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -997,6 +997,20 @@ static int snbep_pci2phy_map_init(int devid)
 		}
 	}
 
+	if (!err) {
+		/*
+		 * For PCI bus with no UBOX device, find the next bus
+		 * that has UBOX device and use its mapping.
+		 */
+		i = -1;
+		for (bus = 255; bus >= 0; bus--) {
+			if (pcibus_to_physid[bus] >= 0)
+				i = pcibus_to_physid[bus];
+			else
+				pcibus_to_physid[bus] = i;
+		}
+	}
+
 	if (ubox_dev)
 		pci_dev_put(ubox_dev);
@@ -1099,6 +1113,24 @@ static struct attribute *ivt_uncore_qpi_formats_attr[] = {
 	&format_attr_umask.attr,
 	&format_attr_edge.attr,
 	&format_attr_thresh8.attr,
+	&format_attr_match_rds.attr,
+	&format_attr_match_rnid30.attr,
+	&format_attr_match_rnid4.attr,
+	&format_attr_match_dnid.attr,
+	&format_attr_match_mc.attr,
+	&format_attr_match_opc.attr,
+	&format_attr_match_vnw.attr,
+	&format_attr_match0.attr,
+	&format_attr_match1.attr,
+	&format_attr_mask_rds.attr,
+	&format_attr_mask_rnid30.attr,
+	&format_attr_mask_rnid4.attr,
+	&format_attr_mask_dnid.attr,
+	&format_attr_mask_mc.attr,
+	&format_attr_mask_opc.attr,
+	&format_attr_mask_vnw.attr,
+	&format_attr_mask0.attr,
+	&format_attr_mask1.attr,
 	NULL,
 };
@@ -1312,17 +1344,83 @@ static struct intel_uncore_type ivt_uncore_imc = {
 	IVT_UNCORE_PCI_COMMON_INIT(),
 };
 
+/* registers in IRP boxes are not properly aligned */
+static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx],
+			       hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 count = 0;
+
+	pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+	pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+	return count;
+}
+
+static struct intel_uncore_ops ivt_uncore_irp_ops = {
+	.init_box	= ivt_uncore_pci_init_box,
+	.disable_box	= snbep_uncore_pci_disable_box,
+	.enable_box	= snbep_uncore_pci_enable_box,
+	.disable_event	= ivt_uncore_irp_disable_event,
+	.enable_event	= ivt_uncore_irp_enable_event,
+	.read_counter	= ivt_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivt_uncore_irp = {
+	.name			= "irp",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.event_mask		= IVT_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
+	.ops			= &ivt_uncore_irp_ops,
+	.format_group		= &ivt_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivt_uncore_qpi_ops = {
+	.init_box	= ivt_uncore_pci_init_box,
+	.disable_box	= snbep_uncore_pci_disable_box,
+	.enable_box	= snbep_uncore_pci_enable_box,
+	.disable_event	= snbep_uncore_pci_disable_event,
+	.enable_event	= snbep_qpi_enable_event,
+	.read_counter	= snbep_uncore_pci_read_counter,
+	.hw_config	= snbep_qpi_hw_config,
+	.get_constraint	= uncore_get_constraint,
+	.put_constraint	= uncore_put_constraint,
+};
+
 static struct intel_uncore_type ivt_uncore_qpi = {
-	.name		= "qpi",
-	.num_counters   = 4,
-	.num_boxes	= 3,
-	.perf_ctr_bits	= 48,
-	.perf_ctr	= SNBEP_PCI_PMON_CTR0,
-	.event_ctl	= SNBEP_PCI_PMON_CTL0,
-	.event_mask	= IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
-	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,
-	.ops		= &ivt_uncore_pci_ops,
-	.format_group	= &ivt_uncore_qpi_format_group,
+	.name			= "qpi",
+	.num_counters		= 4,
+	.num_boxes		= 3,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCI_PMON_CTR0,
+	.event_ctl		= SNBEP_PCI_PMON_CTL0,
+	.event_mask		= IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &ivt_uncore_qpi_ops,
+	.format_group		= &ivt_uncore_qpi_format_group,
 };
 
 static struct intel_uncore_type ivt_uncore_r2pcie = {
@@ -1346,6 +1444,7 @@ static struct intel_uncore_type ivt_uncore_r3qpi = {
 enum {
 	IVT_PCI_UNCORE_HA,
 	IVT_PCI_UNCORE_IMC,
+	IVT_PCI_UNCORE_IRP,
 	IVT_PCI_UNCORE_QPI,
 	IVT_PCI_UNCORE_R2PCIE,
 	IVT_PCI_UNCORE_R3QPI,
@@ -1354,6 +1453,7 @@ enum {
 static struct intel_uncore_type *ivt_pci_uncores[] = {
 	[IVT_PCI_UNCORE_HA]	= &ivt_uncore_ha,
 	[IVT_PCI_UNCORE_IMC]	= &ivt_uncore_imc,
+	[IVT_PCI_UNCORE_IRP]	= &ivt_uncore_irp,
 	[IVT_PCI_UNCORE_QPI]	= &ivt_uncore_qpi,
 	[IVT_PCI_UNCORE_R2PCIE]	= &ivt_uncore_r2pcie,
 	[IVT_PCI_UNCORE_R3QPI]	= &ivt_uncore_r3qpi,
@@ -1401,6 +1501,10 @@ static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
 		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),
 	},
+	{ /* IRP */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0),
+	},
 	{ /* QPI0 Port 0 */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
 		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0),
@@ -1429,6 +1533,16 @@ static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
 		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),
 	},
+	{ /* QPI Port 0 filter */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   SNBEP_PCI_QPI_PORT0_FILTER),
+	},
+	{ /* QPI Port 0 filter */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   SNBEP_PCI_QPI_PORT1_FILTER),
+	},
 	{ /* end: all zeroes */ }
 };
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index aee6317b902f..06fe3ed8b851 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -11,15 +11,12 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 			      unsigned int cpu)
 {
 #ifdef CONFIG_SMP
-	if (c->x86_max_cores * smp_num_siblings > 1) {
-		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
-		seq_printf(m, "siblings\t: %d\n",
-			   cpumask_weight(cpu_core_mask(cpu)));
-		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
-		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
-		seq_printf(m, "apicid\t\t: %d\n", c->apicid);
-		seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
-	}
+	seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+	seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
+	seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+	seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+	seq_printf(m, "apicid\t\t: %d\n", c->apicid);
+	seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
 #endif
 }
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index 202759a14121..75c5ad5d35cc 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -11,8 +11,8 @@
 static const struct cpu_dev umc_cpu_dev = {
 	.c_vendor	= "UMC",
 	.c_ident	= { "UMC UMC UMC" },
-	.c_models = {
-		{ .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
+	.legacy_models	= {
+		{ .family = 4, .model_names =
 		  {
 			[1] = "U5D",
 			[2] = "U5S",
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e0e0841eef45..18677a90d6a3 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -127,12 +127,12 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 	cpu_emergency_vmxoff();
 	cpu_emergency_svm_disable();
 
-	lapic_shutdown();
 #ifdef CONFIG_X86_IO_APIC
 	/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
 	ioapic_zap_locks();
 	disable_IO_APIC();
 #endif
+	lapic_shutdown();
 #ifdef CONFIG_HPET_TIMER
 	hpet_disable();
 #endif
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index d15f575a861b..01d1c187c9f9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,9 +14,11 @@
 #include <xen/hvc-console.h>
 #include <asm/pci-direct.h>
 #include <asm/fixmap.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 #include <asm/pgtable.h>
 #include <linux/usb/ehci_def.h>
+#include <linux/efi.h>
+#include <asm/efi.h>
 
 /* Simple VGA output */
 #define VGABASE		(__ISA_IO_base + 0xb8000)
@@ -234,6 +236,11 @@ static int __init setup_early_printk(char *buf)
 			early_console_register(&early_hsu_console, keep);
 		}
 #endif
+#ifdef CONFIG_EARLY_PRINTK_EFI
+		if (!strncmp(buf, "efi", 3))
+			early_console_register(&early_efi_console, keep);
+#endif
+
 		buf++;
 	}
 	return 0;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0dcb0ceb6a2..fd1bc1b15e6d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -362,12 +362,9 @@ END(ret_from_exception)
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
 	DISABLE_INTERRUPTS(CLBR_ANY)
-	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
-	jnz restore_all
 need_resched:
-	movl TI_flags(%ebp), %ecx	# need_resched set ?
-	testb $_TIF_NEED_RESCHED, %cl
-	jz restore_all
+	cmpl $0,PER_CPU_VAR(__preempt_count)
+	jnz restore_all
 	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
 	jz restore_all
 	call preempt_schedule_irq
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b077f4cc225a..603be7c70675 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1103,10 +1103,8 @@ retint_signal:
 	/* Returning to kernel space. Check if we need preemption */
 	/* rcx:	 threadinfo. interrupts off. */
 ENTRY(retint_kernel)
-	cmpl $0,TI_preempt_count(%rcx)
+	cmpl $0,PER_CPU_VAR(__preempt_count)
 	jnz  retint_restore_args
-	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
-	jnc  retint_restore_args
 	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
 	call preempt_schedule_irq
@@ -1342,7 +1340,7 @@ bad_gs:
 	.previous
 
 /* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
+ENTRY(do_softirq_own_stack)
 	CFI_STARTPROC
 	pushq_cfi %rbp
 	CFI_REL_OFFSET rbp,0
@@ -1359,7 +1357,7 @@ ENTRY(call_softirq)
 	decl PER_CPU_VAR(irq_count)
 	ret
 	CFI_ENDPROC
-END(call_softirq)
+END(do_softirq_own_stack)
 
 #ifdef CONFIG_XEN
 zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 06f87bece92a..c61a14a4a310 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -35,8 +35,8 @@ asmlinkage void __init i386_start_kernel(void)
 
 	/* Call the subarch specific early setup function */
 	switch (boot_params.hdr.hardware_subarch) {
-	case X86_SUBARCH_MRST:
-		x86_mrst_early_setup();
+	case X86_SUBARCH_INTEL_MID:
+		x86_intel_mid_early_setup();
 		break;
 	case X86_SUBARCH_CE4100:
 		x86_ce4100_early_setup();
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 0fa69127209a..05fd74f537d6 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr);
 
 EXPORT_SYMBOL(csum_partial);
 EXPORT_SYMBOL(empty_zero_page);
+
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(___preempt_schedule);
+#ifdef CONFIG_CONTEXT_TRACKING
+EXPORT_SYMBOL(___preempt_schedule_context);
+#endif
+#endif
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 9a5c460404dc..2e977b5d61dd 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -312,8 +312,7 @@ static void init_8259A(int auto_eoi)
 	 */
 	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
 
-	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
-	   to 0x20-0x27 on i386 */
+	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
 	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
 
 	/* 8259A-1 (the master) has a slave on IR2 */
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 4186755f1d7c..d7fcbedc9c43 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 	irqctx->tinfo.task = curctx->tinfo.task;
 	irqctx->tinfo.previous_esp = current_stack_pointer;
 
-	/* Copy the preempt_count so that the [soft]irq checks work. */
-	irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
-
 	if (unlikely(overflow))
 		call_on_stack(print_stack_overflow, isp);
 
@@ -131,7 +128,6 @@ void irq_ctx_init(int cpu)
 					       THREAD_SIZE_ORDER));
 	memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
 	irqctx->tinfo.cpu		= cpu;
-	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
 	per_cpu(hardirq_ctx, cpu) = irqctx;
@@ -149,35 +145,21 @@ void irq_ctx_init(int cpu)
 	       cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
 }
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-	unsigned long flags;
 	struct thread_info *curctx;
 	union irq_ctx *irqctx;
 	u32 *isp;
 
-	if (in_interrupt())
-		return;
-
-	local_irq_save(flags);
-
-	if (local_softirq_pending()) {
-		curctx = current_thread_info();
-		irqctx = __this_cpu_read(softirq_ctx);
-		irqctx->tinfo.task = curctx->task;
-		irqctx->tinfo.previous_esp = current_stack_pointer;
-
-		/* build the stack frame on the softirq stack */
-		isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
+	curctx = current_thread_info();
+	irqctx = __this_cpu_read(softirq_ctx);
+	irqctx->tinfo.task = curctx->task;
+	irqctx->tinfo.previous_esp = current_stack_pointer;
 
-		call_on_stack(__do_softirq, isp);
-		/*
-		 * Shouldn't happen, we returned above if in_interrupt():
-		 */
-		WARN_ON_ONCE(softirq_count());
-	}
+	/* build the stack frame on the softirq stack */
+	isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
 
-	local_irq_restore(flags);
+	call_on_stack(__do_softirq, isp);
 }
 
 bool handle_irq(unsigned irq, struct pt_regs *regs)
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d04d3ecded62..4d1c746892eb 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -87,24 +87,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
 	generic_handle_irq_desc(irq, desc);
 	return true;
 }
-
-
-extern void call_softirq(void);
-
-asmlinkage void do_softirq(void)
-{
-	__u32 pending;
-	unsigned long flags;
-
-	if (in_interrupt())
-		return;
-
-	local_irq_save(flags);
-	pending = local_softirq_pending();
-	/* Switch to interrupt stack */
-	if (pending) {
-		call_softirq();
-		WARN_ON_ONCE(softirq_count());
-	}
-	local_irq_restore(flags);
-}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a0e2a8a80c94..b2046e4d0b59 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -609,7 +609,7 @@ static struct dentry *d_kvm_debug;
 
 struct dentry *kvm_init_debugfs(void)
 {
-	d_kvm_debug = debugfs_create_dir("kvm", NULL);
+	d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
 	if (!d_kvm_debug)
 		printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
 
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 88458faea2f8..05266b5aae22 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -46,7 +46,7 @@ static struct class *msr_class;
 static loff_t msr_seek(struct file *file, loff_t offset, int orig)
 {
 	loff_t ret;
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = file_inode(file);
 
 	mutex_lock(&inode->i_mutex);
 	switch (orig) {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ba77ebc2c353..6fcb49ce50a1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -113,10 +113,10 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
 		u64 before, delta, whole_msecs;
 		int remainder_ns, decimal_msecs, thishandled;
 
-		before = local_clock();
+		before = sched_clock();
 		thishandled = a->handler(type, regs);
 		handled += thishandled;
-		delta = local_clock() - before;
+		delta = sched_clock() - before;
 		trace_nmi_handler(a->handler, (int)delta, thishandled);
 
 		if (delta < nmi_longest_ns)
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
new file mode 100644
index 000000000000..ca7f0d58a87d
--- /dev/null
+++ b/arch/x86/kernel/preempt.S
@@ -0,0 +1,25 @@
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/asm.h>
+#include <asm/calling.h>
+
+ENTRY(___preempt_schedule)
+	CFI_STARTPROC
+	SAVE_ALL
+	call preempt_schedule
+	RESTORE_ALL
+	ret
+	CFI_ENDPROC
+
+#ifdef CONFIG_CONTEXT_TRACKING
+
+ENTRY(___preempt_schedule_context)
+	CFI_STARTPROC
+	SAVE_ALL
+	call preempt_schedule_context
+	RESTORE_ALL
+	ret
+	CFI_ENDPROC
+
+#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c83516be1052..3fb8d95ab8b5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -391,9 +391,9 @@ static void amd_e400_idle(void)
 		 * The switch back from broadcast mode needs to be
 		 * called with interrupts disabled.
 		 */
-		 local_irq_disable();
-		 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
-		 local_irq_enable();
+		local_irq_disable();
+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+		local_irq_enable();
 	} else
 		default_idle();
 }
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 884f98f69354..c2ec1aa6d454 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -292,6 +292,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		set_iopl_mask(next->iopl);
 
 	/*
+	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
+	 * preempt_count of all tasks was equal here and this would not be
+	 * needed.
+	 */
+	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
+	/*
 	 * Now maybe handle debug registers and/or IO bitmaps
 	 */
 	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bb1dc51bab05..45ab4d6fc8a7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	this_cpu_write(old_rsp, next->usersp);
 	this_cpu_write(current_task, next_p);
 
+	/*
+	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
+	 * preempt_count of all tasks was equal here and this would not be
+	 * needed.
+	 */
+	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
 	this_cpu_write(kernel_stack,
 		  (unsigned long)task_stack_page(next_p) +
 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 7e920bff99a3..618ce264b237 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -550,6 +550,10 @@ static void native_machine_emergency_restart(void)
 void native_machine_shutdown(void)
 {
 	/* Stop the cpus and apics */
+#ifdef CONFIG_X86_IO_APIC
+	disable_IO_APIC();
+#endif
+
 #ifdef CONFIG_SMP
 	/*
 	 * Stop all of the others.
 	 * Also disable the local irq to
@@ -562,10 +566,6 @@ void native_machine_shutdown(void)
 
 	lapic_shutdown();
 
-#ifdef CONFIG_X86_IO_APIC
-	disable_IO_APIC();
-#endif
-
 #ifdef CONFIG_HPET_TIMER
 	hpet_disable();
 #endif
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 0aa29394ed6f..ca9622a25e95 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -12,7 +12,7 @@
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
 #include <asm/time.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 #include <asm/rtc.h>
 
 #ifdef CONFIG_X86_32
@@ -189,9 +189,17 @@ static __init int add_rtc_cmos(void)
 		return 0;
 
 	/* Intel MID platforms don't have ioport rtc */
-	if (mrst_identify_cpu())
+	if (intel_mid_identify_cpu())
 		return -ENODEV;
 
+#ifdef CONFIG_ACPI
+	if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
+		/* This warning can likely go away again in a year or two. */
+		pr_info("ACPI: not registering RTC platform device\n");
+		return -ENODEV;
+	}
+#endif
+
 	platform_device_register(&rtc_device);
 	dev_info(&rtc_device.dev,
 		 "registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6cacab671f9b..2a165580fa16 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,11 +73,10 @@
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
-
 #include <asm/smpboot_hooks.h>
 #include <asm/i8259.h>
-
 #include <asm/realmode.h>
+#include <asm/misc.h>
 
 /* State of each CPU */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
@@ -648,22 +647,46 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	return (send_status | accept_status);
 }
 
+void smp_announce(void)
+{
+	int num_nodes = num_online_nodes();
+
+	printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
+	       num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
+}
+
 /* reduce the number of lines printed when booting a large cpu count system */
 static void announce_cpu(int cpu, int apicid)
 {
 	static int current_node = -1;
 	int node = early_cpu_to_node(cpu);
-	int max_cpu_present = find_last_bit(cpumask_bits(cpu_present_mask), NR_CPUS);
+	static int width, node_width;
+
+	if (!width)
+		width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
+
+	if (!node_width)
+		node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
+
+	if (cpu == 1)
+		printk(KERN_INFO "x86: Booting SMP configuration:\n");
 
 	if (system_state == SYSTEM_BOOTING) {
 		if (node != current_node) {
 			if (current_node > (-1))
-				pr_cont(" OK\n");
+				pr_cont("\n");
 			current_node = node;
-			pr_info("Booting Node %3d, Processors ", node);
+
+			printk(KERN_INFO ".... node %*s#%d, CPUs:  ",
+			       node_width - num_digits(node), " ", node);
 		}
-		pr_cont(" #%4d%s", cpu, cpu == max_cpu_present ?
" OK\n" : ""); - return; + + /* Add padding for the BSP */ + if (cpu == 1) + pr_cont("%*s", width + 1, " "); + + pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); + } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", node, cpu, apicid); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8c8093b146ca..729aa779ff75 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -88,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs) static inline void preempt_conditional_sti(struct pt_regs *regs) { - inc_preempt_count(); + preempt_count_inc(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -103,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - dec_preempt_count(); + preempt_count_dec(); } static int __kprobes diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 10c4f3006afd..da6b35a98260 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -199,6 +199,15 @@ SECTIONS __x86_cpu_dev_end = .; } +#ifdef CONFIG_X86_INTEL_MID + .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \ + LOAD_OFFSET) { + __x86_intel_mid_dev_start = .; + *(.x86_intel_mid_dev.init) + __x86_intel_mid_dev_end = .; + } +#endif + /* * start address and size of operations which during runtime * can be patched with virtualization friendly instructions or diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index b014d9414d08..040681928e9d 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT EXPORT_SYMBOL(native_load_gs_index); #endif + +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(___preempt_schedule); +#ifdef CONFIG_CONTEXT_TRACKING +EXPORT_SYMBOL(___preempt_schedule_context); +#endif +#endif |