From 72c930dcfc2b49404ee9e20f6c868402e9c71166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 18 Sep 2015 17:54:29 +0200 Subject: x86: kvmclock: abolish PVCLOCK_COUNTS_FROM_ZERO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore. The purpose of that flag was to start counting system time from 0 when the KVM clock has been initialized. We can achieve the same by selecting one read as the initial point. A simple subtraction will work unless the KVM clock count overflows earlier (has smaller width) than the scheduler's cycle count. We should be safe till x86_128. Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors, setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might regress on older ones. I presume we don't need to change kvm_clock_read instead of introducing kvm_sched_clock_read. A problem could arise in case sched_clock is expected to return the same value as get_cycles, but we should have merged those clocks in that case. Signed-off-by: Radim Krčmář Acked-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2c7aafa70702..2bd81e302427 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -248,7 +272,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -265,16 +299,6 @@ void __init kvmclock_init(void) kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(~0); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - if (flags & PVCLOCK_COUNTS_FROM_ZERO) - set_sched_clock_stable(); - put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) -- cgit v1.2.3
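For illustration, the offset trick in kvm_sched_clock_init() amounts to the following. A minimal user-space sketch of the same idea, where clock_ns() is a stand-in for kvm_clock_read() and not the kernel implementation:

```c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Stand-in for kvm_clock_read(): any monotonically increasing counter. */
static uint64_t clock_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static uint64_t sched_clock_offset;

/* One-time init: select the current reading as the initial point. */
static void sched_clock_init(void)
{
	sched_clock_offset = clock_ns();
}

/* Every later read counts from zero, which is what PVCLOCK_COUNTS_FROM_ZERO
 * used to guarantee on the hypervisor side. */
static uint64_t sched_clock_read(void)
{
	return clock_ns() - sched_clock_offset;
}

int main(void)
{
	sched_clock_init();
	printf("%llu ns since init\n", (unsigned long long)sched_clock_read());
	return 0;
}
```

The subtraction stays correct as long as the underlying counter does not wrap before the consumer's cycle counter would, which is the overflow caveat noted above.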
From d81056b5278c520f1dede99bd59c14366ac8b1e1 Mon Sep 17 00:00:00 2001 From: Lukasz Anaczkowski Date: Wed, 9 Sep 2015 15:47:29 +0200 Subject: x86, ACPI: Handle apic/x2apic entries in MADT in correct order ACPI specifies the following rules when listing APIC IDs: (1) Boot processor is listed first (2) For multi-threaded processors, BIOS should list the first logical processor of each of the individual multi-threaded processors in MADT before listing any of the second logical processors. (3) APIC IDs < 0xFF should be listed in APIC subtable, APIC IDs >= 0xFF should be listed in X2APIC subtable Because of the above, when there are more than 0xFF logical CPUs, BIOS interleaves APIC/X2APIC subtables. Assuming there are 72 cores with 4 hyper-threads each, 288 CPUs total, the listing is like this: APIC (0,4,8,...,252) X2APIC (256,260,264,...,284) APIC (1,5,9,...,253) X2APIC (257,261,265,...,285) APIC (2,6,10,...,254) X2APIC (258,262,266,...,286) APIC (3,7,11,...,251) X2APIC (255,259,263,...,287) Now, before this patch, due to how ACPI MADT subtables were parsed (BSP then X2APIC then APIC), the kernel enumerated CPUs in reversed order (i.e. high APIC IDs were getting low logical IDs, and low APIC IDs were getting high logical IDs). This is wrong for the following reasons: () it's hard to predict how cores and threads are enumerated () when it's hard to predict, s/w threads cannot be properly affinitized, causing significant performance impact due to e.g. improper cache sharing () enumeration is inconsistent with how threads are enumerated on other Intel Xeon processors So, the order in which the MADT APIC/X2APIC handlers are passed is reversed, and both handlers are passed to be called during the same MADT table walk, to achieve correct CPU enumeration. In the scenario where someone boots the kernel with the options 'maxcpus=72 nox2apic', fewer cores may be booted, since some of the CPUs the kernel will try to use will have APIC IDs >= 0xFF. In such a case, one should not pass 'nox2apic'. Disclaimer: code parsing MADT APIC/X2APIC has not been touched since 2009, when X2APIC support was initially added. I do not know why MADT parsing code was added in the reversed order in the first place. I guess it didn't matter at that time since nobody cared about cores with APIC IDs >= 0xFF, right? This patch is based on work of Yinghai Lu, previously published at https://lkml.org/lkml/2013/1/21/563 Here's the explanation of why the parsing interface needs to be changed and why a simpler approach will not work: https://lkml.org/lkml/2015/9/7/285 Signed-off-by: Lukasz Anaczkowski Acked-by: Thomas Gleixner (commit message) Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ded848c20e05..e75907601a41 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -976,6 +976,8 @@ static int __init acpi_parse_madt_lapic_entries(void) { int count; int x2count = 0; + int ret; + struct acpi_subtable_proc madt_proc[2]; if (!cpu_has_apic) return -ENODEV; @@ -999,10 +1001,22 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_LOCAL_APIC); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_LOCAL_APIC); + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); -- cgit v1.2.3
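The key change is structural: instead of one table walk per subtable type, a single walk dispatches each entry to whichever handler matches, preserving the firmware's interleaved order. A self-contained sketch of that dispatch pattern, with hypothetical types standing in for the ACPI structures:

```c
#include <stdio.h>

enum subtable_type { LOCAL_APIC, LOCAL_X2APIC };

struct subtable { enum subtable_type type; int apic_id; };

struct subtable_proc {
	enum subtable_type id;
	void (*handler)(const struct subtable *);
	int count;
};

static void parse_lapic(const struct subtable *s)  { printf("APIC   %d\n", s->apic_id); }
static void parse_x2apic(const struct subtable *s) { printf("X2APIC %d\n", s->apic_id); }

/* One pass: entries are dispatched in firmware order, whatever their type. */
static void walk(const struct subtable *t, int n,
		 struct subtable_proc *proc, int nproc)
{
	for (int i = 0; i < n; i++)
		for (int j = 0; j < nproc; j++)
			if (t[i].type == proc[j].id) {
				proc[j].handler(&t[i]);
				proc[j].count++;
			}
}

int main(void)
{
	/* Interleaved listing, as BIOS produces for more than 255 CPUs. */
	const struct subtable madt[] = {
		{ LOCAL_APIC, 0 }, { LOCAL_X2APIC, 256 },
		{ LOCAL_APIC, 1 }, { LOCAL_X2APIC, 257 },
	};
	struct subtable_proc proc[] = {
		{ LOCAL_APIC,   parse_lapic,  0 },
		{ LOCAL_X2APIC, parse_x2apic, 0 },
	};
	walk(madt, 4, proc, 2);
	printf("count=%d x2count=%d\n", proc[0].count, proc[1].count);
	return 0;
}
```

Two sequential walks (first all X2APIC, then all APIC) would print the same entries but in a different global order, which is exactly the enumeration problem the patch fixes.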
From 883a1e867e0fe7c2dc2e5844ef692f80177631d5 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Thu, 17 Sep 2015 00:19:42 +0800 Subject: ftrace: Calculate the correct dyn_ftrace number to report to the userspace Now, ftrace only calculates the dyn_ftrace number in the add-breakpoint loop, not in the add-update and finish-update loops. Calculate the correct dyn_ftrace number in each loop, so that an accurate count is given once ftrace reports the failure message to the userspace.
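The underlying pattern is that a progress counter is only meaningful for error reporting if it is reset at the start of each phase it describes. A toy sketch of that reporting logic, with do_op() as a hypothetical stand-in for the per-record operations (not the ftrace code itself):

```c
#include <stdio.h>

/* Hypothetical per-record operation; fails on record 3 of phase 1
 * purely for demonstration. Returns 0 on success. */
static int do_op(int rec, int phase)
{
	return (phase == 1 && rec == 3) ? -1 : 0;
}

static int replace_code(int nrecs)
{
	const char *report;
	int count, rec, phase;

	for (phase = 0; phase < 3; phase++) {
		report = phase == 0 ? "adding breakpoints" :
			 phase == 1 ? "updating code" : "removing breakpoints";
		count = 0;	/* reset, so a failure reports this phase's own progress */
		for (rec = 0; rec < nrecs; rec++) {
			if (do_op(rec, phase)) {
				printf("failed while %s, after %d records\n",
				       report, count);
				return -1;
			}
			count++;
		}
	}
	return 0;
}

int main(void)
{
	return replace_code(8) ? 1 : 0;
}
```

Without the per-phase reset, a failure in the second or third loop would report the count left over from the first loop, which is the bug being fixed.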
Link: http://lkml.kernel.org/r/1442420382-13130-1-git-send-email-mnfhuang@gmail.com Signed-off-by: Minfei Huang Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8b7b0a51e742..311bcf338f07 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -556,6 +556,7 @@ void ftrace_replace_code(int enable) run_sync(); report = "updating code"; + count = 0; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); @@ -563,11 +564,13 @@ void ftrace_replace_code(int enable) ret = add_update(rec, enable); if (ret) goto remove_breakpoints; + count++; } run_sync(); report = "removing breakpoints"; + count = 0; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); @@ -575,6 +578,7 @@ void ftrace_replace_code(int enable) ret = finish_update(rec, enable); if (ret) goto remove_breakpoints; + count++; } run_sync(); -- cgit v1.2.3 From e2391a2dcaca41c6de0fbc25329969fc7142f20d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 5 Nov 2015 15:18:03 -0600 Subject: livepatch: Fix crash with !CONFIG_DEBUG_SET_MODULE_RONX When loading a patch module on a kernel with !CONFIG_DEBUG_SET_MODULE_RONX, the following crash occurs: [ 205.988776] livepatch: enabling patch 'kpatch_meminfo_string' [ 205.989829] BUG: unable to handle kernel paging request at ffffffffa08d2fc0 [ 205.989863] IP: [] do_init_module+0x8c/0x1ba [ 205.989888] PGD 1a10067 PUD 1a11063 PMD 7bcde067 PTE 3740e161 [ 205.989915] Oops: 0003 [#1] SMP [ 205.990187] CPU: 2 PID: 14570 Comm: insmod Tainted: G O K 4.1.12 [ 205.990214] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 [ 205.990249] task: ffff8800374aaa90 ti: ffff8800794b8000 task.ti: ffff8800794b8000 [ 205.990276] RIP: 0010:[] [] do_init_module+0x8c/0x1ba [ 205.990307] RSP: 0018:ffff8800794bbd58 EFLAGS: 00010246 [ 205.990327] RAX: 0000000000000000 RBX: ffffffffa08d2fc0 RCX: 0000000000000000 [ 205.990356] RDX: 01ffff8000000080 RSI: 0000000000000000 RDI: ffffffff81a54b40 [ 205.990382] RBP: ffff88007b4c4d80 R08: 0000000000000007 R09: 0000000000000000 [ 205.990408] R10: 0000000000000008 R11: ffffea0001f18840 R12: 0000000000000000 [ 205.990433] R13: 0000000000000001 R14: ffffffffa08d2fc0 R15: ffff88007bd0bc40 [ 205.990459] FS: 00007f1128fbc700(0000) GS:ffff88007fc80000(0000) knlGS:0000000000000000 [ 205.990488] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 205.990509] CR2: ffffffffa08d2fc0 CR3: 000000002606e000 CR4: 00000000001406e0 [ 205.990536] Stack: [ 205.990545] ffff8800794bbec8 0000000000000001 ffffffffa08d3010 ffffffff810ecea9 [ 205.990576] ffffffff810e8e40 000000000005f360 ffff88007bd0bc50 ffffffffa08d3240 [ 205.990608] ffffffffa08d52c0 ffffffffa08d3210 ffff8800794bbed8 ffff8800794bbf1c [ 205.990639] Call Trace: [ 205.990651] [] ? load_module+0x1e59/0x23a0 [ 205.990672] [] ? store_uevent+0x40/0x40 [ 205.990693] [] ? copy_module_from_fd.isra.49+0xb5/0x140 [ 205.990718] [] ? SyS_finit_module+0x7d/0xa0 [ 205.990741] [] ? 
system_call_fastpath+0x16/0x75 [ 205.990763] Code: f9 00 00 00 74 23 49 c7 c0 92 e1 60 81 48 8d 53 18 89 c1 4c 89 c6 48 c7 c7 f0 85 7d 81 31 c0 e8 71 fa ff ff e8 58 0e 00 00 31 f6 03 00 00 00 00 48 89 da 48 c7 c7 20 c7 a5 81 e8 d0 ec b3 ff [ 205.990916] RIP [] do_init_module+0x8c/0x1ba [ 205.990940] RSP [ 205.990953] CR2: ffffffffa08d2fc0 With !CONFIG_DEBUG_SET_MODULE_RONX, module text and rodata pages are writable, and the debug_align() macro allows the module struct to share a page with executable text. When klp_write_module_reloc() calls set_memory_ro() on the page, it effectively turns the module struct into a read-only structure, resulting in a page fault when load_module() does "mod->state = MODULE_STATE_LIVE". Reported-by: Cyril B. Tested-by: Cyril B. Signed-off-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/kernel/livepatch.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c index ff3c3101d003..d1d35ccffed3 100644 --- a/arch/x86/kernel/livepatch.c +++ b/arch/x86/kernel/livepatch.c @@ -42,7 +42,6 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, bool readonly; unsigned long val; unsigned long core = (unsigned long)mod->module_core; - unsigned long core_ro_size = mod->core_ro_size; unsigned long core_size = mod->core_size; switch (type) { @@ -70,10 +69,12 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, /* loc does not point to any symbol inside the module */ return -EINVAL; - if (loc < core + core_ro_size) + readonly = false; + +#ifdef CONFIG_DEBUG_SET_MODULE_RONX + if (loc < core + mod->core_ro_size) readonly = true; - else - readonly = false; +#endif /* determine if the relocation spans a page boundary */ numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; -- cgit v1.2.3
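The page-boundary test visible at the end of this diff is a reusable idiom: a write spans two pages exactly when the first address and the one-past-last address round down to different pages. A standalone sketch, assuming 4 KiB pages:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ul
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* How many pages does a write of 'size' bytes at 'loc' touch: 1 or 2? */
static int numpages(uintptr_t loc, unsigned long size)
{
	return ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
}

int main(void)
{
	printf("%d\n", numpages(0x1000, 4));	/* 1: stays within one page */
	printf("%d\n", numpages(0x1ffe, 4));	/* 2: crosses the 0x2000 boundary */
	return 0;
}
```

Note the test is deliberately conservative: a write whose last byte ends exactly at a boundary still reports 2, matching the kernel's form of the check.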
From d0164adc89f6bb374d304ffcc375c6d2652fe67d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:21 -0800 Subject: mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd __GFP_WAIT has been used to identify atomic context in callers that hold spinlocks or are in interrupts. They are expected to be high priority and have access to one of two watermarks lower than "min" which can be referred to as the "atomic reserve". __GFP_HIGH users get access to the first lower watermark and can be called the "high priority reserve". Over time, callers had a requirement to not block when fallback options were available. Some have abused __GFP_WAIT, leading to a situation where an optimistic allocation with a fallback option can access atomic reserves. This patch uses __GFP_ATOMIC to identify callers that are truly atomic, cannot sleep and have no alternative. High priority users continue to use __GFP_HIGH. __GFP_DIRECT_RECLAIM identifies callers that can sleep and are willing to enter direct reclaim. __GFP_KSWAPD_RECLAIM identifies callers that want to wake kswapd for background reclaim. __GFP_WAIT is redefined as a caller that is willing to enter direct reclaim and wake kswapd for background reclaim. This patch then converts a number of sites: o __GFP_ATOMIC is used by callers that are high priority and have memory pools for those requests. GFP_ATOMIC uses this flag. o Callers that have a limited mempool to guarantee forward progress clear __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall into this category where kswapd will still be woken but atomic reserves are not used as there is a one-entry mempool to guarantee progress. o Callers that are checking if they are non-blocking should use the helper gfpflags_allow_blocking() where possible. This is because checking for __GFP_WAIT as was done historically now can trigger false positives. Some exceptions like dm-crypt.c exist where the code intent is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to flag manipulations. o Callers that built their own GFP flags instead of starting with GFP_KERNEL and friends now also need to specify __GFP_KSWAPD_RECLAIM. The first key hazard to watch out for is callers that removed __GFP_WAIT and were depending on access to atomic reserves for inconspicuous reasons. In some cases it may be appropriate for them to use __GFP_HIGH. The second key hazard is callers that assembled their own combination of GFP flags instead of starting with something like GFP_KERNEL. They may now wish to specify __GFP_KSWAPD_RECLAIM. It's almost certainly harmless if it's missed in most cases as other activity will wake kswapd. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cd99433b8ba1..6ba014c61d62 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) { + if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); -- cgit v1.2.3 From 78e3c7951021b4e1a554b3d619506b55b0619073 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 6 Nov 2015 16:31:08 -0800 Subject: arch/x86/kernel/cpu/perf_event_msr.c: use sign_extend64() for sign extension Signed-off-by: Martin Kepplinger Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: George Spelvin Cc: Rasmus Villemoes Cc: Maxime Coquelin Cc: Denys Vlasenko Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_msr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index f32ac13934f2..ec863b9a9f78 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -163,10 +163,9 @@ again: goto again; delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { - delta <<= 32; - delta >>= 32; /* sign extend */ - } + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + local64_add(now - prev, &event->count); } -- cgit v1.2.3
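sign_extend64(value, index) treats bit 'index' of 'value' as the sign bit and propagates it upward. A small re-implementation for illustration (mirroring the semantics of the kernel helper as used here, not its source), next to the equivalent open-coded signed-shift form:

```c
#include <stdint.h>
#include <stdio.h>

/* Illustrative re-implementation: sign bit at position 'index'. */
static int64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;
	return (int64_t)(value << shift) >> shift;
}

int main(void)
{
	uint64_t delta = 0xffffffffull;	/* a 32-bit counter reading of -1 */

	/* Open-coded signed-shift form: */
	int64_t a = (int64_t)(delta << 32) >> 32;
	/* Helper form, with the sign bit at bit 31: */
	int64_t b = sign_extend64(delta, 31);

	printf("%lld %lld\n", (long long)a, (long long)b);	/* both -1 */
	return 0;
}
```

The helper replaces the shift pair with a named, self-documenting operation; both rely on arithmetic right shift of a signed value to copy the sign bit down.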
From 354dbaa7ff5b53a0ed1c0f7a9773d5953b3a1bb9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 8 Oct 2015 18:56:26 +0300 Subject: x86/cpu/intel: Enable X86_FEATURE_NONSTOP_TSC_S3 for Merrifield The Intel Merrifield SoC is a successor of the Intel MID line of SoCs. Let's set the necessary capability for that chip. See commit c54fdbb2823d (x86: Add cpu capability flag X86_FEATURE_NONSTOP_TSC_S3) for the details. Signed-off-by: Andy Shevchenko Link: http://lkml.kernel.org/r/1444319786-36125-1-git-send-email-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 98a13db5f4be..209ac1e7d1f0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x27: /* Penwell */ case 0x35: /* Cloverview */ + case 0x4a: /* Merrifield */ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: -- cgit v1.2.3 From 8c058b0b9c34d8c8d7912880956543769323e2d8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 3 Nov 2015 10:40:14 +0100 Subject: x86/irq: Probe for PIC presence before allocating descs for legacy IRQs Commit d32932d02e18 ("x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces") brought a regression for Hyper-V Gen2 instances. These instances don't have an i8259 legacy PIC but they use legacy IRQs for serial port, rtc, and acpi. With this commit included we end up with these IRQs not initialized. Earlier, there was a special workaround for legacy IRQs in mp_map_pin_to_irq() doing mp_irqdomain_map() without looking at nr_legacy_irqs() and now we fail in __irq_domain_alloc_irqs() when irq_domain_alloc_descs() returns -EEXIST. The essence of the issue seems to be that early_irq_init() calls arch_probe_nr_irqs() to figure out the number of legacy IRQs before we probe for i8259 and gets 16. Later when init_8259A() is called we switch to the NULL legacy PIC and nr_legacy_irqs() starts to return 0 but we already have 16 descs allocated. Solve the issue by separating the i8259 probe from init and calling it in arch_probe_nr_irqs() before we actually use nr_legacy_irqs() information. Fixes: d32932d02e18 ("x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces") Signed-off-by: Vitaly Kuznetsov Cc: Jiang Liu Cc: K. Y. Srinivasan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1446543614-3621-1-git-send-email-vkuznets@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 6 +++++- arch/x86/kernel/i8259.c | 29 +++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 836d11b92811..861bc59c8f25 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -361,7 +361,11 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return nr_legacy_irqs(); + /* + * We don't know if PIC is present at this point so we need to do + * probe() to get the right number of legacy IRQs.
+ */ + return legacy_pic->probe(); } #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 16cb827a5b27..be22f5a2192e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -295,16 +295,11 @@ static void unmask_8259A(void) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -static void init_8259A(int auto_eoi) +static int probe_8259A(void) { unsigned long flags; unsigned char probe_val = ~(1 << PIC_CASCADE_IR); unsigned char new_val; - - i8259A_auto_eoi = auto_eoi; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - /* * Check to see if we have a PIC. * Mask all except the cascade and read @@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi) * have a PIC, we will read 0xff as opposed to the * value we wrote. */ + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ outb(probe_val, PIC_MASTER_IMR); new_val = inb(PIC_MASTER_IMR); if (new_val != probe_val) { printk(KERN_INFO "Using NULL legacy PIC\n"); legacy_pic = &null_legacy_pic; - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return; } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return nr_legacy_irqs(); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* @@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; } +static int legacy_pic_probe(void) +{ + return 0; +} struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, @@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = { .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, + .probe = legacy_pic_probe, .irq_pending = legacy_pic_irq_pending_noop, .make_irq = legacy_pic_uint_noop, }; @@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = { .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, + .probe = probe_8259A, .irq_pending = i8259A_irq_pending, .make_irq = make_8259A_irq, }; -- cgit v1.2.3 From 3849e91f571dcb48cf2c8143480c59137d44d6bc Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 4 Nov 2015 12:49:42 +0100 Subject: x86/AMD: Fix last level cache topology for AMD Fam17h systems On AMD Fam17h systems, the last level cache is not resident in the northbridge. Therefore, we cannot assign cpu_llc_id to the same value as Node ID as we have been doing until now. We should rather look at the ApicID bits of the core to provide us the last level cache ID info. Signed-off-by: Aravind Gopalakrishnan Cc: Andrew Morton Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: "H. 
Peter Anvin" Cc: Huang Rui Cc: Ingo Molnar Cc: Jacob Shin Link: http://lkml.kernel.org/r/1446582899-9378-1-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/amd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 4a70fc6d400a..a8816b325162 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -352,6 +352,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); + unsigned int socket_id, core_complex_id; bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ @@ -361,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); + + /* + * Fix percpu cpu_llc_id here as LLC topology is different + * for Fam17h systems. + */ + if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) + return; + + socket_id = (c->apicid >> bits) - 1; + core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3; + + per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id; #endif } -- cgit v1.2.3 From 68accac392d859d24adcf1be3a90e41f978bd54c Mon Sep 17 00:00:00 2001 From: Krzysztof Mazur Date: Fri, 6 Nov 2015 14:18:36 +0100 Subject: x86/setup: Fix low identity map for >= 2GB kernel range The commit f5f3497cad8c extended the low identity mapping. However, if the kernel uses more than 2 GB (VMSPLIT_2G_OPT or VMSPLIT_1G memory split), the normal memory mapping is overwritten by the low identity mapping, causing a crash. To avoid overwriting, limit the low identity map to cover only memory before the kernel range (PAGE_OFFSET). Fixes: f5f3497cad8c "x86/setup: Extend low identity map to cover whole kernel range" Signed-off-by: Krzysztof Mazur Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Laszlo Ersek Cc: Matt Fleming Cc: Paolo Bonzini Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1446815916-22105-1-git-send-email-krzysiek@podlesie.net Signed-off-by: Thomas Gleixner --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a1e4da98c8f0..29db25f9a745 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1188,7 +1188,7 @@ void __init setup_arch(char **cmdline_p) */ clone_pgd_range(initial_page_table, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); #endif tboot_probe(); -- cgit v1.2.3 From 04633df0c43d710e5f696b06539c100898678235 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 5 Nov 2015 16:57:56 +0100 Subject: x86/cpu: Call verify_cpu() after having entered long mode too When we get loaded by a 64-bit bootloader, the kernel entry point is startup_64 in head_64.S. We don't trust any and all bootloaders because some will fiddle with CPU configuration so we go ahead and massage each CPU into sanity again. For example, some Dell BIOSes have this XD disable feature which sets IA32_MISC_ENABLE[34] and disables NX. This might be some dumb workaround for other OSes but Linux sure doesn't need it. A similar thing is present in the Surface 3 firmware - see https://bugzilla.kernel.org/show_bug.cgi?id=106051 - which sets this bit only on the BSP: # rdmsr -a 0x1a0 400850089 850089 850089 850089 I know, right?!
There's not even an off switch in there. So fix all those cases by sanitizing the 64-bit entry point too. For that, make verify_cpu() callable in 64-bit mode also. Requested-and-debugged-by: "H. Peter Anvin" Reported-and-tested-by: Bastien Nocera Signed-off-by: Borislav Petkov Cc: Matt Fleming Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1446739076-21303-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head_64.S | 8 ++++++++ arch/x86/kernel/verify_cpu.S | 12 +++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d40ca8a73f2..ffdc0e860390 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,9 @@ startup_64: * tables and then reload them. */ + /* Sanitize CPU configuration */ + call verify_cpu + /* * Compute the delta between the address I am compiled to run at and the * address I am actually running at. @@ -174,6 +177,9 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + /* Sanitize CPU configuration */ + call verify_cpu + movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: @@ -288,6 +294,8 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#include "verify_cpu.S" + #ifdef CONFIG_HOTPLUG_CPU /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index b9242bacbe59..4cf401f581e7 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -34,10 +34,11 @@ #include verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf +#ifndef __x86_64__ pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx @@ -48,6 +49,7 @@ verify_cpu: popl %eax cmpl %eax,%ebx jz verify_cpu_no_longmode # cpu has no cpuid +#endif movl $0x0,%eax # See if cpuid 1 is implemented cpuid @@ -130,10 +132,10 @@ verify_cpu_sse_test: jmp verify_cpu_sse_test # try again verify_cpu_no_longmode: - popfl # Restore caller passed flags + popf # Restore caller passed flags movl $1,%eax ret verify_cpu_sse_ok: - popfl # Restore caller passed flags + popf # Restore caller passed flags xorl %eax, %eax ret -- cgit v1.2.3 From ab6b52947545a5355154f64f449f97af9d05845f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 10 Nov 2015 16:23:54 -0800 Subject: x86/fpu: Fix 32-bit signal frame handling (This should have gone to LKML originally. Sorry for the extra noise, folks on the cc.) Background: Signal frames on x86 have two formats: 1. For 32-bit executables (whether on a real 32-bit kernel or under 32-bit emulation on a 64-bit kernel) we have a 'fpregset_t' that includes the "FSAVE" registers. 2. For 64-bit executables (on 64-bit kernels obviously), the 'fpregset_t' is smaller and does not contain the "FSAVE" state. When creating the signal frame, we have to be aware of whether we are running a 32 or 64-bit executable so we create the correct format signal frame. Problem: save_xstate_epilog() uses 'fx_sw_reserved_ia32' whenever it is called for a 32-bit executable. This is for real 32-bit and ia32 emulation. But, fpu__init_prepare_fx_sw_frame() only initializes 'fx_sw_reserved_ia32' when emulation is enabled, *NOT* for real 32-bit kernels. 
This leads to really weird situations where 32-bit programs lose their extended state when returning from a signal handler. The kernel copies the uninitialized (zero) 'fx_sw_reserved_ia32' out to userspace in save_xstate_epilog(). But when returning from the signal, the kernel errors out in check_for_xstate() when it does not see FP_XSTATE_MAGIC1 present (because it was zeroed). This leads to the FPU/XSAVE state being initialized. For MPX, this leads to the most permissive state and means we silently lose bounds violations. I think this would also mean that we could lose *ANY* FPU/SSE/AVX state. I'm not sure why no one has spotted this bug. I believe this was broken by: 72a671ced66d ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") way back in 2012. Signed-off-by: Dave Hansen Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151111002354.A0799571@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ef29b742cea7..31c6a60505e6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int fsave_header_size = sizeof(struct fregs_state); int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION)) { + if (config_enabled(CONFIG_IA32_EMULATION) || + config_enabled(CONFIG_X86_32)) { + int fsave_header_size = sizeof(struct fregs_state); + fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; + fx_sw_reserved_ia32.extended_size = size + fsave_header_size; } } -- cgit v1.2.3 From a05917b6ba9dc9a95fc42bdcbe3a875e8ad83935 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Fri, 6 Nov 2015 17:00:23 +0800 Subject: x86/fpu: Fix get_xsave_addr() behavior under virtualization KVM uses the get_xsave_addr() function in a different fashion from the native kernel, in that the 'xsave' parameter belongs to guest vcpu, not the currently running task. But 'xsave' is replaced with current task's (host) xsave structure, so get_xsave_addr() will incorrectly return the bad xsave address to KVM. Fix it so that the passed in 'xsave' address is used - as intended originally. Signed-off-by: Huaitong Han Reviewed-by: Dave Hansen Cc: Cc: Andy Lutomirski Cc: Paolo Bonzini Cc: Borislav Petkov Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/1446800423-21622-1-git-send-email-huaitong.han@intel.com [ Tidied up the changelog.
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6454f2731b56..70fc312221fc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -694,7 +694,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; - xsave = &current->thread.fpu.state.xsave; /* * We should not ever be requesting features that we * have not enabled. Remember that pcntxt_mask is -- cgit v1.2.3 From 41ac18ebfc429ce3f4d369ef07447d652999a0cd Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 4 Nov 2015 17:43:53 +0800 Subject: perf/x86/intel/rapl: Remove the unused RAPL_EVENT_DESC() macro Signed-off-by: Huang Rui Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dasaratharaman Chandramouli Cc: Fengguang Wu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Li Link: http://lkml.kernel.org/r/1446630233-3166-1-git-send-email-ray.huang@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 81431c0f0614..ed446bdcbf31 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -107,12 +107,6 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __rapl_##_var##_show, NULL) -#define RAPL_EVENT_DESC(_name, _config) \ -{ \ - .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ - .config = _config, \ -} - #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ #define RAPL_EVENT_ATTR_STR(_name, v, str) \ -- cgit v1.2.3 From 112677d683d31ebd6a8e8b02e0620ae512354b2d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 17 Nov 2015 09:43:24 +0900 Subject: x86/ftrace: Add comment on static function tracing There was confusion between update_ftrace_function() and the static function tracing trampoline regarding the 3rd parameter (ftrace_ops). Add a comment for clarification. Suggested-by: Steven Rostedt Signed-off-by: Namhyung Kim Cc: H. Peter Anvin Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1447721004-2551-1-git-send-email-namhyung@kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mcount_64.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 94ea120fa21f..87e1762e2bca 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -278,6 +278,12 @@ trace: /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. + */ call *ftrace_trace_function restore_mcount_regs -- cgit v1.2.3 From 581b7f158fe0383b492acd1ce3fb4e99d4e57808 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Wed, 3 Jun 2015 10:31:14 +0100 Subject: x86/cpu: Fix SMAP check in PVOPS environments There appears to be no formal statement of what pv_irq_ops.save_fl() is supposed to return precisely.
Native returns the full flags, while lguest and Xen only return the Interrupt Flag, and both have comments by the implementations stating that only the Interrupt Flag is looked at. This may have been true when initially implemented, but no longer is. To make matters worse, the Xen PVOP leaves the upper bits undefined, making the BUG_ON() undefined behaviour. Experimentally, this now trips for 32bit PV guests on Broadwell hardware. The BUG_ON() is consistent for an individual build, but not consistent for all builds. It has also been a sitting timebomb since SMAP support was introduced. Use native_save_fl() instead, which will obtain an accurate view of the AC flag. Signed-off-by: Andrew Cooper Reviewed-by: David Vrabel Tested-by: Rusty Russell Cc: Rusty Russell Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: Cc: Xen-devel CC: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1433323874-6927-1-git-send-email-andrew.cooper3@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4ddd780aeac9..c2b7522cbf35 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -273,10 +273,9 @@ __setup("nosmap", setup_disable_smap); static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - unsigned long eflags; + unsigned long eflags = native_save_fl(); /* This should have been cleared long ago */ - raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); if (cpu_has(c, X86_FEATURE_SMAP)) { -- cgit v1.2.3 From 614e4c4ebc75517295bccd29b20ddbc5b52af6fc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 12 Nov 2015 11:00:04 +0100 Subject: perf/core: Robustify the perf_cgroup_from_task() RCU checks This patch reinforces the lockdep checks performed by perf_cgroup_from_task() by passing the perf_event_context whenever possible. It is okay to not hold the RCU read lock when we know we hold the ctx->lock. This patch makes sure this property holds. In some functions, such as perf_cgroup_sched_in(), we do not pass the context because we are sure we are holding the RCU read lock. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: edumazet@google.com Link: http://lkml.kernel.org/r/1447322404-10920-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 377e8f8ed391..a316ca96f1b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } -- cgit v1.2.3 From b28ae9560b693bcd2e9f4d6d9c415d5380b7c3c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Oct 2015 11:46:33 -0700 Subject: perf/x86: Fix LBR call stack save/restore This fixes a bug I added in the following commit: 90405aa02247 ("perf/x86/intel/lbr: Limit LBR accesses to TOS in callstack mode") The bug could lead to lost LBR call stacks. When restoring the LBR state we need to use the TOS of the previous context, not the current context. To do that we need to save/restore the TOS. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1445366797-30894-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 499f533dd3cc..ffa7a92025e1 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -627,6 +627,7 @@ struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; u64 lbr_info[MAX_LBR_ENTRIES]; + int tos; int lbr_callstack_users; int lbr_stack_state; }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index bfd0b717e944..659f01e165d5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -239,7 +239,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) } mask = x86_pmu.lbr_nr - 1; - tos = intel_pmu_lbr_tos(); + tos = task_ctx->tos; for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); @@ -247,6 +247,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; } @@ -270,6 +271,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + task_ctx->tos = tos; task_ctx->lbr_stack_state = LBR_VALID; } -- cgit v1.2.3 From 90eec103b96e30401c0b846045bf8a1c7159b6da Mon Sep 17 00:00:00 2001 From: 
Peter Zijlstra Date: Mon, 16 Nov 2015 11:08:45 +0100 Subject: treewide: Remove old email address There were still a number of references to my old Red Hat email address in the kernel source. Remove these while keeping the Red Hat copyright notices intact. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.h | 2 +- arch/x86/kernel/irq_work.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4562cf070c27..2bf79d7c97df 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ffa7a92025e1..ab18b8a91583 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index dc5fa6a1e8d6..3512ba607361 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -1,7 +1,7 @@ /* * x86 specific code for irq_work * - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include -- cgit v1.2.3 From 2d5be37d686c4dae8e60d20283d6f44ac2c44f65 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Nov 2015 12:24:00 +0100 Subject: x86/microcode: Initialize the driver late when facilities are up Running microcode_init() from setup_arch() is a bad idea because not even kmalloc() is ready at that point and the loader does all kinds of allocations and init/registration with various subsystems. Make it a late initcall when required facilities are initialized so that the microcode driver initialization can succeed too. Reported-and-tested-by: Markus Trippelsdorf Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151120112400.GC4028@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 1 + arch/x86/kernel/setup.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7fc27f1cca58..b3e94ef461fd 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -698,3 +698,4 @@ int __init microcode_init(void) return error; } +late_initcall(microcode_init); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 29db25f9a745..d2bbe343fda7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1250,8 +1250,6 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif - - microcode_init(); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 656279a1f3b210cf48ccc572fd7c6b8e2250be77 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 22 Nov 2015 18:16:15 -0500 Subject: x86 smpboot: Re-enable init_udelay=0 by default on modern CPUs commit f1ccd249319e allowed the cmdline "cpu_init_udelay=" to work with all values, including the default of 10000. But in setting the default of 10000, it overrode the code that sets the delay to 0 on modern processors. Also, tidy up use of INT/UINT. Fixes: f1ccd249319e "x86/smpboot: Fix cpu_init_udelay=10000 corner case boot parameter misbehavior" Reported-by: Shane Signed-off-by: Len Brown Cc: dparsons@brightdsl.net Cc: stable@kernel.org Link: http://lkml.kernel.org/r/9082eb809ef40dad02db714759c7aaf618c518d4.1448232494.git.len.brown@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 892ee2e5ecbc..fbabe4fcc7fb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = INT_MAX; +static unsigned int init_udelay = UINT_MAX; static int __init cpu_init_udelay(char *str) { @@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != INT_MAX) + if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { init_udelay = 0; - + return; + } /* else, use legacy delay */ init_udelay = UDELAY_10MS_DEFAULT; } -- cgit v1.2.3 From bc0d0d093b379b0b379c429e3348498287c8a9ca Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 30 Nov 2015 09:10:33 -0800 Subject: libnvdimm, e820: skip module loading when no type-12 If there are no persistent memory ranges present then don't bother creating the platform device. Otherwise, it loads the full libnvdimm sub-system only to discover no resources present.
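The idiom in this patch — walk the resource list with a callback that returns nonzero on the first match, and treat a zero result as "nothing present" — can be sketched outside the kernel with a hypothetical resource table:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct resource { const char *name; uint64_t start, end; };

/* Walk all resources with a matching name; stop early if cb returns nonzero. */
static int walk_res(const struct resource *res, int n, const char *name,
		    int (*cb)(uint64_t start, uint64_t end, void *data),
		    void *data)
{
	int rc = 0;
	for (int i = 0; i < n && !rc; i++)
		if (strcmp(res[i].name, name) == 0)
			rc = cb(res[i].start, res[i].end, data);
	return rc;
}

/* Any match at all is enough: report it and stop walking. */
static int found(uint64_t start, uint64_t end, void *data)
{
	return 1;
}

int main(void)
{
	const struct resource iomem[] = {
		{ "System RAM", 0x0, 0x9ffff },
		{ "Persistent Memory (legacy)", 0x100000000ull, 0x1ffffffffull },
	};

	if (walk_res(iomem, 2, "Persistent Memory (legacy)", found, NULL) <= 0)
		return 0;	/* nothing present: skip device registration */

	printf("registering e820_pmem platform device\n");
	return 0;
}
```

The cheap presence check runs before any registration work, which is the whole point of the patch: avoid loading a subsystem just to learn there is nothing for it to drive.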
Reported-by: Andy Lutomirski Acked-by: Andy Lutomirski Signed-off-by: Dan Williams --- arch/x86/kernel/pmem.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 4f00b63d7ff3..14415aff1813 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -4,10 +4,22 @@ */ #include #include +#include + +static int found(u64 start, u64 end, void *data) +{ + return 1; +} static __init int register_e820_pmem(void) { + char *pmem = "Persistent Memory (legacy)"; struct platform_device *pdev; + int rc; + + rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + if (rc <= 0) + return 0; /* * See drivers/nvdimm/e820.c for the implementation, this is -- cgit v1.2.3 From 22eab1108781eff09961ae7001704f7bd8fb1dce Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 1 Dec 2015 00:54:36 +0300 Subject: x86/signal: Fix restart_syscall number for x32 tasks When restarting a syscall with regs->ax == -ERESTART_RESTARTBLOCK, regs->ax is assigned to a restart_syscall number. For x32 tasks, this syscall number must have __X32_SYSCALL_BIT set, otherwise it will be an x86_64 syscall number instead of a valid x32 syscall number. This issue has been there since the introduction of x32. Reported-by: strace/tests/restart_syscall.test Reported-and-tested-by: Elvira Khabirova Signed-off-by: Dmitry V. Levin Cc: Elvira Khabirova Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151130215436.GA25996@altlinux.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/signal.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b7ffb7c00075..cb6282c3638f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -690,12 +690,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) signal_setup_done(failed, ksig, stepping); } -#ifdef CONFIG_X86_32 -#define NR_restart_syscall __NR_restart_syscall -#else /* !CONFIG_X86_32 */ -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -#endif /* CONFIG_X86_32 */ +static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) + return __NR_restart_syscall; +#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ + return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : + __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +} /* * Note that 'init' is a special process: it doesn't get signals it doesn't @@ -724,7 +727,7 @@ void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; + regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } -- cgit v1.2.3 From e0fbac1cd49c5ca6cb6a0aa918d0943ce95d906c Mon Sep 17 00:00:00 2001 From: Yuanfang Chen Date: Tue, 24 Nov 2015 12:05:01 -0500 Subject: perf/x86/intel: Make L1D_PEND_MISS.FB_FULL not constrained on Haswell There was a mistake in the Haswell constraints table. 
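The distinction between the two constraint macros is which config bits they compare: INTEL_EVENT_CONSTRAINT matches on the event-select byte alone (so 0x48 constrained every L1D_PEND_MISS umask, including FB_FULL), while INTEL_UEVENT_CONSTRAINT matches event plus umask (so 0x148 constrains only the PENDING umask). A simplified sketch of that matching, with illustrative mask values rather than the perf source:

```c
#include <stdint.h>
#include <stdio.h>

#define EVENT_MASK  0x00ffull	/* event select: bits 0-7  (illustrative) */
#define UEVENT_MASK 0xffffull	/* event + umask: bits 0-15 (illustrative) */

struct constraint { uint64_t code, cmask; };

static int matches(const struct constraint *c, uint64_t config)
{
	return (config & c->cmask) == c->code;
}

int main(void)
{
	/* Old entry: constrains any L1D_PEND_MISS.* umask on event 0x48. */
	struct constraint ev  = { 0x48,  EVENT_MASK };
	/* Fixed entry: constrains only L1D_PEND_MISS.PENDING (umask 0x01). */
	struct constraint uev = { 0x148, UEVENT_MASK };

	uint64_t pending = 0x0148, fb_full = 0x0248;	/* umask 0x01 vs 0x02 */

	printf("%d %d\n", matches(&ev, pending), matches(&ev, fb_full));	/* 1 1 */
	printf("%d %d\n", matches(&uev, pending), matches(&uev, fb_full));	/* 1 0 */
	return 0;
}
```

With the fix, FB_FULL no longer falls under the counter-0x4-only constraint, matching the commit's intent.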
Signed-off-by: Yuanfang Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1448384701-9110-1-git-send-email-cheny@udel.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f63360be2238..e2a430021e46 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -232,7 +232,7 @@ static struct event_constraint intel_hsw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ -- cgit v1.2.3 From 169b932a15318e8e9f2f0f12eeb55dda09c8737f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 9 Nov 2015 10:24:31 +0100 Subject: perf/x86/intel: Fix INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA macro We need to add the rest of the flags to the constraint mask instead of another INTEL_ARCH_EVENT_MASK, fixing a typo. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1447061071-28085-1-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ab18b8a91583..d0e35ebb2adb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -387,7 +387,7 @@ struct cpu_hw_events { /* Check flags and event code/umask, and set the HSW N/A flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) -- cgit v1.2.3
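The typo fixed here is the classic duplicated-operand mistake: OR-ing INTEL_ARCH_EVENT_MASK with itself leaves the flag bits out of the constraint's compare mask, so events differing only in flags were treated as equal. A toy demonstration with illustrative bit layouts (the real mask values live in the perf headers):

```c
#include <stdint.h>
#include <stdio.h>

#define INTEL_ARCH_EVENT_MASK 0x0000ffffull	/* event+umask bits (illustrative) */
#define X86_ALL_EVENT_FLAGS   0x00ff0000ull	/* flag bits (illustrative) */

int main(void)
{
	uint64_t typo  = INTEL_ARCH_EVENT_MASK | INTEL_ARCH_EVENT_MASK;
	uint64_t fixed = INTEL_ARCH_EVENT_MASK | X86_ALL_EVENT_FLAGS;

	uint64_t config_a = 0x000001cdull;	/* same event, no flag bit   */
	uint64_t config_b = 0x001001cdull;	/* same event, one flag bit  */

	/* With the typo, the two configs compare equal under the mask: */
	printf("%d\n", (config_a & typo)  == (config_b & typo));	/* 1 */
	/* With the fix, the differing flag bit takes part in the comparison: */
	printf("%d\n", (config_a & fixed) == (config_b & fixed));	/* 0 */
	return 0;
}
```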