From 749d088b8e7f4b9826ede02b9a043e417fa84aa1 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 27 May 2016 14:17:10 +0800 Subject: pvclock: Add CPU barriers to get correct version value Protocol for the "version" fields is: hypervisor raises it (making it uneven) before it starts updating the fields and raises it again (making it even) when it is done. Thus the guest can make sure the time values it got are consistent by checking the version before and after reading them. Add CPU barries after getting version value just like what function vread_pvclock does, because all of callees in this function is inline. Fixes: 502dfeff239e8313bfbe906ca0a1a6827ac8481b Cc: stable@vger.kernel.org Signed-off-by: Minfei Huang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pvclock.h | 2 ++ arch/x86/kernel/pvclock.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index fdcc04020636..538ae944855e 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -85,6 +85,8 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u8 ret_flags; version = src->version; + /* Make the latest version visible */ + smp_rmb(); offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 99bfc025111d..7f82fe0a6807 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -66,6 +66,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) do { version = __pvclock_read_cycles(src, &ret, &flags); + /* Make sure that the version double-check is last. */ + smp_rmb(); } while ((src->version & 1) || version != src->version); return flags & valid_flags; @@ -80,6 +82,8 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) do { version = __pvclock_read_cycles(src, &ret, &flags); + /* Make sure that the version double-check is last. */ + smp_rmb(); } while ((src->version & 1) || version != src->version); if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { -- cgit v1.2.3 From f7550d076d55c1b5f02f95012e3a5a84d264c47d Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 27 May 2016 14:17:11 +0800 Subject: pvclock: Cleanup to remove function pvclock_get_nsec_offset Function __pvclock_read_cycles is short enough, so there is no need to have another function pvclock_get_nsec_offset to calculate tsc delta. It's better to combine it into function __pvclock_read_cycles. Remove useless variables in function __pvclock_read_cycles. Signed-off-by: Minfei Huang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pvclock.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 538ae944855e..7c1c89598688 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -68,32 +68,23 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) return product; } -static __always_inline -u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) -{ - u64 delta = rdtsc_ordered() - src->tsc_timestamp; - return pvclock_scale_delta(delta, src->tsc_to_system_mul, - src->tsc_shift); -} - static __always_inline unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, cycle_t *cycles, u8 *flags) { unsigned version; - cycle_t ret, offset; - u8 ret_flags; + cycle_t offset; + u64 delta; version = src->version; /* Make the latest version visible */ smp_rmb(); - offset = pvclock_get_nsec_offset(src); - ret = src->system_time + offset; - ret_flags = src->flags; - - *cycles = ret; - *flags = ret_flags; + delta = rdtsc_ordered() - src->tsc_timestamp; + offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, + src->tsc_shift); + *cycles = src->system_time + offset; + *flags = src->flags; return version; } -- cgit v1.2.3 From ed911b43adb889c37a37fa57a995f0b460c633b6 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Sat, 28 May 2016 20:27:43 +0800 Subject: pvclock: Get rid of __pvclock_read_cycles in function pvclock_read_flags There is a generic function __pvclock_read_cycles to be used to get both flags and cycles. For function pvclock_read_flags, it's useless to get cycles value. To make this function be more effective, get this variable flags directly in function. Signed-off-by: Minfei Huang Signed-off-by: Paolo Bonzini --- arch/x86/kernel/pvclock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 7f82fe0a6807..06c58ce46762 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -61,11 +61,14 @@ void pvclock_resume(void) u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) { unsigned version; - cycle_t ret; u8 flags; do { - version = __pvclock_read_cycles(src, &ret, &flags); + version = src->version; + /* Make the latest version visible */ + smp_rmb(); + + flags = src->flags; /* Make sure that the version double-check is last. */ smp_rmb(); } while ((src->version & 1) || version != src->version); -- cgit v1.2.3 From 8d93c874ac899bfdf0ad3787baef684a0c878c2c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 20 Jun 2016 22:28:02 -0300 Subject: KVM: x86: move nsec_to_cycles from x86.c to x86.h Move the inline function nsec_to_cycles from x86.c to x86.h, as the next patch uses it from lapic.c. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ------ arch/x86/kvm/x86.h | 7 +++++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 902d9da12392..7da5dd2057a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1244,12 +1244,6 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); static unsigned long max_tsc_khz; -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) -{ - return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); -} - static u32 adjust_tsc_khz(u32 khz, s32 ppm) { u64 v = (u64)khz * (1000000 + ppm); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7ce3634ab5fe..a82ca466b62e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include +#include #include "kvm_cache_regs.h" #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL @@ -195,6 +196,12 @@ extern unsigned int lapic_timer_advance_ns; extern struct static_key kvm_no_apic_vcpu; +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +{ + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); +} + /* Same "calling convention" as do_div: * - divide (n << 32) by base * - put result in n -- cgit v1.2.3 From b606f189c7d5bf9b875bba168162fe05287880fe Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 20 Jun 2016 22:33:48 -0300 Subject: KVM: LAPIC: cap __delay at lapic_timer_advance_ns The host timer which emulates the guest LAPIC TSC deadline timer has its expiration diminished by lapic_timer_advance_ns nanoseconds. Therefore if, at wait_lapic_expire, a difference larger than lapic_timer_advance_ns is encountered, delay at most lapic_timer_advance_ns. This fixes a problem where the guest can cause the host to delay for large amounts of time. Reported-by: Alan Jenkins Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bbb5b283ff63..a397200281c1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1310,7 +1310,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ if (guest_tsc < tsc_deadline) - __delay(tsc_deadline - guest_tsc); + __delay(min(tsc_deadline - guest_tsc, + nsec_to_cycles(vcpu, lapic_timer_advance_ns))); } static void start_apic_timer(struct kvm_lapic *apic) -- cgit v1.2.3 From ff30ef40deca4658e27b0c596e7baf39115e858f Mon Sep 17 00:00:00 2001 From: Quentin Casasnovas Date: Sat, 18 Jun 2016 11:01:05 +0200 Subject: KVM: nVMX: VMX instructions: fix segment checks when L1 is in long mode. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I couldn't get Xen to boot a L2 HVM when it was nested under KVM - it was getting a GP(0) on a rather unspecial vmread from Xen: (XEN) ----[ Xen-4.7.0-rc x86_64 debug=n Not tainted ]---- (XEN) CPU: 1 (XEN) RIP: e008:[] vmx_get_segment_register+0x14e/0x450 (XEN) RFLAGS: 0000000000010202 CONTEXT: hypervisor (d1v0) (XEN) rax: ffff82d0801e6288 rbx: ffff83003ffbfb7c rcx: fffffffffffab928 (XEN) rdx: 0000000000000000 rsi: 0000000000000000 rdi: ffff83000bdd0000 (XEN) rbp: ffff83000bdd0000 rsp: ffff83003ffbfab0 r8: ffff830038813910 (XEN) r9: ffff83003faf3958 r10: 0000000a3b9f7640 r11: ffff83003f82d418 (XEN) r12: 0000000000000000 r13: ffff83003ffbffff r14: 0000000000004802 (XEN) r15: 0000000000000008 cr0: 0000000080050033 cr4: 00000000001526e0 (XEN) cr3: 000000003fc79000 cr2: 0000000000000000 (XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: e008 (XEN) Xen code around (vmx_get_segment_register+0x14e/0x450): (XEN) 00 00 41 be 02 48 00 00 <44> 0f 78 74 24 08 0f 86 38 56 00 00 b8 08 68 00 (XEN) Xen stack trace from rsp=ffff83003ffbfab0: ... (XEN) Xen call trace: (XEN) [] vmx_get_segment_register+0x14e/0x450 (XEN) [] get_page_from_gfn_p2m+0x165/0x300 (XEN) [] hvmemul_get_seg_reg+0x52/0x60 (XEN) [] hvm_emulate_prepare+0x53/0x70 (XEN) [] handle_mmio+0x2b/0xd0 (XEN) [] emulate.c#_hvm_emulate_one+0x111/0x2c0 (XEN) [] handle_hvm_io_completion+0x274/0x2a0 (XEN) [] __get_gfn_type_access+0xfa/0x270 (XEN) [] timer.c#add_entry+0x4b/0xb0 (XEN) [] timer.c#remove_entry+0x7c/0x90 (XEN) [] hvm_do_resume+0x23/0x140 (XEN) [] vmx_do_resume+0xa7/0x140 (XEN) [] context_switch+0x13b/0xe40 (XEN) [] schedule.c#schedule+0x22e/0x570 (XEN) [] softirq.c#__do_softirq+0x5c/0x90 (XEN) [] domain.c#idle_loop+0x25/0x50 (XEN) (XEN) (XEN) **************************************** (XEN) Panic on CPU 1: (XEN) GENERAL PROTECTION FAULT (XEN) [error_code=0000] (XEN) **************************************** Tracing my host KVM showed it was the one injecting the GP(0) when emulating the VMREAD and checking the destination segment permissions in get_vmx_mem_address(): 3) | vmx_handle_exit() { 3) | handle_vmread() { 3) | nested_vmx_check_permission() { 3) | vmx_get_segment() { 3) 0.074 us | vmx_read_guest_seg_base(); 3) 0.065 us | vmx_read_guest_seg_selector(); 3) 0.066 us | vmx_read_guest_seg_ar(); 3) 1.636 us | } 3) 0.058 us | vmx_get_rflags(); 3) 0.062 us | vmx_read_guest_seg_ar(); 3) 3.469 us | } 3) | vmx_get_cs_db_l_bits() { 3) 0.058 us | vmx_read_guest_seg_ar(); 3) 0.662 us | } 3) | get_vmx_mem_address() { 3) 0.068 us | vmx_cache_reg(); 3) | vmx_get_segment() { 3) 0.074 us | vmx_read_guest_seg_base(); 3) 0.068 us | vmx_read_guest_seg_selector(); 3) 0.071 us | vmx_read_guest_seg_ar(); 3) 1.756 us | } 3) | kvm_queue_exception_e() { 3) 0.066 us | kvm_multiple_exception(); 3) 0.684 us | } 3) 4.085 us | } 3) 9.833 us | } 3) + 10.366 us | } Cross-checking the KVM/VMX VMREAD emulation code with the Intel Software Developper Manual Volume 3C - "VMREAD - Read Field from Virtual-Machine Control Structure", I found that we're enforcing that the destination operand is NOT located in a read-only data segment or any code segment when the L1 is in long mode - BUT that check should only happen when it is in protected mode. Shuffling the code a bit to make our emulation follow the specification allows me to boot a Xen dom0 in a nested KVM and start HVM L2 guests without problems. Fixes: f9eb4af67c9d ("KVM: nVMX: VMX instructions: add checks for #GP/#SS exceptions") Signed-off-by: Quentin Casasnovas Cc: Eugene Korenevsky Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: linux-stable Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 003618e324ce..64a79f271276 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6671,7 +6671,13 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Checks for #GP/#SS exceptions. */ exn = false; - if (is_protmode(vcpu)) { + if (is_long_mode(vcpu)) { + /* Long mode: #GP(0)/#SS(0) if the memory address is in a + * non-canonical form. This is the only check on the memory + * destination for long mode! + */ + exn = is_noncanonical_address(*ret); + } else if (is_protmode(vcpu)) { /* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) @@ -6688,17 +6694,10 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, * execute-only code segment */ exn = ((s.type & 0xa) == 8); - } - if (exn) { - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - if (is_long_mode(vcpu)) { - /* Long mode: #GP(0)/#SS(0) if the memory address is in a - * non-canonical form. This is an only check for long mode. - */ - exn = is_noncanonical_address(*ret); - } else if (is_protmode(vcpu)) { + if (exn) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); -- cgit v1.2.3 From 65c0554b73c920023cc8998802e508b798113b46 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 30 Jun 2016 18:11:41 +0200 Subject: x86/power/64: Fix kernel text mapping corruption during image restoration Logan Gunthorpe reports that hibernation stopped working reliably for him after commit ab76f7b4ab23 (x86/mm: Set NX on gap between __ex_table and rodata). That turns out to be a consequence of a long-standing issue with the 64-bit image restoration code on x86, which is that the temporary page tables set up by it to avoid page tables corruption when the last bits of the image kernel's memory contents are copied into their original page frames re-use the boot kernel's text mapping, but that mapping may very well get corrupted just like any other part of the page tables. Of course, if that happens, the final jump to the image kernel's entry point will go to nowhere. The exact reason why commit ab76f7b4ab23 matters here is that it sometimes causes a PMD of a large page to be split into PTEs that are allocated dynamically and get corrupted during image restoration as described above. To fix that issue note that the code copying the last bits of the image kernel's memory contents to the page frames occupied by them previoulsy doesn't use the kernel text mapping, because it runs from a special page covered by the identity mapping set up for that code from scratch. Hence, the kernel text mapping is only needed before that code starts to run and then it will only be used just for the final jump to the image kernel's entry point. Accordingly, the temporary page tables set up in swsusp_arch_resume() on x86-64 need to contain the kernel text mapping too. That mapping is only going to be used for the final jump to the image kernel, so it only needs to cover the image kernel's entry point, because the first thing the image kernel does after getting control back is to switch over to its own original page tables. Moreover, the virtual address of the image kernel's entry point in that mapping has to be the same as the one mapped by the image kernel's page tables. With that in mind, modify the x86-64's arch_hibernation_header_save() and arch_hibernation_header_restore() routines to pass the physical address of the image kernel's entry point (in addition to its virtual address) to the boot kernel (a small piece of assembly code involved in passing the entry point's virtual address to the image kernel is not necessary any more after that, so drop it). Update RESTORE_MAGIC too to reflect the image header format change. Next, in set_up_temporary_mappings(), use the physical and virtual addresses of the image kernel's entry point passed in the image header to set up a minimum kernel text mapping (using memory pages that won't be overwritten by the image kernel's memory contents) that will map those addresses to each other as appropriate. This makes the concern about the possible corruption of the original boot kernel text mapping go away and if the the minimum kernel text mapping used for the final jump marks the image kernel's entry point memory as executable, the jump to it is guaraneed to succeed. Fixes: ab76f7b4ab23 (x86/mm: Set NX on gap between __ex_table and rodata) Link: http://marc.info/?l=linux-pm&m=146372852823760&w=2 Reported-by: Logan Gunthorpe Reported-and-tested-by: Borislav Petkov Tested-by: Kees Cook Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate_64.c | 97 ++++++++++++++++++++++++++++++++++----- arch/x86/power/hibernate_asm_64.S | 55 ++++++++++------------ 2 files changed, 109 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 009947d419a6..f2b5e6a5cf95 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -19,6 +19,7 @@ #include #include #include +#include /* Defined in hibernate_asm_64.S */ extern asmlinkage __visible int restore_image(void); @@ -28,6 +29,7 @@ extern asmlinkage __visible int restore_image(void); * kernel's text (this value is passed in the image header). */ unsigned long restore_jump_address __visible; +unsigned long jump_address_phys; /* * Value of the cr3 register from before the hibernation (this value is passed @@ -37,7 +39,43 @@ unsigned long restore_cr3 __visible; pgd_t *temp_level4_pgt __visible; -void *relocated_restore_code __visible; +unsigned long relocated_restore_code __visible; + +static int set_up_temporary_text_mapping(void) +{ + pmd_t *pmd; + pud_t *pud; + + /* + * The new mapping only has to cover the page containing the image + * kernel's entry point (jump_address_phys), because the switch over to + * it is carried out by relocated code running from a page allocated + * specifically for this purpose and covered by the identity mapping, so + * the temporary kernel text mapping is only needed for the final jump. + * Moreover, in that mapping the virtual address of the image kernel's + * entry point must be the same as its virtual address in the image + * kernel (restore_jump_address), so the image kernel's + * restore_registers() code doesn't find itself in a different area of + * the virtual address space after switching over to the original page + * tables used by the image kernel. + */ + pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + + set_pmd(pmd + pmd_index(restore_jump_address), + __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); + set_pud(pud + pud_index(restore_jump_address), + __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pgd(temp_level4_pgt + pgd_index(restore_jump_address), + __pgd(__pa(pud) | _KERNPG_TABLE)); + + return 0; +} static void *alloc_pgt_page(void *context) { @@ -59,9 +97,10 @@ static int set_up_temporary_mappings(void) if (!temp_level4_pgt) return -ENOMEM; - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); + /* Prepare a temporary mapping for the kernel text */ + result = set_up_temporary_text_mapping(); + if (result) + return result; /* Set up the direct mapping from scratch */ for (i = 0; i < nr_pfn_mapped; i++) { @@ -78,19 +117,50 @@ static int set_up_temporary_mappings(void) return 0; } +static int relocate_restore_code(void) +{ + pgd_t *pgd; + pud_t *pud; + + relocated_restore_code = get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + + memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); + + /* Make the page containing the relocated code executable */ + pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); + pud = pud_offset(pgd, relocated_restore_code); + if (pud_large(*pud)) { + set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); + } else { + pmd_t *pmd = pmd_offset(pud, relocated_restore_code); + + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + } else { + pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); + } + } + __flush_tlb_all(); + + return 0; +} + int swsusp_arch_resume(void) { int error; /* We have got enough memory and from now on we cannot recover */ - if ((error = set_up_temporary_mappings())) + error = set_up_temporary_mappings(); + if (error) return error; - relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - memcpy(relocated_restore_code, &core_restore_code, - &restore_registers - &core_restore_code); + error = relocate_restore_code(); + if (error) + return error; restore_image(); return 0; @@ -109,11 +179,12 @@ int pfn_is_nosave(unsigned long pfn) struct restore_data_record { unsigned long jump_address; + unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; }; -#define RESTORE_MAGIC 0x0123456789ABCDEFUL +#define RESTORE_MAGIC 0x123456789ABCDEF0UL /** * arch_hibernation_header_save - populate the architecture specific part @@ -126,7 +197,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; - rdr->jump_address = restore_jump_address; + rdr->jump_address = (unsigned long)&restore_registers; + rdr->jump_address_phys = __pa_symbol(&restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; return 0; @@ -142,6 +214,7 @@ int arch_hibernation_header_restore(void *addr) struct restore_data_record *rdr = addr; restore_jump_address = rdr->jump_address; + jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; } diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 4400a43b9e28..3177c2bc26f6 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -44,9 +44,6 @@ ENTRY(swsusp_arch_suspend) pushfq popq pt_regs_flags(%rax) - /* save the address of restore_registers */ - movq $restore_registers, %rax - movq %rax, restore_jump_address(%rip) /* save cr3 */ movq %cr3, %rax movq %rax, restore_cr3(%rip) @@ -57,31 +54,34 @@ ENTRY(swsusp_arch_suspend) ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) - /* switch to temporary page tables */ - movq $__PAGE_OFFSET, %rdx - movq temp_level4_pgt(%rip), %rax - subq %rdx, %rax - movq %rax, %cr3 - /* Flush TLB */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(X86_CR4_PGE), %rdx - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3; - movq %rax, %cr4; # turn PGE back on - /* prepare to jump to the image kernel */ - movq restore_jump_address(%rip), %rax - movq restore_cr3(%rip), %rbx + movq restore_jump_address(%rip), %r8 + movq restore_cr3(%rip), %r9 + + /* prepare to switch to temporary page tables */ + movq temp_level4_pgt(%rip), %rax + movq mmu_cr4_features(%rip), %rbx /* prepare to copy image data to their original locations */ movq restore_pblist(%rip), %rdx + + /* jump to relocated restore code */ movq relocated_restore_code(%rip), %rcx jmpq *%rcx /* code below has been relocated to a safe page */ ENTRY(core_restore_code) + /* switch to temporary page tables */ + movq $__PAGE_OFFSET, %rcx + subq %rcx, %rax + movq %rax, %cr3 + /* flush TLB */ + movq %rbx, %rcx + andq $~(X86_CR4_PGE), %rcx + movq %rcx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3; + movq %rbx, %cr4; # turn PGE back on .Lloop: testq %rdx, %rdx jz .Ldone @@ -96,24 +96,17 @@ ENTRY(core_restore_code) /* progress to the next pbe */ movq pbe_next(%rdx), %rdx jmp .Lloop + .Ldone: /* jump to the restore_registers address from the image header */ - jmpq *%rax - /* - * NOTE: This assumes that the boot kernel's text mapping covers the - * image kernel's page containing restore_registers and the address of - * this page is the same as in the image kernel's text mapping (it - * should always be true, because the text mapping is linear, starting - * from 0, and is supposed to cover the entire kernel text for every - * kernel). - * - * code below belongs to the image kernel - */ + jmpq *%r8 + /* code below belongs to the image kernel */ + .align PAGE_SIZE ENTRY(restore_registers) FRAME_BEGIN /* go back to the original page tables */ - movq %rbx, %cr3 + movq %r9, %cr3 /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax -- cgit v1.2.3 From 1ead852dd88779eda12cb09cc894a03d9abfe1ec Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 16 Jun 2016 19:13:49 +0200 Subject: x86/amd_nb: Fix boot crash on non-AMD systems Fix boot crash that triggers if this driver is built into a kernel and run on non-AMD systems. AMD northbridges users call amd_cache_northbridges() and it returns a negative value to signal that we weren't able to cache/detect any northbridges on the system. At least, it should do so as all its callers expect it to do so. But it does return a negative value only when kmalloc() fails. Fix it to return -ENODEV if there are no NBs cached as otherwise, amd_nb users like amd64_edac, for example, which relies on it to know whether it should load or not, gets loaded on systems like Intel Xeons where it shouldn't. Reported-and-tested-by: Tony Battersby Signed-off-by: Borislav Petkov Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1466097230-5333-2-git-send-email-bp@alien8.de Link: https://lkml.kernel.org/r/5761BEB0.9000807@cybernetics.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_nb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index a147e676fc7b..e991d5c8bb3a 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -71,8 +71,8 @@ int amd_cache_northbridges(void) while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) i++; - if (i == 0) - return 0; + if (!i) + return -ENODEV; nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) -- cgit v1.2.3 From 487cf917ed0d12afaf403d9d77684bf44b8c13be Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Wed, 29 Jun 2016 04:27:36 -0400 Subject: Revert "ACPI, PCI, IRQ: remove redundant code in acpi_irq_penalty_init()" Trying to make the ISA and PCI init functionality common turned out to be a bad idea, because the ISA path depends on external functionality. Restore the previous behavior and limit the refactoring to PCI interrupts only. Fixes: 1fcb6a813c4f "ACPI,PCI,IRQ: remove redundant code in acpi_irq_penalty_init()" Signed-off-by: Sinan Kaya Tested-by: Wim Osterholt Signed-off-by: Rafael J. Wysocki --- arch/x86/pci/acpi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index b2a4e2a61f6b..3cd69832d7f4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -396,6 +396,7 @@ int __init pci_acpi_init(void) return -ENODEV; printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); + acpi_irq_penalty_init(); pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; x86_init.pci.init_irq = x86_init_noop; -- cgit v1.2.3 From 9010ae4a8dee29e5886e86682799dde0eee7f447 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 1 Jul 2016 15:22:22 -0700 Subject: perf/x86/intel: Update event constraints when HT is off This patch updates the event constraints for non-PEBS mode for Intel Broadwell and Skylake processors. When HT is off, each CPU gets 8 generic counters. However, not all events can be programmed on any of the 8 counters. This patch adds the constraints for the MEM_* events which can only be measured on the bottom 4 counters. The constraints are also valid when HT is off because, then, there are only 4 generic counters and they are the bottom counters. Signed-off-by: Stephane Eranian Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1467411742-13245-1-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7c666958a625..9b4f9d3ce465 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -115,6 +115,10 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + /* + * When HT is off these events can only run on the bottom 4 counters + * When HT is on, they are impacted by the HT bug and require EXCL access + */ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -139,6 +143,10 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + /* + * When HT is off these events can only run on the bottom 4 counters + * When HT is on, they are impacted by the HT bug and require EXCL access + */ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -182,6 +190,16 @@ struct event_constraint intel_skl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + + /* + * when HT is off, these can only run on the bottom 4 counters + */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc6, 0xf), /* FRONTEND_RETIRED.* */ + EVENT_CONSTRAINT_END }; @@ -250,6 +268,10 @@ static struct event_constraint intel_hsw_event_constraints[] = { /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), + /* + * When HT is off these events can only run on the bottom 4 counters + * When HT is on, they are impacted by the HT bug and require EXCL access + */ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -264,6 +286,13 @@ struct event_constraint intel_bdw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ + /* + * when HT is off, these can only run on the bottom 4 counters + */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */ EVENT_CONSTRAINT_END }; -- cgit v1.2.3 From fc18822510721fe694d273c5211c71ea52796d76 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 1 Jul 2016 23:02:05 -0500 Subject: perf/x86: Fix 32-bit perf user callgraph collection A basic perf callgraph record operation causes an immediate panic on a 32-bit kernel compiled with CONFIG_CC_STACKPROTECTOR=y: $ perf record -g ls Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: c0404fbd CPU: 0 PID: 998 Comm: ls Not tainted 4.7.0-rc5+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.1-1.fc24 04/01/2014 c0dd5967 ff7afe1c 00000086 f41dbc2c c07445a0 464c457f f41dbca8 f41dbc44 c05646f4 f41dbca8 464c457f f41dbca8 464c457f f41dbc54 c04625be c0ce56fc c0404fbd f41dbc88 c0404fbd b74668f0 f41dc000 00000000 c0000000 00000000 Call Trace: [] dump_stack+0x58/0x78 [] panic+0x8e/0x1c6 [] __stack_chk_fail+0x1e/0x30 [] ? perf_callchain_user+0x22d/0x230 [] perf_callchain_user+0x22d/0x230 [] get_perf_callchain+0x1ff/0x270 [] perf_callchain+0x78/0x90 [] perf_prepare_sample+0x24b/0x370 [] perf_event_output_forward+0x24/0x70 [] __perf_event_overflow+0xa0/0x210 [] ? cpu_clock_event_read+0x43/0x50 [] perf_swevent_hrtimer+0x101/0x180 [] ? kmap_atomic_prot+0x35/0x140 [] ? get_page_from_freelist+0x279/0x950 [] ? vma_interval_tree_remove+0x158/0x230 [] ? wp_page_copy.isra.82+0x2f4/0x630 [] ? page_add_file_rmap+0x1d/0x50 [] ? unlock_page+0x61/0x80 [] ? filemap_map_pages+0x305/0x320 [] ? handle_mm_fault+0xb7f/0x1560 [] ? timerqueue_del+0x1b/0x70 [] ? __remove_hrtimer+0x2e/0x60 [] __hrtimer_run_queues+0xcb/0x2a0 [] ? __perf_event_overflow+0x210/0x210 [] hrtimer_interrupt+0x8a/0x180 [] local_apic_timer_interrupt+0x32/0x60 [] smp_apic_timer_interrupt+0x33/0x50 [] apic_timer_interrupt+0x34/0x3c Kernel Offset: disabled ---[ end Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: c0404fbd The panic is caused by the fact that perf_callchain_user() mistakenly assumes it's 64-bit only and ends up corrupting the stack. Signed-off-by: Josh Poimboeuf Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org # v4.5+ Fixes: 75925e1ad7f5 ("perf/x86: Optimize stack walk user accesses") Link: http://lkml.kernel.org/r/1a547f5077ec30f75f9b57074837c3c80df86e5e.1467432113.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 33787ee817f0..26ced536005a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2319,7 +2319,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; - const void __user *fp; + const unsigned long __user *fp; if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ @@ -2332,7 +2332,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; - fp = (void __user *)regs->bp; + fp = (unsigned long __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -2345,16 +2345,17 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs pagefault_disable(); while (entry->nr < entry->max_stack) { unsigned long bytes; + frame.next_frame = NULL; frame.return_address = 0; - if (!access_ok(VERIFY_READ, fp, 16)) + if (!access_ok(VERIFY_READ, fp, sizeof(*fp) * 2)) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); + bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); if (bytes != 0) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); + bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp)); if (bytes != 0) break; -- cgit v1.2.3 From 175a20c16fdb7700fcac63f1eeb2caa7e1dddd2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 23 Jun 2016 18:06:49 +0300 Subject: x86/perf/intel/rapl: Fix module name collision with powercap intel-rapl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 4b6e2571bf00 the rapl perf module calls itself intel-rapl. That name was already in use by the rapl powercap driver, which now fails to load if the perf module is loaded. Fix the problem by renaming the perf module to intel-rapl-perf, so that both modules can coexist. Fixes: 4b6e2571bf00 ("x86/perf/intel/rapl: Make the Intel RAPL PMU driver modular") Signed-off-by: Ville Syrjälä Cc: Vince Weaver Cc: Alexander Shishkin Cc: Kan Liang Cc: Stephane Eranian Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1466694409-3620-1-git-send-email-ville.syrjala@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 3660b2cf245a..06c2baa51814 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,8 +1,8 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o -obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl.o -intel-rapl-objs := rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o +intel-rapl-perf-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o -- cgit v1.2.3 From 8709ed4d4b0eab04561c1ec9e6ea50fd1e3897ff Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 17 Jun 2016 17:15:03 -0700 Subject: x86/cpu: Fix duplicated X86_BUG(9) macro cpufeatures.h currently defines X86_BUG(9) twice on 32-bit: #define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ ... #ifdef CONFIG_X86_32 #define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif I think what happened was that this added the X86_BUG_ESPFIX, but in an #ifdef below most of the bugs: 58a5aac53313 x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled Then this came along and added X86_BUG_NULL_SEG, but collided with the earlier one that did the bug below the main block defining all the X86_BUG()s. 7a5d67048745 x86/cpu: Probe the behavior of nulling out a segment at boot time Signed-off-by: Dave Hansen Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160618001503.CEE1B141@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4a413485f9eb..c64b1e9c5d1a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -301,10 +301,6 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ -#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ - - #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional @@ -312,5 +308,7 @@ */ #define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif +#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #endif /* _ASM_X86_CPUFEATURES_H */ -- cgit v1.2.3 From eb019503569c8c701f1e9c70e848d99c6680839b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 10 Jul 2016 19:14:01 +0200 Subject: perf/x86: Fix bogus kernel printk, again This showed up as "6Failed to access..." here. Signed-off-by: Vegard Nossum Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Chen Yucong Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1b74dde7c47c ("x86/cpu: Convert printk(KERN_ ...) to pr_(...)") Link: http://lkml.kernel.org/r/1468170841-17045-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 26ced536005a..91eac39625be 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -263,7 +263,7 @@ static bool check_hw_exists(void) msr_fail: pr_cont("Broken PMU hardware detected, using software events only.\n"); - pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", + printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n", boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, reg, val_new); -- cgit v1.2.3 From 447d29d1d3aed839e74c2401ef63387780ac51ed Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: x86/quirks: Apply nvidia_bugs quirk only on root bus Since the following commit: 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") ... early quirks are only applied to devices on the root bus. The motivation was to prevent application of the nvidia_bugs quirk on secondary buses. We're about to reintroduce scanning of secondary buses for a quirk to reset the Broadcom 4331 wireless card on 2011/2012 Macs. To prevent regressions, open code the requirement to apply nvidia_bugs only on the root bus. Signed-off-by: Lukas Wunner Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/4d5477c1d76b2f0387a780f2142bbcdd9fee869b.1465690253.git.lukas@wunner.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bca14c899137..256976fe2666 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -75,6 +75,13 @@ static void __init nvidia_bugs(int num, int slot, int func) { #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC + /* + * Only applies to Nvidia root ports (bus 0) and not to + * Nvidia graphics cards with PCI ports on secondary buses. + */ + if (num) + return; + /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. -- cgit v1.2.3 From 850c321027c2e31d0afc71588974719a4b565550 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: x86/quirks: Reintroduce scanning of secondary buses We used to scan secondary buses until the following commit that was applied in 2009: 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") which commit constrained early quirks to the root bus only. Its motivation was to prevent application of the nvidia_bugs quirk on secondary buses. We're about to add a quirk to reset the Broadcom 4331 wireless card on 2011/2012 Macs, which is located on a secondary bus behind a PCIe root port. To facilitate that, reintroduce scanning of secondary buses. The commit message of 8659c406ade3 notes that scanning only the root bus "saves quite some unnecessary scanning work". The algorithm used prior to 8659c406ade3 was particularly time consuming because it scanned buses 0 to 31 brute force. To avoid lengthening boot time, employ a recursive strategy which only scans buses that are actually reachable from the root bus. Yinghai Lu pointed out that the secondary bus number read from a bridge's config space may be invalid, in particular a value of 0 would cause an infinite loop. The PCI core goes beyond that and recurses to a child bus only if its bus number is greater than the parent bus number (see pci_scan_bridge()). Since the root bus is numbered 0, this implies that secondary buses may not be 0. Do the same on early scanning. If this algorithm is found to significantly impact boot time or cause infinite loops on broken hardware, it would be possible to limit its recursion depth: The Broadcom 4331 quirk applies at depth 1, all others at depth 0, so the bus need not be scanned deeper than that for now. An alternative approach would be to revert to scanning only the root bus, and apply the Broadcom 4331 quirk to the root ports 8086:1c12, 8086:1e12 and 8086:1e16. Apple always positioned the card behind either of these three ports. The quirk would then check presence of the card in slot 0 below the root port and do its deed. Signed-off-by: Lukas Wunner Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/f0daa70dac1a9b2483abdb31887173eb6ab77bdf.1465690253.git.lukas@wunner.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 256976fe2666..ea60c05c2487 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -610,12 +610,6 @@ struct chipset { void (*f)(int num, int slot, int func); }; -/* - * Only works for devices on the root bus. If you add any devices - * not on bus 0 readd another loop level in early_quirks(). But - * be careful because at least the Nvidia quirk here relies on - * only matching on bus 0. - */ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, @@ -648,6 +642,8 @@ static struct chipset early_qrk[] __initdata = { {} }; +static void __init early_pci_scan_bus(int bus); + /** * check_dev_quirk - apply early quirks to a given PCI device * @num: bus number @@ -656,7 +652,7 @@ static struct chipset early_qrk[] __initdata = { * * Check the vendor & device ID against the early quirks table. * - * If the device is single function, let early_quirks() know so we don't + * If the device is single function, let early_pci_scan_bus() know so we don't * poke at this device again. */ static int __init check_dev_quirk(int num, int slot, int func) @@ -665,6 +661,7 @@ static int __init check_dev_quirk(int num, int slot, int func) u16 vendor; u16 device; u8 type; + u8 sec; int i; class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); @@ -692,25 +689,36 @@ static int __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); + + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); + if (sec > num) + early_pci_scan_bus(sec); + } + if (!(type & 0x80)) return -1; return 0; } -void __init early_quirks(void) +static void __init early_pci_scan_bus(int bus) { int slot, func; - if (!early_pci_allowed()) - return; - /* Poor man's PCI discovery */ - /* Only scan the root bus */ for (slot = 0; slot < 32; slot++) for (func = 0; func < 8; func++) { /* Only probe function 0 on single fn devices */ - if (check_dev_quirk(0, slot, func)) + if (check_dev_quirk(bus, slot, func)) break; } } + +void __init early_quirks(void) +{ + if (!early_pci_allowed()) + return; + + early_pci_scan_bus(0); +} -- cgit v1.2.3 From abb2bafd295fe962bbadc329dbfb2146457283ac Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 12 Jun 2016 12:31:53 +0200 Subject: x86/quirks: Add early quirk to reset Apple AirPort card MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The EFI firmware on Macs contains a full-fledged network stack for downloading OS X images from osrecovery.apple.com. Unfortunately on Macs introduced 2011 and 2012, EFI brings up the Broadcom 4331 wireless card on every boot and leaves it enabled even after ExitBootServices has been called. The card continues to assert its IRQ line, causing spurious interrupts if the IRQ is shared. It also corrupts memory by DMAing received packets, allowing for remote code execution over the air. This only stops when a driver is loaded for the wireless card, which may be never if the driver is not installed or blacklisted. The issue seems to be constrained to the Broadcom 4331. Chris Milsted has verified that the newer Broadcom 4360 built into the MacBookPro11,3 (2013/2014) does not exhibit this behaviour. The chances that Apple will ever supply a firmware fix for the older machines appear to be zero. The solution is to reset the card on boot by writing to a reset bit in its mmio space. This must be done as an early quirk and not as a plain vanilla PCI quirk to successfully combat memory corruption by DMAed packets: Matthew Garrett found out in 2012 that the packets are written to EfiBootServicesData memory (http://mjg59.dreamwidth.org/11235.html). This type of memory is made available to the page allocator by efi_free_boot_services(). Plain vanilla PCI quirks run much later, in subsys initcall level. In-between a time window would be open for memory corruption. Random crashes occurring in this time window and attributed to DMAed packets have indeed been observed in the wild by Chris Bainbridge. When Matthew Garrett analyzed the memory corruption issue in 2012, he sought to fix it with a grub quirk which transitions the card to D3hot: http://git.savannah.gnu.org/cgit/grub.git/commit/?id=9d34bb85da56 This approach does not help users with other bootloaders and while it may prevent DMAed packets, it does not cure the spurious interrupts emanating from the card. Unfortunately the card's mmio space is inaccessible in D3hot, so to reset it, we have to undo the effect of Matthew's grub patch and transition the card back to D0. Note that the quirk takes a few shortcuts to reduce the amount of code: The size of BAR 0 and the location of the PM capability is identical on all affected machines and therefore hardcoded. Only the address of BAR 0 differs between models. Also, it is assumed that the BCMA core currently mapped is the 802.11 core. The EFI driver seems to always take care of this. Michael Büsch, Bjorn Helgaas and Matt Fleming contributed feedback towards finding the best solution to this problem. The following should be a comprehensive list of affected models: iMac13,1 2012 21.5" [Root Port 00:1c.3 = 8086:1e16] iMac13,2 2012 27" [Root Port 00:1c.3 = 8086:1e16] Macmini5,1 2011 i5 2.3 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,2 2011 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini5,3 2011 i7 2.0 GHz [Root Port 00:1c.1 = 8086:1c12] Macmini6,1 2012 i5 2.5 GHz [Root Port 00:1c.1 = 8086:1e12] Macmini6,2 2012 i7 2.3 GHz [Root Port 00:1c.1 = 8086:1e12] MacBookPro8,1 2011 13" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,2 2011 15" [Root Port 00:1c.1 = 8086:1c12] MacBookPro8,3 2011 17" [Root Port 00:1c.1 = 8086:1c12] MacBookPro9,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro9,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,1 2012 15" [Root Port 00:1c.1 = 8086:1e12] MacBookPro10,2 2012 13" [Root Port 00:1c.1 = 8086:1e12] For posterity, spurious interrupts caused by the Broadcom 4331 wireless card resulted in splats like this (stacktrace omitted): irq 17: nobody cared (try booting with the "irqpoll" option) handlers: [] pcie_isr [] sdhci_irq [sdhci] threaded [] sdhci_thread_irq [sdhci] [] azx_interrupt [snd_hda_codec] Disabling IRQ #17 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=79301 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111781 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=728916 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=895951#c16 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1009819 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1098621 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1149632#c5 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1279130 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1332732 Tested-by: Konstantin Simanov # [MacBookPro8,1] Tested-by: Lukas Wunner # [MacBookPro9,1] Tested-by: Bryan Paradis # [MacBookPro9,2] Tested-by: Andrew Worsley # [MacBookPro10,1] Tested-by: Chris Bainbridge # [MacBookPro10,2] Signed-off-by: Lukas Wunner Acked-by: Rafał Miłecki Acked-by: Matt Fleming Cc: Andy Lutomirski Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris Milsted Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Garrett Cc: Michael Buesch Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Cc: b43-dev@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: stable@vger.kernel.org Cc: stable@vger.kernel.org # 123456789abc: x86/quirks: Apply nvidia_bugs quirk only on root bus Cc: stable@vger.kernel.org # 123456789abc: x86/quirks: Reintroduce scanning of secondary buses Link: http://lkml.kernel.org/r/48d0972ac82a53d460e5fce77a07b2560db95203.1465690253.git.lukas@wunner.de [ Did minor readability edits. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 64 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index ea60c05c2487..57b71373bae3 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -11,7 +11,11 @@ #include #include +#include +#include #include +#include +#include #include #include #include @@ -21,6 +25,9 @@ #include #include #include +#include + +#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -597,6 +604,61 @@ static void __init force_disable_hpet(int num, int slot, int func) #endif } +#define BCM4331_MMIO_SIZE 16384 +#define BCM4331_PM_CAP 0x40 +#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) +#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) + +static void __init apple_airport_reset(int bus, int slot, int func) +{ + void __iomem *mmio; + u16 pmcsr; + u64 addr; + int i; + + if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + return; + + /* Card may have been put into PCI_D3hot by grub quirk */ + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); + mdelay(10); + + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + dev_err("Cannot power up Apple AirPort card\n"); + return; + } + } + + addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; + addr &= PCI_BASE_ADDRESS_MEM_MASK; + + mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); + if (!mmio) { + dev_err("Cannot iomap Apple AirPort card\n"); + return; + } + + pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); + + for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) + udelay(10); + + bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); + bcma_aread32(BCMA_RESET_CTL); + udelay(1); + + bcma_awrite32(BCMA_RESET_CTL, 0); + bcma_aread32(BCMA_RESET_CTL); + udelay(10); + + early_iounmap(mmio, BCM4331_MMIO_SIZE); +} #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 @@ -639,6 +701,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_BROADCOM, 0x4331, + PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} }; -- cgit v1.2.3 From 2ba78056acfe8d63a29565f91dae4678ed6b81ca Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 14 Jul 2016 12:06:53 -0700 Subject: kasan: add newline to messages Currently GPF messages with KASAN look as follows: kasan: GPF could be caused by NULL-ptr deref or user memory accessgeneral protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN Add newlines. Link: http://lkml.kernel.org/r/1467294357-98002-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/kasan_init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 1b1110fa0057..0493c17b8a51 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -54,8 +54,8 @@ static int kasan_die_handler(struct notifier_block *self, void *data) { if (val == DIE_GPF) { - pr_emerg("CONFIG_KASAN_INLINE enabled"); - pr_emerg("GPF could be caused by NULL-ptr deref or user memory access"); + pr_emerg("CONFIG_KASAN_INLINE enabled\n"); + pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n"); } return NOTIFY_OK; } -- cgit v1.2.3