From 4d178f94ebe123d462a51169b53854cb7f198888 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 12 Apr 2015 09:14:45 -0400 Subject: x86/asm: Merge common 32-bit values in asm-offsets.c Merge common values for 32-bit native and compat. Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/1428844486-6638-1-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets.c | 19 +++++++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 15 --------------- arch/x86/kernel/asm-offsets_64.c | 21 --------------------- 3 files changed, 19 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9f6b9341950f..b27f6ec90caa 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -41,6 +41,25 @@ void common(void) { OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + BLANK(); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); + + BLANK(); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + + BLANK(); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 47703aed74cf..628bfd4c06bb 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -17,17 +17,6 @@ 
void foo(void); void foo(void) { - OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); - OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); - OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); - OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); - OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); - OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); - OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); - OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); - OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); - BLANK(); - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); @@ -37,7 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_cpu, thread_info, cpu); BLANK(); @@ -60,9 +48,6 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); - BLANK(); - OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 5ce6f2da8763..dcaab87da629 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -29,27 +29,6 @@ int main(void) BLANK(); #endif -#ifdef CONFIG_IA32_EMULATION - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - BLANK(); - -#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) - ENTRY(ax); - ENTRY(bx); - ENTRY(cx); - ENTRY(dx); - ENTRY(si); - ENTRY(di); - ENTRY(bp); - ENTRY(sp); - ENTRY(ip); - BLANK(); -#undef ENTRY - - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); - BLANK(); -#endif - #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); -- cgit v1.2.3 From 14434052ffb3b7fe8f491e9d0a7793376fb79155 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 12 Apr 2015 09:14:46 -0400 Subject: x86/asm: Remove unused TI_cpu Signed-off-by: Brian Gerst Acked-by: 
Andy Lutomirski Cc: Andy Lutomirski Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/1428844486-6638-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_32.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 628bfd4c06bb..6ce39025f467 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -26,9 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_cpu, thread_info, cpu); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); -- cgit v1.2.3 From c0f6feba784e1087b905ad097d2d9ac0aaf744a5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 Apr 2015 08:50:14 +0200 Subject: x86/asm, x86/acpi/wakeup_64.S: Make global label a local one Make it a local symbol so that it doesn't appear in objdump output. No functionality change - code remains the same, just the global label disappears: ffffffff81039dbe: bf 03 00 00 00 mov $0x3,%edi ffffffff81039dc3: 31 c0 xor %eax,%eax ffffffff81039dc5: e8 b6 fd ff ff callq ffffffff81039b80 -ffffffff81039dca: eb 00 jmp ffffffff81039dcc - -ffffffff81039dcc : +ffffffff81039dca: eb 00 jmp ffffffff81039dcc ffffffff81039dcc: 48 c7 c0 80 1a ca 82 mov $0xffffffff82ca1a80,%rax ffffffff81039dd3: 48 8b 98 e2 00 00 00 mov 0xe2(%rax),%rbx ffffffff81039dda: 0f 22 e3 mov %rbx,%cr4 Signed-off-by: Borislav Petkov Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Pavel Machek Cc: Rafael J. 
Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429080614-22610-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/wakeup_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index ae693b51ed8e..8c35df468104 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel) pushfq popq pt_regs_flags(%rax) - movq $resume_point, saved_rip(%rip) + movq $.Lresume_point, saved_rip(%rip) movq %rsp, saved_rsp movq %rbp, saved_rbp @@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax call x86_acpi_enter_sleep_state /* in case something went wrong, restore the machine status and go on */ - jmp resume_point + jmp .Lresume_point .align 4 -resume_point: +.Lresume_point: /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx -- cgit v1.2.3 From aac82d319148c6a84e1bf90b86d3e0ec8bf0ee38 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Apr 2015 15:51:54 -0700 Subject: x86, paravirt, xen: Remove the 64-bit ->irq_enable_sysexit() pvop We don't use irq_enable_sysexit on 64-bit kernels any more. Remove all the paravirt and Xen machinery to support it on 64-bit kernels. Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8a03355698fe5b94194e9e7360f19f91c1b2cf1f.1428100853.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets.c | 2 ++ arch/x86/kernel/paravirt.c | 4 +++- arch/x86/kernel/paravirt_patch_64.c | 1 - 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index b27f6ec90caa..8e3d22a1af94 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -68,7 +68,9 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); +#ifdef CONFIG_X86_32 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); +#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 548d25f00c90..7563114d9c3a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || +#ifdef CONFIG_X86_32 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || +#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_32) .irq_enable_sysexit = native_irq_enable_sysexit, #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index a1da6737ba5b..0de21c62c348 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ 
b/arch/x86/kernel/paravirt_patch_64.c @@ -49,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); -- cgit v1.2.3 From 17be0aec74fb036eb4eb32c2268f3420a034762b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 21 Apr 2015 18:27:29 +0200 Subject: x86/asm/entry/64: Implement better check for canonical addresses This change makes the check exact (no more false positives on "negative" addresses). Andy explains: "Canonical addresses either start with 17 zeros or 17 ones. In the old code, we checked that the top (64-47) = 17 bits were all zero. We did this by shifting right by 47 bits and making sure that nothing was left. In the new code, we're shifting left by (64 - 48) = 16 bits and then signed shifting right by the same amount, this propagating the 17th highest bit to all positions to its left. If we get the same value we started with, then we're good to go." While it isn't really important to be fully correct here - almost all addresses we'll ever see will be userspace ones, but OTOH it looks to be cheap enough: the new code uses two more ALU ops but preserves %rcx, allowing to not reload it from pt_regs->cx again. On disassembly level, the changes are: cmp %rcx,0x80(%rsp) -> mov 0x80(%rsp),%r11; cmp %rcx,%r11 shr $0x2f,%rcx -> shl $0x10,%rcx; sar $0x10,%rcx; cmp %rcx,%r11 mov 0x58(%rsp),%rcx -> (eliminated) Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429633649-20169-1-git-send-email-dvlasenk@redhat.com [ Changelog massage. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7b238494b31..3c78a15a537d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -410,26 +410,27 @@ syscall_return: * a completely clean 64-bit userspace context. */ movq RCX(%rsp),%rcx - cmpq %rcx,RIP(%rsp) /* RCX == RIP */ + movq RIP(%rsp),%r11 + cmpq %rcx,%r11 /* RCX == RIP */ jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) + * the kernel, since userspace controls RSP. * - * If virtual addresses ever become wider, this will need + * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. 
*/ .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ jne opportunistic_sysret_failed @@ -466,8 +467,8 @@ syscall_return: */ syscall_return_via_sysret: CFI_REMEMBER_STATE - /* r11 is already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_R11 + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp),%rsp USERGS_SYSRET64 CFI_RESTORE_STATE -- cgit v1.2.3 From ac7f5dfb0348a33b2ea92a0c477103c4db45ad4e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 21 Apr 2015 18:03:13 +0200 Subject: x86/asm/entry/64: Merge 32-bit execve stubs with x32 ones, as they are identical Run-tested. Suggested-by: Brian Gerst Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429632194-13445-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3c78a15a537d..e952f6bf1d6d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -525,40 +525,27 @@ GLOBAL(stub_execveat) CFI_ENDPROC END(stub_execveat) -#ifdef CONFIG_X86_X32_ABI +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) +GLOBAL(stub32_execve) CFI_STARTPROC DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve CFI_ENDPROC +END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 - call compat_sys_execveat - jmp return_from_execve - CFI_ENDPROC -END(stub_x32_execveat) -#endif - -#ifdef CONFIG_IA32_EMULATION - .align 8 -GLOBAL(stub32_execve) - CFI_STARTPROC - call compat_sys_execve - jmp return_from_execve - CFI_ENDPROC -END(stub32_execve) - .align 8 GLOBAL(stub32_execveat) CFI_STARTPROC + DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve CFI_ENDPROC END(stub32_execveat) +END(stub_x32_execveat) #endif /* -- cgit v1.2.3 From 5f0052f9522b84269e1b3b435a806f873d992702 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:23 +0800 Subject: x86/irq: Save destination CPU ID in irq_cfg Cache destination CPU APIC ID into struct irq_cfg when assigning vector for interrupt. Upper layer just needs to read the cached APIC ID instead of calling apic->cpu_mask_to_apicid_and(), which helps to hide APIC driver details from IOAPIC/HPET/MSI drivers.
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6cedd7914581..c724ef6b218c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -188,6 +188,12 @@ next: } free_cpumask_var(tmp_mask); + if (!err) { + /* cache destination APIC IDs into cfg->dest_apicid */ + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, + &cfg->dest_apicid); + } + return err; } -- cgit v1.2.3 From b5dc8e6c21e7ffba0246bf39cea97805c142bf85 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:24 +0800 Subject: x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors Abstract CPU local APIC as an interrupt controller and create an irqdomain for it to manage CPU interrupt vectors. It's the base to enable hierarchical irqdomains on x86 systems. The final irqdomain hierarchy will look like this: IOAPIC domain ----| MSI/MSI-x domain ----> [Interrupt Remapping domain] -> CPU vector domain HPET_IRQ domain ----| ^ | DMAR domain ----------------------------------------------| HT_IRQ domain ----------------------------------------------| Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Prarit Bhargava Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 3 - arch/x86/kernel/apic/vector.c | 155 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 142 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f4dc2462a1ac..56d532106ef3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2356,9 +2356,6 @@ static int mp_irqdomain_create(int ioapic) ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); - if (gsi_cfg->gsi_base == 0) - irq_set_default_host(ip->irqdomain); - return 0; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c724ef6b218c..6358d8d351f5 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu + * Enable support of hierarchical irqdomains * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,7 +21,9 @@ #include #include +struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static struct irq_chip lapic_controller; void lock_vector_lock(void) { @@ -36,15 +40,21 @@ void unlock_vector_lock(void) struct irq_cfg *irq_cfg(unsigned int irq) { - return irq_get_chip_data(irq); + return irqd_cfg(irq_get_irq_data(irq)); } struct irq_cfg *irqd_cfg(struct irq_data *irq_data) { + if (!irq_data) + return NULL; + + while (irq_data->parent_data) + irq_data = irq_data->parent_data; + return irq_data->chip_data; } -static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +static struct irq_cfg *alloc_irq_cfg(int node) { struct irq_cfg *cfg; @@ -79,7 +89,7 @@ struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } - cfg = alloc_irq_cfg(at, node); + cfg = alloc_irq_cfg(node); if (cfg) irq_set_chip_data(at, cfg); else @@ -87,14 +97,13 @@ struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } -static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) +static void free_irq_cfg(struct irq_cfg *cfg) { - if (!cfg) - return; - irq_set_chip_data(at, NULL); - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); + if (cfg) { + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); + } } static int @@ -241,6 +250,90 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&vector_lock, flags); } +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static inline const 
struct cpumask * +irq_alloc_info_get_mask(struct irq_alloc_info *info) +{ + return (!info || !info->mask) ? apic->target_cpus() : info->mask; +} + +static void x86_vector_free_irqs(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irq_data && irq_data->chip_data) { + free_remapped_irq(virq); + clear_irq_vector(virq + i, irq_data->chip_data); + free_irq_cfg(irq_data->chip_data); + irq_domain_reset_irq_data(irq_data); + } + } +} + +static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + const struct cpumask *mask; + struct irq_data *irq_data; + struct irq_cfg *cfg; + int i, err; + + if (disable_apic) + return -ENXIO; + + /* Currently vector allocator can't guarantee contiguous allocations */ + if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) + return -ENOSYS; + + mask = irq_alloc_info_get_mask(info); + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irq_data); + cfg = alloc_irq_cfg(irq_data->node); + if (!cfg) { + err = -ENOMEM; + goto error; + } + + irq_data->chip = &lapic_controller; + irq_data->chip_data = cfg; + irq_data->hwirq = virq + i; + err = assign_irq_vector(virq, cfg, mask); + if (err) + goto error; + } + + return 0; + +error: + x86_vector_free_irqs(domain, virq, i + 1); + return err; +} + +static struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, +}; + int __init arch_probe_nr_irqs(void) { int nr; @@ -266,6 +359,11 @@ int __init arch_probe_nr_irqs(void) int __init arch_early_irq_init(void) { + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, + NULL); + BUG_ON(x86_vector_domain == NULL); + irq_set_default_host(x86_vector_domain); + return 
arch_early_ioapic_init(); } @@ -380,6 +478,36 @@ int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, return 0; } +static int vector_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) +{ + struct irq_cfg *cfg = irq_data->chip_data; + int err, irq = irq_data->irq; + + if (!config_enabled(CONFIG_SMP)) + return -EPERM; + + if (!cpumask_intersects(dest, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, dest); + if (err) { + struct irq_data *top = irq_get_irq_data(irq); + + if (assign_irq_vector(irq, cfg, top->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + return IRQ_SET_MASK_OK; +} + +static struct irq_chip lapic_controller = { + .irq_ack = apic_ack_edge, + .irq_set_affinity = vector_set_affinity, + .irq_retrigger = apic_retrigger_irq, +}; + #ifdef CONFIG_SMP void send_cleanup_vector(struct irq_cfg *cfg) { @@ -497,7 +625,7 @@ int arch_setup_hwirq(unsigned int irq, int node) unsigned long flags; int ret; - cfg = alloc_irq_cfg(irq, node); + cfg = alloc_irq_cfg(node); if (!cfg) return -ENOMEM; @@ -508,7 +636,7 @@ int arch_setup_hwirq(unsigned int irq, int node) if (!ret) irq_set_chip_data(irq, cfg); else - free_irq_cfg(irq, cfg); + free_irq_cfg(cfg); return ret; } @@ -518,7 +646,8 @@ void arch_teardown_hwirq(unsigned int irq) free_remapped_irq(irq); clear_irq_vector(irq, cfg); - free_irq_cfg(irq, cfg); + irq_set_chip_data(irq, NULL); + free_irq_cfg(cfg); } static void __init print_APIC_field(int base) -- cgit v1.2.3 From bd8eb63f8a3907bb477992145cb6ce0064a1e43f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:25 +0800 Subject: x86/hpet: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for HPET, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. 
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Srivatsa S. Bhat Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1416894816-23245-4-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3acbff4716b0..ae29554f57ea 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -476,7 +477,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); return -EINVAL; } return 0; @@ -484,9 +485,10 @@ static int hpet_setup_msi_irq(unsigned int irq) static int hpet_assign_irq(struct hpet_dev *dev) { - unsigned int irq = irq_alloc_hwirq(-1); + int irq; - if (!irq) + irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + if (irq <= 0) return -EINVAL; irq_set_handler_data(irq, dev); -- cgit v1.2.3 From 4c8f9960ee497020d0858362c81ece984bc89aa5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:26 +0800 Subject: x86/MSI: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for PCI MSI, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1416894816-23245-5-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d6ba2d660dc5..76cc2c902176 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -146,23 +147,20 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { struct msi_desc *msidesc; - unsigned int irq; - int node, ret; + int irq, ret; /* Multiple MSI vectors only supported with interrupt remapping */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - node = dev_to_node(&dev->dev); - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_alloc_hwirq(node); - if (!irq) + irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + if (irq <= 0) return -ENOSPC; ret = setup_msi_irq(dev, msidesc, irq, 0); if (ret < 0) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); return ret; } @@ -172,7 +170,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) void native_teardown_msi_irq(unsigned int irq) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); } #ifdef CONFIG_DMAR_TABLE -- cgit v1.2.3 From af87baedf2c23b1181f51323339210a26a64f7fc Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:28 +0800 Subject: x86/htirq: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for HTIRQ, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. This patch changes the interfaces between arch independent PCI driver and arch specific code. 
Currently HT_IRQ is only enabled on x86, so it does not affect other architectures. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-7-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 816f36e979ad..b307ee7a7148 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -61,31 +62,30 @@ static struct irq_chip ht_irq_chip = { .flags = IRQCHIP_SKIP_SET_WAKE, }; +int arch_alloc_ht_irq(struct pci_dev *dev) +{ + return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL); +} + +void arch_free_ht_irq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} + int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; struct ht_irq_msg msg; - unsigned dest; - int err; if (disable_apic) return -ENXIO; cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); msg.address_lo = HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | HT_IRQ_LOW_VECTOR(cfg->vector) | ((apic->irq_dest_mode == 0) ? 
HT_IRQ_LOW_DM_PHYSICAL : -- cgit v1.2.3 From a62b32cdd0a6324c959f40b3c9b928b275297066 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:29 +0800 Subject: x86/dmar: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for DMAR and interrupt remapping, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. The private definitions of irq_alloc_hwirqs()/irq_free_hwirqs() are a temporary solution, they will be removed once we have converted the interrupt remapping driver to use irqdomain framework. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 76cc2c902176..9be7d6d8a579 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -223,6 +223,16 @@ int arch_setup_dmar_msi(unsigned int irq) "edge"); return 0; } + +int dmar_alloc_hwirq(void) +{ + return irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); +} + +void dmar_free_hwirq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} #endif /* -- cgit v1.2.3 From 3cb96f0c97330834929abe9bd2ca3c252a83def0 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:34 +0800 Subject: x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Srivatsa S. 
Bhat Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1428905519-23704-13-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 166 +++++++++++++++++++++++++++++++++++++++------ arch/x86/kernel/hpet.c | 57 ++++------------ 2 files changed, 161 insertions(+), 62 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9be7d6d8a579..10d9ae8f2166 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -51,6 +51,44 @@ void native_compose_msi_msg(struct pci_dev *pdev, MSI_DATA_VECTOR(cfg->vector); } +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct irq_cfg *cfg = irqd_cfg(data); + + msg->address_hi = MSI_ADDR_BASE_HI; + + if (x2apic_enabled()) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL : + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU : + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(cfg->dest_apicid); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? 
+ MSI_DATA_DELIVERY_FIXED : + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); +} + +static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) +{ + struct irq_cfg *cfg = irqd_cfg(irq_data); + + msg->data &= ~MSI_DATA_VECTOR_MASK; + msg->data |= MSI_DATA_VECTOR(cfg->vector); + msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); +} + static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg, u8 hpet_id) { @@ -239,44 +277,43 @@ void dmar_free_hwirq(int irq) * MSI message composition */ #ifdef CONFIG_HPET_TIMER +static inline int hpet_dev_id(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); + struct irq_data *parent = data->parent_data; struct msi_msg msg; - unsigned int dest; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - hpet_msi_read(data->handler_data, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - hpet_msi_write(data->handler_data, &msg); + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + hpet_msi_read(data->handler_data, &msg); + msi_update_msg(&msg, data); + hpet_msi_write(data->handler_data, &msg); + } - return IRQ_SET_MASK_OK_NOCOPY; + return ret; } -static struct irq_chip hpet_msi_type = { +static struct irq_chip hpet_msi_controller = { .name = "HPET_MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = hpet_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_print_chip = irq_remapping_print_chip, + .irq_compose_msi_msg 
= irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; int default_setup_hpet_msi(unsigned int irq, unsigned int id) { - struct irq_chip *chip = &hpet_msi_type; + struct irq_chip *chip = &hpet_msi_controller; struct msi_msg msg; int ret; @@ -291,4 +328,95 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id) irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; } + +static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_HPET) + return -EINVAL; + if (irq_find_mapping(domain, info->hpet_index)) { + pr_warn("IRQ for HPET%d already exists.\n", info->hpet_index); + return -EEXIST; + } + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_hwirq_and_chip(domain, virq, info->hpet_index, + &hpet_msi_controller, NULL); + irq_set_handler_data(virq, info->hpet_data); + __irq_set_handler(virq, handle_edge_irq, 0, "edge"); + } + + return ret; +} + +static void hpet_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + BUG_ON(nr_irqs > 1); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static void hpet_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); +} + +static void hpet_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); +} + +static struct irq_domain_ops hpet_domain_ops = { + .alloc = hpet_domain_alloc, + .free = hpet_domain_free, + .activate = hpet_domain_activate, + .deactivate = 
hpet_domain_deactivate, +}; + +struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct irq_domain *parent; + struct irq_alloc_info info; + + if (x86_vector_domain == NULL) + return NULL; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_id = hpet_id; + parent = irq_remapping_get_ir_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + + return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, + (void *)(long)hpet_id); +} + +int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_data = dev; + info.hpet_id = hpet_dev_id(domain); + info.hpet_index = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, NULL); +} #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ae29554f57ea..e3bc18080052 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -306,8 +306,6 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static int hpet_setup_msi_irq(unsigned int irq); - static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { @@ -358,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode, hpet_enable_legacy_int(); } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_setup_msi_irq(hdev->irq); + irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); @@ -424,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta, static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); static struct hpet_dev *hpet_devs; +static struct irq_domain *hpet_domain; void hpet_msi_unmask(struct irq_data *data) { @@ -474,32 +473,6 @@ static int hpet_msi_next_event(unsigned long delta, return 
hpet_next_event(delta, evt, hdev->num); } -static int hpet_setup_msi_irq(unsigned int irq) -{ - if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_domain_free_irqs(irq, 1); - return -EINVAL; - } - return 0; -} - -static int hpet_assign_irq(struct hpet_dev *dev) -{ - int irq; - - irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); - if (irq <= 0) - return -EINVAL; - - irq_set_handler_data(irq, dev); - - if (hpet_setup_msi_irq(irq)) - return -EINVAL; - - dev->irq = irq; - return 0; -} - static irqreturn_t hpet_interrupt_handler(int irq, void *data) { struct hpet_dev *dev = (struct hpet_dev *)data; @@ -542,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) if (!(hdev->flags & HPET_DEV_VALID)) return; - if (hpet_setup_msi_irq(hdev->irq)) - return; - hdev->cpu = cpu; per_cpu(cpu_hpet_dev, cpu) = hdev; evt->name = hdev->name; @@ -576,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int id; unsigned int num_timers; unsigned int num_timers_used = 0; - int i; + int i, irq; if (hpet_msi_disable) return; @@ -589,6 +559,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) num_timers++; /* Value read out starts from 0 */ hpet_print_config(); + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); if (!hpet_devs) return; @@ -603,15 +577,16 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) if (!(cfg & HPET_TN_FSB_CAP)) continue; + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); + if (irq < 0) + continue; + + sprintf(hdev->name, "hpet%d", i); + hdev->num = i; + hdev->irq = irq; hdev->flags = 0; if (cfg & HPET_TN_PERIODIC_CAP) hdev->flags |= HPET_DEV_PERI_CAP; - hdev->num = i; - - sprintf(hdev->name, "hpet%d", i); - if (hpet_assign_irq(hdev)) - continue; - hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; @@ -711,10 +686,6 @@ 
static int hpet_cpuhp_notify(struct notifier_block *n, } #else -static int hpet_setup_msi_irq(unsigned int irq) -{ - return 0; -} static void hpet_msi_capability_lookup(unsigned int start_timer) { return; -- cgit v1.2.3 From 52f518a3a7c2f80551a38d38be28bc9f335e713c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:35 +0800 Subject: x86/MSI: Use hierarchical irqdomains to manage MSI interrupts Enhance the MSI code to support hierarchical irqdomains, which helps to make the architecture clearer. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-14-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 141 ++++++++++++++++++++++-------------------- arch/x86/kernel/apic/vector.c | 2 + 2 files changed, 77 insertions(+), 66 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 10d9ae8f2166..c426cd58844e 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu + * Convert to hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,6 +23,8 @@ #include #include +static struct irq_domain *msi_default_domain; + void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, struct msi_msg *msg, u8 hpet_id) @@ -114,102 +118,107 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, return 0; } -static int -msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; - - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - __get_cached_msi_msg(data->msi_desc, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - __pci_write_msi_msg(data->msi_desc, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. 
*/ -static struct irq_chip msi_chip = { +static struct irq_chip pci_msi_controller = { .name = "PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, - .irq_ack = apic_ack_edge, - .irq_set_affinity = msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_print_chip = irq_remapping_print_chip, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset) +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - struct irq_chip *chip = &msi_chip; - struct msi_msg msg; - unsigned int irq = irq_base + irq_offset; - int ret; + struct irq_domain *domain; + struct irq_alloc_info info; - ret = msi_compose_msg(dev, irq, &msg, -1); - if (ret < 0) - return ret; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_MSI; + info.msi_dev = dev; - irq_set_msi_desc_off(irq_base, irq_offset, msidesc); + domain = irq_remapping_get_irq_domain(&info); + if (domain == NULL) + domain = msi_default_domain; + if (domain == NULL) + return -ENOSYS; - /* - * MSI-X message is written per-IRQ, the offset is always 0. - * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. 
- */ - if (!irq_offset) - pci_write_msi_msg(irq, &msg); + return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); +} - setup_remapped_irq(irq, irq_cfg(irq), chip); +void native_teardown_msi_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); +} - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); +static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->msi_hwirq; +} - dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq); +static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + + init_irq_alloc_info(arg, NULL); + arg->msi_dev = pdev; + if (desc->msi_attrib.is_msix) { + arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + } else { + arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + } return 0; } -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { - struct msi_desc *msidesc; - int irq, ret; + arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); +} - /* Multiple MSI vectors only supported with interrupt remapping */ - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; +static struct msi_domain_ops pci_msi_domain_ops = { + .get_hwirq = pci_msi_get_hwirq, + .msi_prepare = pci_msi_prepare, + .set_desc = pci_msi_set_desc, +}; - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); - if (irq <= 0) - return -ENOSPC; +static struct msi_domain_info pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) { - 
irq_domain_free_irqs(irq, 1); - return ret; - } +void arch_init_msi_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; - } - return 0; + msi_default_domain = pci_msi_create_irq_domain(NULL, + &pci_msi_domain_info, parent); + if (!msi_default_domain) + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); } -void native_teardown_msi_irq(unsigned int irq) +#ifdef CONFIG_IRQ_REMAP +struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) { - irq_domain_free_irqs(irq, 1); + return msi_create_irq_domain(NULL, &pci_msi_domain_info, parent); } +#endif #ifdef CONFIG_DMAR_TABLE static int diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6358d8d351f5..a8d82896be75 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -364,6 +364,8 @@ int __init arch_early_irq_init(void) BUG_ON(x86_vector_domain == NULL); irq_set_default_host(x86_vector_domain); + arch_init_msi_domain(x86_vector_domain); + return arch_early_ioapic_init(); } -- cgit v1.2.3 From 80aa283364a17998dceb577bd185e3380b927544 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:36 +0800 Subject: x86/irq: Directly call native_compose_msi_msg() for DMAR IRQ DMAR interrupt won't be remapped by interrupt remapping hardware, so directly call native_compose_msi_msg() for DMAR IRQ to compose MSI message data. This will help to simplify MSI code later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-15-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c426cd58844e..9adb87100ffe 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -259,12 +259,10 @@ static struct irq_chip dmar_msi_type = { int arch_setup_dmar_msi(unsigned int irq) { - int ret; struct msi_msg msg; + struct irq_cfg *cfg = irq_cfg(irq); - ret = msi_compose_msg(NULL, irq, &msg, -1); - if (ret < 0) - return ret; + native_compose_msi_msg(NULL, irq, cfg->dest_apicid, &msg, -1); dmar_msi_write(irq, &msg); irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, "edge"); -- cgit v1.2.3 From 7a53a12162cbe5feb66380b96cc794a031a8f39a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:39 +0800 Subject: irq_remapping: Clean up unused MSI related code Now MSI interrupt has been converted to new hierarchical irqdomain interfaces, so remove legacy MSI related code and interfaces. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Rafael J. Wysocki Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Link: http://lkml.kernel.org/r/1428905519-23704-18-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/x86_init.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 234b0722de53..b094d691f2fe 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -111,11 +111,9 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, - .compose_msi_msg = native_compose_msi_msg, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, - .setup_hpet_msi = default_setup_hpet_msi, }; /* MSI arch specific hooks */ -- cgit v1.2.3 From b1855c752e67d1125d41fadb499014b49a245db8 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:40 +0800 Subject: x86/MSI: Clean up unused MSI related code and interfaces Now that MSI interrupts have been converted to the new hierarchical irqdomain interfaces, remove the legacy MSI-related code and interfaces. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Yijing Wang Link: http://lkml.kernel.org/r/1428905519-23704-19-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 55 ++++------------------------------------------ 1 file changed, 4 insertions(+), 51 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9adb87100ffe..9fe7a08479fa 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -25,16 +25,12 @@ static struct irq_domain *msi_default_domain; -void native_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static void native_compose_msi_msg(struct irq_cfg *cfg, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg(irq); - msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); msg->address_lo = MSI_ADDR_BASE_LO | @@ -44,7 +40,7 @@ void native_compose_msi_msg(struct pci_dev *pdev, ((apic->irq_delivery_mode != dest_LowestPrio) ? 
MSI_ADDR_REDIRECTION_CPU : MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | @@ -93,31 +89,6 @@ static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); } -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); - - return 0; -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. @@ -262,7 +233,7 @@ int arch_setup_dmar_msi(unsigned int irq) struct msi_msg msg; struct irq_cfg *cfg = irq_cfg(irq); - native_compose_msi_msg(NULL, irq, cfg->dest_apicid, &msg, -1); + native_compose_msi_msg(cfg, &msg); dmar_msi_write(irq, &msg); irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, "edge"); @@ -318,24 +289,6 @@ static struct irq_chip hpet_msi_controller = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - struct irq_chip *chip = &hpet_msi_controller; - struct msi_msg msg; - int ret; - - ret = msi_compose_msg(NULL, irq, &msg, id); - if (ret < 0) - return ret; - - hpet_msi_write(irq_get_handler_data(irq), &msg); - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_cfg(irq), chip); - - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); - return 0; -} - static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { -- cgit v1.2.3 From 34742db8eaf9ff364034f214ee5827701e131d4b Mon Sep 17 00:00:00 2001 From: 
Jiang Liu Date: Mon, 13 Apr 2015 14:11:41 +0800 Subject: iommu/vt-d: Refine the interfaces to create IRQ for DMAR unit Refine the interfaces to create IRQ for DMAR unit. It's a preparation for converting DMAR IRQ to hierarchical irqdomain on x86. It also moves dmar_alloc_hwirq()/dmar_free_hwirq() from irq_remapping.h to dmar.h. They are not irq_remapping specific. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Vinod Koul Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Tony Luck Cc: Fenghua Yu Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-20-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9fe7a08479fa..ca6250439acc 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -228,25 +228,27 @@ static struct irq_chip dmar_msi_type = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_setup_dmar_msi(unsigned int irq) +int dmar_alloc_hwirq(int id, int node, void *arg) { + int irq; struct msi_msg msg; - struct irq_cfg *cfg = irq_cfg(irq); - native_compose_msi_msg(cfg, &msg); - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} + irq = irq_domain_alloc_irqs(NULL, 1, node, NULL); + if (irq > 0) { + irq_set_handler_data(irq, arg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, + handle_edge_irq, "edge"); + native_compose_msi_msg(irq_cfg(irq), &msg); + dmar_msi_write(irq, &msg); + } -int dmar_alloc_hwirq(void) -{ - return irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + return irq; } void dmar_free_hwirq(int 
irq) { + irq_set_handler_data(irq, NULL); + irq_set_handler(irq, NULL); irq_domain_free_irqs(irq, 1); } #endif -- cgit v1.2.3 From 0921f1da6425f05a1f56803069124b7ec13b79e2 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:42 +0800 Subject: x86/irq: Use hierarchical irqdomain to manage DMAR interrupts Enhance DMAR code to support hierarchical irqdomain, it helps to make the architecture more clear. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-21-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 153 ++++++++++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ca6250439acc..f23d17d759b6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -25,32 +25,6 @@ static struct irq_domain *msi_default_domain; -static void native_compose_msi_msg(struct irq_cfg *cfg, struct msi_msg *msg) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? 
- MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); -} - static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { struct irq_cfg *cfg = irqd_cfg(data); @@ -87,6 +61,9 @@ static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) msg->data |= MSI_DATA_VECTOR(cfg->vector); msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); } /* @@ -196,59 +173,121 @@ static int dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest, irq = data->irq; + struct irq_data *parent = data->parent_data; struct msi_msg msg; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); - - dmar_msi_write(irq, &msg); + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + dmar_msi_read(data->irq, &msg); + msi_update_msg(&msg, data); + dmar_msi_write(data->irq, &msg); + } - return IRQ_SET_MASK_OK_NOCOPY; + return ret; } -static struct irq_chip dmar_msi_type = { +static struct irq_chip dmar_msi_controller = { .name = "DMAR_MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = dmar_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int dmar_alloc_hwirq(int id, int node, void *arg) +static int dmar_domain_alloc(struct irq_domain *domain, unsigned int virq, 
+ unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_DMAR) + return -EINVAL; + if (irq_find_mapping(domain, info->dmar_id)) { + pr_warn("IRQ for DMAR%d already exists.\n", info->dmar_id); + return -EEXIST; + } + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + irq_domain_set_hwirq_and_chip(domain, virq, info->dmar_id, + &dmar_msi_controller, NULL); + irq_set_handler_data(virq, info->dmar_data); + __irq_set_handler(virq, handle_edge_irq, 0, "edge"); + } + + return ret; +} + +static void dmar_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + BUG_ON(nr_irqs > 1); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static void dmar_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - int irq; struct msi_msg msg; - irq = irq_domain_alloc_irqs(NULL, 1, node, NULL); - if (irq > 0) { - irq_set_handler_data(irq, arg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, - handle_edge_irq, "edge"); - native_compose_msi_msg(irq_cfg(irq), &msg); - dmar_msi_write(irq, &msg); + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + dmar_msi_write(irq_data->irq, &msg); +} + +static void dmar_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + dmar_msi_write(irq_data->irq, &msg); +} + +static struct irq_domain_ops dmar_domain_ops = { + .alloc = dmar_domain_alloc, + .free = dmar_domain_free, + .activate = dmar_domain_activate, + .deactivate = dmar_domain_deactivate, +}; + +static struct irq_domain *dmar_get_irq_domain(void) +{ + static struct irq_domain *dmar_domain; + static DEFINE_MUTEX(dmar_lock); + + mutex_lock(&dmar_lock); + if (dmar_domain == NULL) { + dmar_domain = irq_domain_add_tree(NULL, &dmar_domain_ops, NULL); + if (dmar_domain) + dmar_domain->parent = x86_vector_domain; } + 
mutex_unlock(&dmar_lock); + + return dmar_domain; +} + +int dmar_alloc_hwirq(int id, int node, void *arg) +{ + struct irq_domain *domain = dmar_get_irq_domain(); + struct irq_alloc_info info; + + if (!domain) + return -1; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_DMAR; + info.dmar_id = id; + info.dmar_data = arg; - return irq; + return irq_domain_alloc_irqs(domain, 1, node, &info); } void dmar_free_hwirq(int irq) { - irq_set_handler_data(irq, NULL); - irq_set_handler(irq, NULL); irq_domain_free_irqs(irq, 1); } #endif -- cgit v1.2.3 From 49e07d8f28c05347f237146a9ec66f6d958db83e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:43 +0800 Subject: x86/htirq: Use hierarchical irqdomain to manage Hypertransport interrupts We have slightly changed the architecture interfaces to support htirq PCI driver. It's safe because currently Hypertransport interrupt is only enabled on x86 platforms. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-22-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 161 +++++++++++++++++++++++++++++++----------- arch/x86/kernel/apic/vector.c | 1 + 2 files changed, 121 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index b307ee7a7148..1cae104415ea 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu + * Add support of hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,70 +21,104 @@ #include #include +static struct irq_domain *htirq_domain; + /* * Hypertransport interrupt support */ -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - static int ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; + struct irq_data *parent = data->parent_data; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - target_ht_irq(data->irq, dest, cfg->vector); - return IRQ_SET_MASK_OK_NOCOPY; + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(data); + + fetch_ht_irq_msg(data->irq, &msg); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | + HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); + write_ht_irq_msg(data->irq, &msg); + } + + return ret; } static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = ht_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_alloc_ht_irq(struct pci_dev *dev) 
+static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL); + struct ht_irq_cfg *ht_cfg; + struct irq_alloc_info *info = arg; + struct pci_dev *dev; + irq_hw_number_t hwirq; + int ret; + + if (nr_irqs > 1 || !info) + return -EINVAL; + + dev = info->ht_dev; + hwirq = (info->ht_idx & 0xFF) | + PCI_DEVID(dev->bus->number, dev->devfn) << 8 | + (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; + + ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); + if (!ht_cfg) + return -ENOMEM; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(ht_cfg); + return ret; + } + + /* Initialize msg to a value that will never match the first write. */ + ht_cfg->msg.address_lo = 0xffffffff; + ht_cfg->msg.address_hi = 0xffffffff; + ht_cfg->dev = info->ht_dev; + ht_cfg->update = info->ht_update; + ht_cfg->pos = info->ht_pos; + ht_cfg->idx = 0x10 + (info->ht_idx * 2); + irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, + handle_edge_irq, ht_cfg, "edge"); + + return 0; } -void arch_free_ht_irq(int irq) +static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - irq_domain_free_irqs(irq, 1); + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_domain_free_irqs_top(domain, virq, nr_irqs); } -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +static void htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - struct irq_cfg *cfg; struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(irq_data); - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - msg.address_lo = HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | @@ -95,13 +131,56 @@ int 
arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_MT_FIXED : HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; + write_ht_irq_msg(irq_data->irq, &msg); +} - write_ht_irq_msg(irq, &msg); +static void htirq_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + memset(&msg, 0, sizeof(msg)); + write_ht_irq_msg(irq_data->irq, &msg); +} - dev_dbg(&dev->dev, "irq %d for HT\n", irq); +static struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, +}; - return 0; +void arch_init_htirq_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; + + htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + if (!htirq_domain) + pr_warn("failed to initialize irqdomain for HTIRQ.\n"); + else + htirq_domain->parent = parent; +} + +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update) +{ + struct irq_alloc_info info; + + if (!htirq_domain) + return -ENOSYS; + + init_irq_alloc_info(&info, NULL); + info.ht_idx = idx; + info.ht_pos = pos; + info.ht_dev = dev; + info.ht_update = update; + + return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), + &info); +} + +void arch_teardown_ht_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a8d82896be75..b4b6b5a13440 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -365,6 +365,7 @@ int __init arch_early_irq_init(void) irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); + arch_init_htirq_domain(x86_vector_domain); return arch_early_ioapic_init(); } -- cgit v1.2.3 From 81dabe2e739d5e0ad8ca2369738fb84bd64f967d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 
Apr 2015 14:11:45 +0800 Subject: x86/irq: Normalize x86 irq_chip name Some irq_chip names use underscore, others use hyphen. So normalize them to use hyphen as separator. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-24-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index f23d17d759b6..d17eb6a52c84 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -188,7 +188,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, } static struct irq_chip dmar_msi_controller = { - .name = "DMAR_MSI", + .name = "DMAR-MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = irq_chip_ack_parent, @@ -319,7 +319,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, } static struct irq_chip hpet_msi_controller = { - .name = "HPET_MSI", + .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = irq_chip_ack_parent, -- cgit v1.2.3 From 68682a2687bf7dbe51309d297757a7ea6a96d312 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:46 +0800 Subject: x86/MSI: Simplify the way to deal with remapped MSI interrupts Simplify the way to deal with remapped MSI interrupts, so we can remove irq_chip.irq_print_chip later. We simply change the name when the setup detects that the parent domain is remapping. 
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-25-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d17eb6a52c84..87df03ae99ba 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -77,7 +77,6 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_print_chip = irq_remapping_print_chip, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, @@ -143,7 +142,7 @@ static struct msi_domain_ops pci_msi_domain_ops = { static struct msi_domain_info pci_msi_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + MSI_FLAG_PCI_MSIX, .ops = &pci_msi_domain_ops, .chip = &pci_msi_controller, .handler = handle_edge_irq, @@ -162,9 +161,29 @@ void arch_init_msi_domain(struct irq_domain *parent) } #ifdef CONFIG_IRQ_REMAP +static struct irq_chip pci_msi_ir_controller = { + .name = "IR-PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = pci_msi_domain_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct msi_domain_info pci_msi_ir_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | 
MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_ir_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) { - return msi_create_irq_domain(NULL, &pci_msi_domain_info, parent); + return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent); } #endif @@ -325,7 +344,6 @@ static struct irq_chip hpet_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = hpet_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_print_chip = irq_remapping_print_chip, .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -402,6 +420,8 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) parent = irq_remapping_get_ir_irq_domain(&info); if (parent == NULL) parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, (void *)(long)hpet_id); -- cgit v1.2.3 From 90d84fe95dd6b418383aa0e0e5cace8f1b1e7e30 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:47 +0800 Subject: x86/MSI: Replace msi_update_msg() with irq_chip_compose_msi_msg() Function irq_chip_compose_msi_msg() can achieve the same goal as msi_update_msg(), so remove msi_update_msg(). Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-26-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 87df03ae99ba..5b5ef5bd23f5 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -53,19 +53,6 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } -static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) -{ - struct irq_cfg *cfg = irqd_cfg(irq_data); - - msg->data &= ~MSI_DATA_VECTOR_MASK; - msg->data |= MSI_DATA_VECTOR(cfg->vector); - msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); - if (x2apic_enabled()) - msg->address_hi = MSI_ADDR_BASE_HI | - MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. 
@@ -198,8 +185,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0) { - dmar_msi_read(data->irq, &msg); - msi_update_msg(&msg, data); + irq_chip_compose_msi_msg(data, &msg); dmar_msi_write(data->irq, &msg); } @@ -329,8 +315,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - hpet_msi_read(data->handler_data, &msg); - msi_update_msg(&msg, data); + irq_chip_compose_msi_msg(data, &msg); hpet_msi_write(data->handler_data, &msg); } -- cgit v1.2.3 From 62ac1780830ed64a9a46f80a03e91de71957d670 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:48 +0800 Subject: x86/irq: Implement irq_chip.irq_write_msi_msg for MSI/DMAR/HPET irq_chips Implement irq_chip.irq_write_msi_msg for MSI/DMAR/HPET irq_chips, they will be used to replace duplicated code. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-27-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5b5ef5bd23f5..3c825867aeb5 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -192,6 +192,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + dmar_msi_write(data->irq, msg); +} + static struct irq_chip dmar_msi_controller = { .name = "DMAR-MSI", .irq_unmask = dmar_msi_unmask, @@ -200,6 +205,7 @@ static struct irq_chip dmar_msi_controller = { .irq_set_affinity = dmar_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -322,6 +328,11 @@ static int hpet_msi_set_affinity(struct irq_data *data, return ret; } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(data->handler_data, msg); +} + static struct irq_chip hpet_msi_controller = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, @@ -330,6 +341,7 @@ static struct irq_chip hpet_msi_controller = { .irq_set_affinity = hpet_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- cgit v1.2.3 From e390d895ae14ad655c6b830e62a22a81b69290ef Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:49 +0800 Subject: x86/irq: Simplify MSI/DMAR/HPET implementation by using common code Use common MSI interfaces instead of private implementations of the same functionality to 
simplify DMAR/HPET driver implementation. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-28-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 192 +++++++++++++-------------------------------- 1 file changed, 54 insertions(+), 138 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 3c825867aeb5..109584261c4e 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -62,10 +62,8 @@ static struct irq_chip pci_msi_controller = { .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, - .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -153,9 +151,7 @@ static struct irq_chip pci_msi_ir_controller = { .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -175,23 +171,6 @@ struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) #endif #ifdef CONFIG_DMAR_TABLE -static int -dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_data *parent = data->parent_data; - struct msi_msg msg; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0) { - irq_chip_compose_msi_msg(data, &msg); - dmar_msi_write(data->irq, &msg); - 
} - - return ret; -} - static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -202,67 +181,37 @@ static struct irq_chip dmar_msi_controller = { .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = dmar_msi_set_affinity, + .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static int dmar_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) +static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) { - struct irq_alloc_info *info = arg; - int ret; - - if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_DMAR) - return -EINVAL; - if (irq_find_mapping(domain, info->dmar_id)) { - pr_warn("IRQ for DMAR%d already exists.\n", info->dmar_id); - return -EEXIST; - } - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret >= 0) { - irq_domain_set_hwirq_and_chip(domain, virq, info->dmar_id, - &dmar_msi_controller, NULL); - irq_set_handler_data(virq, info->dmar_data); - __irq_set_handler(virq, handle_edge_irq, 0, "edge"); - } - - return ret; + return arg->dmar_id; } -static void dmar_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) +static int dmar_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - BUG_ON(nr_irqs > 1); - irq_domain_free_irqs_top(domain, virq, nr_irqs); -} + irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, + handle_edge_irq, arg->dmar_data, "edge"); -static void dmar_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct msi_msg msg; - - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - dmar_msi_write(irq_data->irq, 
&msg); + return 0; } -static void dmar_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct msi_msg msg; - - memset(&msg, 0, sizeof(msg)); - dmar_msi_write(irq_data->irq, &msg); -} +static struct msi_domain_ops dmar_msi_domain_ops = { + .get_hwirq = dmar_msi_get_hwirq, + .msi_init = dmar_msi_init, +}; -static struct irq_domain_ops dmar_domain_ops = { - .alloc = dmar_domain_alloc, - .free = dmar_domain_free, - .activate = dmar_domain_activate, - .deactivate = dmar_domain_deactivate, +static struct msi_domain_info dmar_msi_domain_info = { + .ops = &dmar_msi_domain_ops, + .chip = &dmar_msi_controller, }; static struct irq_domain *dmar_get_irq_domain(void) @@ -271,11 +220,9 @@ static struct irq_domain *dmar_get_irq_domain(void) static DEFINE_MUTEX(dmar_lock); mutex_lock(&dmar_lock); - if (dmar_domain == NULL) { - dmar_domain = irq_domain_add_tree(NULL, &dmar_domain_ops, NULL); - if (dmar_domain) - dmar_domain->parent = x86_vector_domain; - } + if (dmar_domain == NULL) + dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info, + x86_vector_domain); mutex_unlock(&dmar_lock); return dmar_domain; @@ -309,23 +256,9 @@ void dmar_free_hwirq(int irq) #ifdef CONFIG_HPET_TIMER static inline int hpet_dev_id(struct irq_domain *domain) { - return (int)(long)domain->host_data; -} + struct msi_domain_info *info = msi_get_domain_info(domain); -static int hpet_msi_set_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) -{ - struct irq_data *parent = data->parent_data; - struct msi_msg msg; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - irq_chip_compose_msi_msg(data, &msg); - hpet_msi_write(data->handler_data, &msg); - } - - return ret; + return (int)(long)info->data; } static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) @@ -338,79 +271,63 @@ static struct irq_chip hpet_msi_controller = { .irq_unmask = hpet_msi_unmask, 
.irq_mask = hpet_msi_mask, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = hpet_msi_set_affinity, + .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) -{ - struct irq_alloc_info *info = arg; - int ret; - - if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_HPET) - return -EINVAL; - if (irq_find_mapping(domain, info->hpet_index)) { - pr_warn("IRQ for HPET%d already exists.\n", info->hpet_index); - return -EEXIST; - } - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret >= 0) { - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_hwirq_and_chip(domain, virq, info->hpet_index, - &hpet_msi_controller, NULL); - irq_set_handler_data(virq, info->hpet_data); - __irq_set_handler(virq, handle_edge_irq, 0, "edge"); - } - - return ret; -} - -static void hpet_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) +static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) { - BUG_ON(nr_irqs > 1); - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_free_irqs_top(domain, virq, nr_irqs); + return arg->hpet_index; } -static void hpet_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - struct msi_msg msg; + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, + handle_edge_irq, arg->hpet_data, "edge"); - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); + return 0; } -static void 
hpet_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) { - struct msi_msg msg; - - memset(&msg, 0, sizeof(msg)); - hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); } -static struct irq_domain_ops hpet_domain_ops = { - .alloc = hpet_domain_alloc, - .free = hpet_domain_free, - .activate = hpet_domain_activate, - .deactivate = hpet_domain_deactivate, +static struct msi_domain_ops hpet_msi_domain_ops = { + .get_hwirq = hpet_msi_get_hwirq, + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, }; struct irq_domain *hpet_create_irq_domain(int hpet_id) { struct irq_domain *parent; struct irq_alloc_info info; + struct msi_domain_info *domain_info; if (x86_vector_domain == NULL) return NULL; + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; info.hpet_id = hpet_id; @@ -420,8 +337,7 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) else hpet_msi_controller.name = "IR-HPET-MSI"; - return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, - (void *)(long)hpet_id); + return msi_create_irq_domain(NULL, domain_info, parent); } int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, -- cgit v1.2.3 From 6648d1b42c349d748839d7bad91cc8a65c73e262 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Apr 2015 14:11:51 +0800 Subject: x86/intel-mid: Delay initialization of APB timer MID has no PIC, but depending on the platform it requires the abt_timer, which is connected to irq0. The timer is set up at late_time_init(). 
But, looking at the MID code it seems that there is no reason to do so. The only code which might need the timer working is the TSC calibration code, but that's a non-issue on MID as that is using its own empty calibration function. And check_timer() is not invoked either because MID has no PIC and therefore no legacy irqs. So if you look at intel_mid_time_init() then you'll see that in the ARAT case the timer setup is skipped already. So until the point where x86_init.timers.setup_percpu_clockev() is called for the boot cpu nothing really needs a timer on MID. According to the MID code the apbt horror is only used for moorestown. Medfield and later use the local apic timer without the apbt nonsense. The best thing we can do is to drop moorestown support and get rid of that apbt nonsense altogether. I don't think anyone deeply cares about it not being supported from 3.18 on. The number of devices which sport a moorestown should be pretty limited and the only relevant use case of those is to act as a pocket heater with short battery lifetime. It's pretty pointless to update kernels on pocket heaters except for bragging reasons. If someone at Intel really thinks that we need to keep moorestown alive for other than documentary and sentimental reasons, then we can move the apbt setup to x86_init.timers.setup_percpu_clockev(). At that point the IOAPIC is set up already, so it should just work. Signed-off-by: Thomas Gleixner Tested-by: Andy Shevchenko Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Kuppuswamy Sathyanarayanan Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Rickard Strandqvist Link: http://lkml.kernel.org/r/1428905519-23704-30-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apb_timer.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 6a7c23ff21d3..ede92c3364d3 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void) static void apbt_setup_irq(struct apbt_dev *adev) { - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); } -- cgit v1.2.3 From 4e69d7eab4c24aa88fb0ec99fad7feac254d9ece Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:53 +0800 Subject: x86/irq: Remove unused pre_init_apic_IRQ0() Now there's no user of pre_init_apic_IRQ0(), so remove it. Signed-off-by: Jiang Liu Tested-by: Andy Shevchenko Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Jan Beulich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-32-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 56d532106ef3..540598c77e55 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3091,20 +3091,3 @@ int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) return ret; } - -/* Enable IOAPIC early just for system timer */ -void __init pre_init_apic_IRQ0(void) -{ - struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; - - printk(KERN_INFO "Early APIC setup for system timer0\n"); -#ifndef CONFIG_SMP - physid_set_mask_of_physid(boot_cpu_physical_apicid, - &phys_cpu_present_map); -#endif - setup_local_APIC(); - - io_apic_setup_irq_pin(0, 0, &attr); - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); -} -- cgit v1.2.3 From c4d05a2c354b15965c9b2a5f46016a5d9f43e224 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:54 +0800 Subject: x86/irq: Prepare IOAPIC interfaces to support hierarchical irqdomains Introduce helper functions to manipulate struct irq_alloc_info for IOAPIC. Also add an extra parameter to IOAPIC interfaces to prepare for hierarchical irqdomain. Function mp_set_gsi_attr() will be removed once we have switched to hierarchical irqdomains. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Jan Beulich Cc: Grant Likely Cc: David Cohen Link: http://lkml.kernel.org/r/1428905519-23704-33-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 6 ++++-- arch/x86/kernel/apic/io_apic.c | 39 ++++++++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 803b684676ff..a43a4d3c60e1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -404,6 +404,7 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { int irq, node; + struct irq_alloc_info info; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -416,7 +417,8 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, return -1; } - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); + ioapic_set_alloc_attr(&info, node, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) return irq; @@ -434,7 +436,7 @@ static void mp_unregister_gsi(u32 gsi) if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return; - irq = mp_map_gsi_to_irq(gsi, 0); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); if (irq > 0) mp_unmap_irq(irq); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 540598c77e55..5c953bb96ecf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -938,7 +938,19 @@ static int irq_trigger(int idx) return trigger; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) +void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, + int trigger, int polarity) +{ + init_irq_alloc_info(info, NULL); + info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info->ioapic_node = node; + info->ioapic_trigger = trigger; + info->ioapic_polarity = polarity; 
+ info->ioapic_valid = 1; +} + +static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, + struct irq_alloc_info *info) { int irq = -1; int ioapic = (int)(long)domain->host_data; @@ -971,11 +983,11 @@ static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, - unsigned int flags) + unsigned int flags, struct irq_alloc_info *info) { int irq; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *info = mp_pin_info(ioapic, pin); + struct mp_pin_info *pinfo = mp_pin_info(ioapic, pin); if (!domain) return -1; @@ -997,30 +1009,30 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; if (flags & IOAPIC_MAP_ALLOC) { - if (info->count == 0 && + if (pinfo->count == 0 && mp_irqdomain_map(domain, irq, pin) != 0) irq = -1; /* special handling for timer IRQ0 */ if (irq == 0) - info->count++; + pinfo->count++; } } else { irq = irq_find_mapping(domain, pin); if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin); + irq = alloc_irq_from_domain(domain, gsi, pin, info); } if (flags & IOAPIC_MAP_ALLOC) { /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && info->count == 1 && + if (irq < nr_legacy_irqs() && pinfo->count == 1 && mp_irqdomain_map(domain, irq, pin) != 0) irq = -1; if (irq > 0) - info->count++; - else if (info->count == 0) - info->set = 0; + pinfo->count++; + else if (pinfo->count == 0) + pinfo->set = 0; } mutex_unlock(&ioapic_mutex); @@ -1058,10 +1070,11 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) } #endif - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct 
irq_alloc_info *info) { int ioapic, pin, idx; @@ -1074,7 +1087,7 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) if ((flags & IOAPIC_MAP_CHECK) && idx < 0) return -1; - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } void mp_unmap_irq(int irq) -- cgit v1.2.3 From 49c7e60022912d10da88ba67e8eb2927f1143f6a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:55 +0800 Subject: x86/irq: Implement callbacks to enable hierarchical irqdomains on IOAPICs Implement required callbacks to prepare for enabling hierarchical irqdomains on IOAPICs. After the conversion we can remove quite some code from the old implementation. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Jan Beulich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-34-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 159 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 156 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c953bb96ecf..3406dbec1570 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -78,6 +78,13 @@ static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; +struct mp_chip_data { + struct IO_APIC_route_entry entry; + int trigger; + int polarity; + bool isa_irq; +}; + struct mp_pin_info { int trigger; int polarity; @@ -949,11 +956,28 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, info->ioapic_valid = 1; } +static void mp_register_handler(unsigned int 
irq, unsigned long trigger) +{ + irq_flow_handler_t hdl; + bool fasteoi; + + if (trigger) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); +} + static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, struct irq_alloc_info *info) { int irq = -1; - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { @@ -3029,7 +3053,7 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); struct io_apic_irq_attr attr; @@ -3067,7 +3091,7 @@ void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) { struct irq_data *data = irq_get_irq_data(virq); struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); int pin = (int)data->hwirq; ioapic_mask_entry(ioapic, pin); @@ -3076,6 +3100,130 @@ void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) arch_teardown_hwirq(virq); } +static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->ioapic_valid) { + data->trigger = info->ioapic_trigger; + data->polarity = info->ioapic_polarity; + } else if (acpi_get_override_irq(gsi, &data->trigger, + &data->polarity) < 0) { + /* PCI interrupts are always polarity one level triggered. 
*/ + data->trigger = 1; + data->polarity = 1; + } +} + +static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, + struct IO_APIC_route_entry *entry) +{ + memset(entry, 0, sizeof(*entry)); + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->mask = 0; /* enable IRQ */ + entry->trigger = data->trigger; + entry->polarity = data->polarity; + /* + * Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (data->trigger) + entry->mask = 1; +} + +int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + int ret, ioapic, pin; + struct irq_cfg *cfg; + struct irq_data *irq_data; + struct mp_chip_data *data; + struct irq_alloc_info *info = arg; + + if (!info || nr_irqs > 1) + return -EINVAL; + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) + return -EINVAL; + + ioapic = mp_irqdomain_ioapic_idx(domain); + pin = info->ioapic_pin; + if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + return -EEXIST; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + info->ioapic_entry = &data->entry; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(data); + return ret; + } + + irq_data->hwirq = info->ioapic_pin; + irq_data->chip = &ioapic_chip; + irq_data->chip_data = data; + mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); + + cfg = irqd_cfg(irq_data); + add_pin_to_irq_node(cfg, info->ioapic_node, ioapic, pin); + if (info->ioapic_entry) + mp_setup_entry(cfg, data, info->ioapic_entry); + mp_register_handler(virq, data->trigger); + if (virq < nr_legacy_irqs()) + legacy_pic->mask(virq); + + apic_printk(APIC_VERBOSE, KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", + ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, + virq, 
data->trigger, data->polarity, cfg->dest_apicid); + + return 0; +} + +void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_cfg *cfg = irq_cfg(virq); + struct irq_data *irq_data; + + BUG_ON(nr_irqs != 1); + irq_data = irq_domain_get_irq_data(domain, virq); + if (irq_data && irq_data->chip_data) { + __remove_pin_from_irq(cfg, mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); + WARN_ON(!list_empty(&cfg->irq_2_pin)); + kfree(irq_data->chip_data); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + unsigned long flags; + struct irq_pin_list *entry; + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, data->entry); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + /* It won't be called for IRQ with multiple IOAPIC pins associated */ + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); +} + int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) { int ret = 0; @@ -3104,3 +3252,8 @@ int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) return ret; } + +int mp_irqdomain_ioapic_idx(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} -- cgit v1.2.3 From 133153205b263ea9ce4e771876ede544f896e034 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:56 +0800 Subject: x86/irq: Refine the way to allocate irq_cfg for legacy IRQs To support legacy ISA IRQs, we need to preallocate irq_cfg structures for legacy ISA IRQs. Refine the way to allocate irq_cfg for legacy ISA IRQs, so it's more friendly for the hierarchical irqdomain implementation. 
Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-35-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 13 +------------ arch/x86/kernel/apic/vector.c | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3406dbec1570..16d4ba3ac844 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -254,8 +254,7 @@ static void free_ioapic_saved_registers(int idx) int __init arch_early_ioapic_init(void) { - struct irq_cfg *cfg; - int i, node = cpu_to_node(0); + int i; if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; @@ -263,16 +262,6 @@ int __init arch_early_ioapic_init(void) for_each_ioapic(i) alloc_ioapic_saved_registers(i); - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. 
- */ - for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - } - return 0; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b4b6b5a13440..633f03268d48 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -24,6 +24,9 @@ struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); static struct irq_chip lapic_controller; +#ifdef CONFIG_X86_IO_APIC +static struct irq_cfg *legacy_irq_cfgs[NR_IRQS_LEGACY]; +#endif void lock_vector_lock(void) { @@ -283,6 +286,10 @@ static void x86_vector_free_irqs(struct irq_domain *domain, free_remapped_irq(virq); clear_irq_vector(virq + i, irq_data->chip_data); free_irq_cfg(irq_data->chip_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs()) + legacy_irq_cfgs[virq + i] = NULL; +#endif irq_domain_reset_irq_data(irq_data); } } @@ -308,7 +315,12 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); - cfg = alloc_irq_cfg(irq_data->node); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs() && legacy_irq_cfgs[virq + i]) + cfg = legacy_irq_cfgs[virq + i]; + else +#endif + cfg = alloc_irq_cfg(irq_data->node); if (!cfg) { err = -ENOMEM; goto error; @@ -357,8 +369,36 @@ int __init arch_probe_nr_irqs(void) return nr_legacy_irqs(); } +#ifdef CONFIG_X86_IO_APIC +static void init_legacy_irqs(void) +{ + int i, node = cpu_to_node(0); + struct irq_cfg *cfg; + + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + cfg = legacy_irq_cfgs[i] = alloc_irq_cfg(node); + BUG_ON(!cfg); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. 
+ */ + cfg->vector = IRQ0_VECTOR + i; + cpumask_setall(cfg->domain); + irq_set_chip_data(i, cfg); + } +} +#else +static void init_legacy_irqs(void) { } +#endif + int __init arch_early_irq_init(void) { + init_legacy_irqs(); + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); -- cgit v1.2.3 From a44174ee7b380012cdb63d563617f67bb7757649 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:57 +0800 Subject: x86/irq: Simplify the way to print IOAPIC entry Simplify the way to print IOAPIC entry content, so we can remove native_io_apic_print_entries(), intel_ir_io_apic_print_entries() and x86_io_apic_ops.print_entries() later. Folded a patch from Thomas to fix errors in printed pin attributes, http://www.spinics.net/lists/linux-tip-commits/msg26108.html Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-36-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 16d4ba3ac844..3c6609617306 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1424,6 +1424,33 @@ void ioapic_zap_locks(void) raw_spin_lock_init(&ioapic_lock); } +static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) +{ + int i; + char buf[256]; + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + + printk(KERN_DEBUG "IOAPIC %d:\n", apic); + for (i = 0; i <= nr_entries; i++) { + entry = ioapic_read_entry(apic, i); + snprintf(buf, sizeof(buf), + " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, entry.mask ? "disabled" : "enabled ", + entry.trigger ? "level" : "edge ", + entry.polarity ? "low " : "high", + entry.vector, entry.irr, entry.delivery_status); + if (ir_entry->format) + printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", + buf, (ir_entry->index << 15) | ir_entry->index, + ir_entry->zero); + else + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", + buf, entry.dest_mode ? "logical " : "physical", + entry.dest, entry.delivery_mode); + } +} + static void __init print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -1477,8 +1504,7 @@ static void __init print_IO_APIC(int ioapic_idx) } printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); - - x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); + io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } void __init print_IO_APICs(void) -- cgit v1.2.3 From 96ed44b2d5e0e9d6e5b135e84ea5c8cd763ce861 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:58 +0800 Subject: x86/irq: Introduce helper functions to support hierarchical irqdomains for IOAPIC Introduce several helper functions, which will be used to enable hierarchical irqdomain for IOAPIC. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-37-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c6609617306..c8f786b5b91c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -82,6 +82,7 @@ struct mp_chip_data { struct IO_APIC_route_entry entry; int trigger; int polarity; + u32 count; bool isa_irq; }; @@ -945,6 +946,46 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, info->ioapic_valid = 1; } +#ifndef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); +#endif + +static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, + struct irq_alloc_info *src, + u32 gsi, int ioapic_idx, int pin) +{ + int trigger, polarity; + + copy_irq_alloc_info(dst, src); + dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + dst->ioapic_id = mpc_ioapic_id(ioapic_idx); + dst->ioapic_pin = pin; + dst->ioapic_valid = 1; + if 
(src && src->ioapic_valid) { + dst->ioapic_node = src->ioapic_node; + dst->ioapic_trigger = src->ioapic_trigger; + dst->ioapic_polarity = src->ioapic_polarity; + } else { + dst->ioapic_node = NUMA_NO_NODE; + if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { + dst->ioapic_trigger = trigger; + dst->ioapic_polarity = polarity; + } else { + /* + * PCI interrupts are always polarity one level + * triggered. + */ + dst->ioapic_trigger = 1; + dst->ioapic_polarity = 1; + } + } +} + +static int ioapic_alloc_attr_node(struct irq_alloc_info *info) +{ + return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; +} + static void mp_register_handler(unsigned int irq, unsigned long trigger) { irq_flow_handler_t hdl; @@ -962,6 +1003,26 @@ static void mp_register_handler(unsigned int irq, unsigned long trigger) __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); } +static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) +{ + struct mp_chip_data *data = irq_get_chip_data(irq); + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger + * and polarity attributes. So allow the first user to reprogram the + * pin with real trigger and polarity attributes. + */ + if (irq < nr_legacy_irqs() && data->count == 1) { + if (info->ioapic_trigger != data->trigger) + mp_register_handler(irq, data->trigger); + data->entry.trigger = data->trigger = info->ioapic_trigger; + data->entry.polarity = data->polarity = info->ioapic_polarity; + } + + return data->trigger == info->ioapic_trigger && + data->polarity == info->ioapic_polarity; +} + static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, struct irq_alloc_info *info) { -- cgit v1.2.3 From d32932d02e1869be838cea3ace42467c360db377 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:59 +0800 Subject: x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces Convert IOAPIC driver to support and use hierarchical irqdomain interfaces.
It's a little big, but would break bisecting if we split it into multiple patches. Fold in a patch from Andy Shevchenko to make it bisectable. http://lkml.org/lkml/2014/12/10/622 Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Andy Shevchenko Cc: sfi-devel@simplefirmware.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Grant Likely Cc: Rob Herring Cc: David Rientjes Cc: David Cohen Link: http://lkml.kernel.org/r/1428905519-23704-38-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 11 +- arch/x86/kernel/apic/io_apic.c | 308 ++++++++++++++++++++++++++++------------- arch/x86/kernel/devicetree.c | 37 ++--- arch/x86/kernel/mpparse.c | 6 +- 4 files changed, 234 insertions(+), 128 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a43a4d3c60e1..21e460b3b360 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -412,11 +412,6 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; node = dev ? 
dev_to_node(dev) : NUMA_NO_NODE; - if (mp_set_gsi_attr(gsi, trigger, polarity, node)) { - pr_warn("Failed to set pin attr for GSI%d\n", gsi); - return -1; - } - ioapic_set_alloc_attr(&info, node, trigger, polarity); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) @@ -442,8 +437,10 @@ static void mp_unregister_gsi(u32 gsi) } static struct irq_domain_ops acpi_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static int __init diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c8f786b5b91c..ba50f8d6f6b0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -142,6 +142,11 @@ u32 mp_pin_to_gsi(int ioapic, int pin) return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } +static inline bool mp_is_legacy_irq(int irq) +{ + return irq >= 0 && irq < nr_legacy_irqs(); +} + /* * Initialize all legacy IRQs and all pins on the first IOAPIC * if we have legacy interrupt controller. 
Kernel boot option "pirq=" @@ -152,7 +157,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) if (!nr_legacy_irqs()) return 0; - return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs()); + return ioapic == 0 || mp_is_legacy_irq(irq); } static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) @@ -231,7 +236,7 @@ struct irq_pin_list { static struct irq_pin_list *alloc_irq_pin_list(int node) { - return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); + return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); } static void alloc_ioapic_saved_registers(int idx) @@ -560,6 +565,17 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) } } +void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) +{ + unsigned long flags; + struct irq_pin_list *entry; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + native_eoi_ioapic_pin(entry->apic, entry->pin, vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; @@ -603,9 +619,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); - x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector); + native_eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1023,95 +1038,121 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) data->polarity == info->ioapic_polarity; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, +static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, struct irq_alloc_info *info) { + bool legacy = false; int irq = -1; - int ioapic = mp_irqdomain_ioapic_idx(domain); int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: /* - * Dynamically allocate IRQ 
number for non-ISA IRQs in the first 16 - * GSIs on some weird platforms. */ - if (gsi < nr_legacy_irqs()) - irq = irq_create_mapping(domain, pin); - else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + * Dynamically allocate IRQ number for non-ISA IRQs in the first + * 16 GSIs on some weird platforms. */ + if (!ioapic_initialized || gsi >= nr_legacy_irqs()) irq = gsi; + legacy = mp_is_legacy_irq(irq); break; case IOAPIC_DOMAIN_STRICT: - if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) - irq = gsi; + irq = gsi; break; case IOAPIC_DOMAIN_DYNAMIC: - irq = irq_create_mapping(domain, pin); break; default: WARN(1, "ioapic: unknown irqdomain type %d\n", type); - break; + return -1; + } + + return __irq_domain_alloc_irqs(domain, irq, 1, + ioapic_alloc_attr_node(info), + info, legacy); +} + +/* + * Need special handling for ISA IRQs because there may be multiple IOAPIC pins + * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping + * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are + * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and + * some BIOSes may use MP Interrupt Source records to override IRQ numbers for + * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be + * multiple pins sharing the same legacy IRQ number when ACPI is disabled. + */ +static int alloc_isa_irq_from_domain(struct irq_domain *domain, + int irq, int ioapic, int pin, + struct irq_alloc_info *info) +{ + struct mp_chip_data *data; + struct irq_data *irq_data = irq_get_irq_data(irq); + int node = ioapic_alloc_attr_node(info); + + /* + * Legacy ISA IRQ has already been allocated, just add pin to + * the pin list associated with this IRQ and program the IOAPIC + * entry.
The IOAPIC entry + */ + if (irq_data && irq_data->parent_data) { + struct irq_cfg *cfg = irqd_cfg(irq_data); + + if (!mp_check_pin_attr(irq, info)) + return -EBUSY; + if (__add_pin_to_irq_node(cfg, node, ioapic, info->ioapic_pin)) + return -ENOMEM; + } else { + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + if (irq >= 0) { + irq_data = irq_domain_get_irq_data(domain, irq); + data = irq_data->chip_data; + data->isa_irq = true; + } } - return irq > 0 ? irq : -1; + return irq; } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, unsigned int flags, struct irq_alloc_info *info) { int irq; + bool legacy = false; + struct irq_alloc_info tmp; + struct mp_chip_data *data; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *pinfo = mp_pin_info(ioapic, pin); if (!domain) - return -1; + return -ENOSYS; - mutex_lock(&ioapic_mutex); - - /* - * Don't use irqdomain to manage ISA IRQs because there may be - * multiple IOAPIC pins sharing the same ISA IRQ number and - * irqdomain only supports 1:1 mapping between IOAPIC pin and - * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used - * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). - * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are - * available, and some BIOSes may use MP Interrupt Source records - * to override IRQ numbers for PIRQs instead of reprogramming - * the interrupt routing logic. Thus there may be multiple pins - * sharing the same legacy IRQ number when ACPI is disabled. 
- */ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; - if (flags & IOAPIC_MAP_ALLOC) { - if (pinfo->count == 0 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; + legacy = mp_is_legacy_irq(irq); + } - /* special handling for timer IRQ0 */ + mutex_lock(&ioapic_mutex); + if (!(flags & IOAPIC_MAP_ALLOC)) { + if (!legacy) { + irq = irq_find_mapping(domain, pin); if (irq == 0) - pinfo->count++; + irq = -ENOENT; } } else { - irq = irq_find_mapping(domain, pin); - if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin, info); - } - - if (flags & IOAPIC_MAP_ALLOC) { - /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && pinfo->count == 1 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; - - if (irq > 0) - pinfo->count++; - else if (pinfo->count == 0) - pinfo->set = 0; + ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin); + if (legacy) + irq = alloc_isa_irq_from_domain(domain, irq, + ioapic, pin, &tmp); + else if ((irq = irq_find_mapping(domain, pin)) == 0) + irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp); + else if (!mp_check_pin_attr(irq, &tmp)) + irq = -EBUSY; + if (irq >= 0) { + data = irq_get_chip_data(irq); + data->count++; + } } - mutex_unlock(&ioapic_mutex); - return irq > 0 ? 
irq : -1; + return irq; } static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) @@ -1166,26 +1207,19 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, void mp_unmap_irq(int irq) { - struct irq_data *data = irq_get_irq_data(irq); - struct mp_pin_info *info; - int ioapic, pin; + struct irq_data *irq_data = irq_get_irq_data(irq); + struct mp_chip_data *data; - if (!data || !data->domain) + if (!irq_data || !irq_data->domain) return; - ioapic = (int)(long)data->domain->host_data; - pin = (int)data->hwirq; - info = mp_pin_info(ioapic, pin); + data = irq_data->chip_data; + if (!data || data->isa_irq) + return; mutex_lock(&ioapic_mutex); - if (--info->count == 0) { - info->set = 0; - if (irq < nr_legacy_irqs() && - ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY) - mp_irqdomain_unmap(data->domain, irq); - else - irq_dispose_mapping(irq); - } + if (--data->count == 0) + irq_domain_free_irqs(irq, 1); mutex_unlock(&ioapic_mutex); } @@ -1252,7 +1286,7 @@ out: } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -static struct irq_chip ioapic_chip; +static struct irq_chip ioapic_chip, ioapic_ir_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) @@ -1595,7 +1629,7 @@ void __init print_IO_APICs(void) struct irq_pin_list *entry; chip = irq_get_chip(irq); - if (chip != &ioapic_chip) + if (chip != &ioapic_chip && chip != &ioapic_ir_chip) continue; cfg = irq_cfg(irq); @@ -2057,12 +2091,12 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, } #endif -static void ack_ioapic_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *data) { struct irq_cfg *cfg = irqd_cfg(data); - int i, irq = data->irq; unsigned long v; bool masked; + int i; irq_complete_move(cfg); masked = ioapic_irqd_mask(data, cfg); @@ -2117,22 +2151,70 @@ static void ack_ioapic_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - - eoi_ioapic_irq(irq, cfg); + eoi_ioapic_pin(cfg->vector, cfg); } 
ioapic_irqd_unmask(data, cfg, masked); } +static void ioapic_ir_ack_level(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + ack_APIC_irq(); + eoi_ioapic_pin(data->entry.vector, irqd_cfg(irq_data)); +} + +static int ioapic_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct mp_chip_data *data = irq_data->chip_data; + unsigned int dest, irq = irq_data->irq; + struct irq_cfg *cfg; + unsigned long flags; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + cfg = irqd_cfg(irq_data); + data->entry.dest = cfg->dest_apicid; + data->entry.vector = cfg->vector; + /* Only the high 8 bits are valid. 
*/ + dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); + __target_IO_APIC_irq(irq, dest, cfg); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .irq_startup = startup_ioapic_irq, .irq_mask = mask_ioapic_irq, .irq_unmask = unmask_ioapic_irq, - .irq_ack = apic_ack_edge, - .irq_eoi = ack_ioapic_level, - .irq_set_affinity = native_ioapic_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct irq_chip ioapic_ir_chip __read_mostly = { + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ir_ack_level, + .irq_set_affinity = ioapic_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2265,6 +2347,24 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); +static int mp_alloc_timer_irq(int ioapic, int pin) +{ + int irq = -1; + struct irq_alloc_info info; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (domain) { + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); + info.ioapic_id = mpc_ioapic_id(ioapic); + info.ioapic_pin = pin; + mutex_lock(&ioapic_mutex); + irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); + mutex_unlock(&ioapic_mutex); + } + + return irq; +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. 
Fortunately only the timer IRQ @@ -2287,7 +2387,6 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ legacy_pic->mask(0); - assign_irq_vector(0, cfg, apic->target_cpus()); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2328,15 +2427,12 @@ static inline void __init check_timer(void) } if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ + /* Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_node(cfg, node, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + mp_alloc_timer_irq(apic1, pin1); } else { - /* for edge trigger, setup_ioapic_irq already - * leave it unmasked. + /* + * for edge trigger, it's already unmasked, * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ @@ -2345,6 +2441,7 @@ static inline void __init check_timer(void) if (idx != -1 && irq_trigger(idx)) unmask_ioapic(cfg); } + irq_domain_activate_irq(irq_get_irq_data(0)); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2365,7 +2462,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + irq_domain_activate_irq(irq_get_irq_data(0)); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... 
works.\n"); @@ -2443,6 +2540,8 @@ out: static int mp_irqdomain_create(int ioapic) { size_t size; + struct irq_alloc_info info; + struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; @@ -2456,9 +2555,18 @@ static int mp_irqdomain_create(int ioapic) if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info.ioapic_id = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_ir_irq_domain(&info); + if (!parent) + parent = x86_vector_domain; + ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if(!ip->irqdomain) { + if (ip->irqdomain) { + ip->irqdomain->parent = parent; + } else { kfree(ip->pin_info); ip->pin_info = NULL; return -ENOMEM; @@ -3072,7 +3180,6 @@ int mp_unregister_ioapic(u32 gsi_base) { int ioapic, pin; int found = 0; - struct mp_pin_info *pin_info; for_each_ioapic(ioapic) if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { @@ -3085,11 +3192,17 @@ int mp_unregister_ioapic(u32 gsi_base) } for_each_pin(ioapic, pin) { - pin_info = mp_pin_info(ioapic, pin); - if (pin_info->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); - return -EBUSY; + u32 gsi = mp_pin_to_gsi(ioapic, pin); + int irq = mp_map_gsi_to_irq(gsi, 0, NULL); + struct mp_chip_data *data; + + if (irq >= 0) { + data = irq_get_chip_data(irq); + if (data && data->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } } } @@ -3241,7 +3354,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, } irq_data->hwirq = info->ioapic_pin; - irq_data->chip = &ioapic_chip; + irq_data->chip = (domain->parent == x86_vector_domain) ? 
+ &ioapic_chip : &ioapic_ir_chip; irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 6367a780cc8c..05103d398ed7 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *domain, - struct device_node *controller, - const u32 *intspec, u32 intsize, - irq_hw_number_t *out_hwirq, u32 *out_type) +static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { + struct of_phandle_args *irq_data = (void *)arg; struct of_ioapic_type *it; - u32 line, idx, gsi; + struct irq_alloc_info tmp; - if (WARN_ON(intsize < 2)) + if (WARN_ON(irq_data->args_count < 2)) return -EINVAL; - - line = intspec[0]; - - if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) + if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = &of_ioapic_type[intspec[1]]; + it = &of_ioapic_type[irq_data->args[1]]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic_pin = irq_data->args[0]; - idx = (u32)(long)domain->host_data; - gsi = mp_pin_to_gsi(idx, line); - if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0))) - return -EBUSY; - - *out_hwirq = line; - *out_type = it->out_type; - return 0; + return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } const struct irq_domain_ops ioapic_irq_domain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, - .xlate = ioapic_xlate, + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 2d2a237f2c73..aa4feee74dbe 
100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -114,8 +114,10 @@ static void __init MP_bus_info(struct mpc_bus *m) } static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init MP_ioapic_info(struct mpc_ioapic *m) -- cgit v1.2.3 From 5ad274d41c1b3f3ccf73591078efaa8ed6828a8d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:38 +0800 Subject: x86/irq: Remove unused old IOAPIC irqdomain interfaces Now we have converted to hierarchical irqdomain, so remove unused old IOAPIC interfaces and code. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 202 +---------------------------------------- 1 file changed, 1 insertion(+), 201 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ba50f8d6f6b0..523b326d71c2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -89,7 +89,6 @@ struct mp_chip_data { struct mp_pin_info { int trigger; int polarity; - int node; int set; u32 count; }; @@ -1310,30 +1309,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, - unsigned long trigger) -{ - struct irq_chip *chip = &ioapic_chip; - irq_flow_handler_t hdl; - bool fasteoi; - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_set_status_flags(irq, IRQ_LEVEL); - fasteoi = true; - } else { - irq_clear_status_flags(irq, IRQ_LEVEL); - fasteoi = false; - } - - if (setup_remapped_irq(irq, cfg, chip)) - fasteoi = trigger != 0; - - hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; - irq_set_chip_and_handler_name(irq, chip, hdl, - fasteoi ? 
"fasteoi" : "edge"); -} - int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr) @@ -1358,48 +1333,6 @@ int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, return 0; } -static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, - struct io_apic_irq_attr *attr) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - if (!IO_APIC_IRQ(irq)) - return; - - if (assign_irq_vector(irq, cfg, apic->target_cpus())) - return; - - if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), - &dest)) { - pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i Dest:%d)\n", - attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, - cfg->vector, irq, attr->trigger, attr->polarity, dest); - - if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - ioapic_register_intr(irq, cfg, attr->trigger); - if (irq < nr_legacy_irqs()) - legacy_pic->mask(irq); - - ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; @@ -1419,46 +1352,6 @@ static void __init setup_IO_APIC_irqs(void) } } -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. 
- */ - if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), - apic->target_cpus(), &dest))) - dest = BAD_APICID; - - entry.dest_mode = apic->irq_dest_mode; - entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = dest; - entry.delivery_mode = apic->irq_delivery_mode; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_idx, pin, entry); -} - void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { int i; @@ -2669,20 +2562,6 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); -static int -io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) -{ - struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); - int ret; - - if (!cfg) - return -EINVAL; - ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); - if (!ret) - setup_ioapic_irq(irq, cfg, attr); - return ret; -} - static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3239,58 +3118,8 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, irq_attr->polarity = polarity; } -int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) -{ - int ioapic = mp_irqdomain_ioapic_idx(domain); - struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); - struct io_apic_irq_attr attr; - - /* Get default attribute if not set by caller yet */ - if (!info->set) { - u32 gsi = mp_pin_to_gsi(ioapic, hwirq); - - if (acpi_get_override_irq(gsi, &info->trigger, - &info->polarity) < 0) { - /* - * PCI interrupts are always polarity one level - * triggered. 
- */ - info->trigger = 1; - info->polarity = 1; - } - info->node = NUMA_NO_NODE; - - /* - * setup_IO_APIC_irqs() programs all legacy IRQs with default - * trigger and polarity attributes. Don't set the flag for that - * case so the first legacy IRQ user could reprogram the pin - * with real trigger and polarity attributes. - */ - if (virq >= nr_legacy_irqs() || info->count) - info->set = 1; - } - set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger, - info->polarity); - - return io_apic_setup_irq_pin(virq, info->node, &attr); -} - -void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) -{ - struct irq_data *data = irq_get_irq_data(virq); - struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = mp_irqdomain_ioapic_idx(domain); - int pin = (int)data->hwirq; - - ioapic_mask_entry(ioapic, pin); - __remove_pin_from_irq(cfg, ioapic, pin); - WARN_ON(!list_empty(&cfg->irq_2_pin)); - arch_teardown_hwirq(virq); -} - static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, - struct irq_alloc_info *info) + struct irq_alloc_info *info) { if (info && info->ioapic_valid) { data->trigger = info->ioapic_trigger; @@ -3414,35 +3243,6 @@ void mp_irqdomain_deactivate(struct irq_domain *domain, (int)irq_data->hwirq); } -int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) -{ - int ret = 0; - int ioapic, pin; - struct mp_pin_info *info; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -ENODEV; - - pin = mp_find_ioapic_pin(ioapic, gsi); - info = mp_pin_info(ioapic, pin); - trigger = trigger ? 1 : 0; - polarity = polarity ? 
1 : 0; - - mutex_lock(&ioapic_mutex); - if (!info->set) { - info->trigger = trigger; - info->polarity = polarity; - info->node = node; - info->set = 1; - } else if (info->trigger != trigger || info->polarity != polarity) { - ret = -EBUSY; - } - mutex_unlock(&ioapic_mutex); - - return ret; -} - int mp_irqdomain_ioapic_idx(struct irq_domain *domain) { return (int)(long)domain->host_data; -- cgit v1.2.3 From b75e818f7fc6db153a4ebfba1d31366c1cc531aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:39 +0800 Subject: x86/irq: Remove unused struct mp_pin_info Now nobody makes use of struct mp_pin_info, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 523b326d71c2..3506e8aeba91 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -86,13 +86,6 @@ struct mp_chip_data { bool isa_irq; }; -struct mp_pin_info { - int trigger; - int polarity; - int set; - u32 count; -}; - static struct ioapic { /* * # of IRQ routing registers @@ -108,7 +101,6 @@ static struct ioapic { struct mp_ioapic_gsi gsi_config; struct ioapic_domain_cfg irqdomain_cfg; struct irq_domain *irqdomain; - struct mp_pin_info *pin_info; struct resource *iomem_res; } ioapics[MAX_IO_APICS]; @@ -159,11 +151,6 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) return ioapic == 0 || mp_is_legacy_irq(irq); } 
-static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) -{ - return ioapics[ioapic_idx].pin_info + pin; -} - static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) { return ioapics[ioapic].irqdomain; @@ -2432,7 +2419,6 @@ out: static int mp_irqdomain_create(int ioapic) { - size_t size; struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); @@ -2440,11 +2426,6 @@ static int mp_irqdomain_create(int ioapic) struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); - size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic); - ip->pin_info = kzalloc(size, GFP_KERNEL); - if (!ip->pin_info) - return -ENOMEM; - if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; @@ -2457,13 +2438,10 @@ static int mp_irqdomain_create(int ioapic) ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if (ip->irqdomain) { - ip->irqdomain->parent = parent; - } else { - kfree(ip->pin_info); - ip->pin_info = NULL; + if (!ip->irqdomain) return -ENOMEM; - } + + ip->irqdomain->parent = parent; if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) @@ -2479,8 +2457,6 @@ static void ioapic_destroy_irqdomain(int idx) irq_domain_remove(ioapics[idx].irqdomain); ioapics[idx].irqdomain = NULL; } - kfree(ioapics[idx].pin_info); - ioapics[idx].pin_info = NULL; } void __init setup_IO_APIC(void) -- cgit v1.2.3 From 84bea5cc7709dffdadfa9885a66efd67d9ffc24c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:40 +0800 Subject: x86/irq: Remove x86_io_apic_ops.print_entries and related interfaces Now there is no user of x86_io_apic_ops.print_entries anymore, so remove it. 
Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-4-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 55 ------------------------------------------ arch/x86/kernel/x86_init.c | 1 - 2 files changed, 56 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3506e8aeba91..acb91c19f318 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1339,61 +1339,6 @@ static void __init setup_IO_APIC_irqs(void) } } -void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) -{ - int i; - - pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - pr_debug(" %02x %02X ", i, entry.dest); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector); - } -} - -void intel_ir_io_apic_print_entries(unsigned int apic, - unsigned int nr_entries) -{ - int i; - - pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IR_IO_APIC_route_entry *ir_entry; - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - ir_entry = (struct IR_IO_APIC_route_entry *)&entry; - - pr_debug(" %02x %04X ", i, ir_entry->index); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %X %02X\n", - ir_entry->format, - 
ir_entry->mask, - ir_entry->trigger, - ir_entry->irr, - ir_entry->polarity, - ir_entry->delivery_status, - ir_entry->index2, - ir_entry->zero, - ir_entry->vector); - } -} - void ioapic_zap_locks(void) { raw_spin_lock_init(&ioapic_lock); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index b094d691f2fe..d6f36c7594d7 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,7 +144,6 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .print_entries = native_io_apic_print_entries, .set_affinity = native_ioapic_set_affinity, .setup_entry = native_setup_ioapic_entry, .eoi_ioapic_pin = native_eoi_ioapic_pin, -- cgit v1.2.3 From 35d50d8fd5b8f932b3e71311a4cbd4384501ab9a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:41 +0800 Subject: x86/irq: Remove x86_io_apic_ops.setup_entry and related interfaces Now there is no user of x86_io_apic_ops.setup_entry anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-5-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 24 ------------------------ arch/x86/kernel/x86_init.c | 1 - 2 files changed, 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index acb91c19f318..cf5cd19b74e3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1296,30 +1296,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - memset(entry, 0, sizeof(*entry)); - - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = destination; - entry->vector = vector; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
- */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d6f36c7594d7..066cdaa6503e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -145,6 +145,5 @@ struct x86_io_apic_ops x86_io_apic_ops = { .modify = native_io_apic_modify, .disable = native_disable_io_apic, .set_affinity = native_ioapic_set_affinity, - .setup_entry = native_setup_ioapic_entry, .eoi_ioapic_pin = native_eoi_ioapic_pin, }; -- cgit v1.2.3 From aa5cb97f14a2dd5aefabed6538c35ebc087d7c24 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:42 +0800 Subject: x86/irq: Remove x86_io_apic_ops.set_affinity and related interfaces Now there is no user of x86_io_apic_ops.set_affinity anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-6-git-send-email-jiang.liu@linux.intel.com --- arch/x86/kernel/apic/io_apic.c | 25 +------------------------ arch/x86/kernel/x86_init.c | 1 - 2 files changed, 1 insertion(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cf5cd19b74e3..9ef964512b86 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1787,29 +1787,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -int native_ioapic_set_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = apic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. 
*/ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, irqd_cfg(data)); - ret = IRQ_SET_MASK_OK_NOCOPY; - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -2686,7 +2663,7 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - x86_io_apic_ops.set_affinity(idata, mask, false); + irq_set_affinity(irq, mask); } } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 066cdaa6503e..f7e8eab3a7c4 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,6 +144,5 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .set_affinity = native_ioapic_set_affinity, .eoi_ioapic_pin = native_eoi_ioapic_pin, }; -- cgit v1.2.3 From ad66e1efc95e548598b032c1fe5bbc34f6460547 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:43 +0800 Subject: x86/irq: Remove x86_io_apic_ops.eoi_ioapic_pin and related interfaces Now there is no user of x86_io_apic_ops.eoi_ioapic_pin anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-7-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 20 ++++---------------- arch/x86/kernel/x86_init.c | 1 - 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ef964512b86..998fefad820e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -271,7 +271,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } -void io_apic_eoi(unsigned int apic, unsigned int vector) +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(vector, &io_apic->eoi); @@ -527,7 +527,7 @@ static void unmask_ioapic_irq(struct irq_data *data) * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. 
*/ -void native_eoi_ioapic_pin(int apic, int pin, int vector) +static void __eoi_ioapic_pin(int apic, int pin, int vector) { if (mpc_ioapic_ver(apic) >= 0x20) { io_apic_eoi(apic, vector); @@ -558,19 +558,7 @@ void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) - native_eoi_ioapic_pin(entry->apic, entry->pin, vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); -} - -void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) - x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin, - cfg->vector); + __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -606,7 +594,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); - native_eoi_ioapic_pin(apic, pin, entry.vector); + __eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f7e8eab3a7c4..f612dc018fb6 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,5 +144,4 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .eoi_ioapic_pin = native_eoi_ioapic_pin, }; -- cgit v1.2.3 From baac16952635445addaf397bad74e847db821d6d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:44 +0800 Subject: x86/irq: Remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ There's no user of irq_alloc_hwirqs(), irq_alloc_hwirq(), irq_free_hwirqs() and irq_free_hwirq() in x86 anymore, so remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ and related code. 
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 633f03268d48..d0e5ea0fb947 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -659,40 +659,6 @@ void irq_force_complete_move(int irq) } #endif -/* - * Dynamic irq allocate and deallocation. Should be replaced by irq domains! - */ -int arch_setup_hwirq(unsigned int irq, int node) -{ - struct irq_cfg *cfg; - unsigned long flags; - int ret; - - cfg = alloc_irq_cfg(node); - if (!cfg) - return -ENOMEM; - - raw_spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); - raw_spin_unlock_irqrestore(&vector_lock, flags); - - if (!ret) - irq_set_chip_data(irq, cfg); - else - free_irq_cfg(cfg); - return ret; -} - -void arch_teardown_hwirq(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - free_remapped_irq(irq); - clear_irq_vector(irq, cfg); - irq_set_chip_data(irq, NULL); - free_irq_cfg(cfg); -} - static void __init print_APIC_field(int base) { int i; -- cgit v1.2.3 From 9880534989ba96faad26aebc01dcdb2c1b5793aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:46 +0800 Subject: irq_remapping: Clean up unused code to support IOAPIC Now we have converted to hierarchical irqdomains, so clean up unused code. 
Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-10-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index d0e5ea0fb947..37bb9e82b919 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -283,7 +283,6 @@ static void x86_vector_free_irqs(struct irq_domain *domain, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); if (irq_data && irq_data->chip_data) { - free_remapped_irq(virq); clear_irq_vector(virq + i, irq_data->chip_data); free_irq_cfg(irq_data->chip_data); #ifdef CONFIG_X86_IO_APIC -- cgit v1.2.3 From 4467715a44cca2fa41d25f3d32b737bd2331a8d9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:53 +0800 Subject: x86/irq: Move irq_cfg.irq_2_pin into io_apic.c Now only io_apic.c accesses struct irq_cfg.irq_2_pin, so move irq_2_pin into struct mp_chip_data in io_apic.c to clean up struct irq_cfg further. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-17-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 164 +++++++++++++++++++---------------------- arch/x86/kernel/apic/vector.c | 3 - 2 files changed, 77 insertions(+), 90 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 998fefad820e..a1abdcf2cb5f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -78,7 +78,13 @@ static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; +struct irq_pin_list { + struct list_head list; + int apic, pin; +}; + struct mp_chip_data { + struct list_head irq_2_pin; struct IO_APIC_route_entry entry; int trigger; int polarity; @@ -215,16 +221,6 @@ void mp_save_irq(struct mpc_intsrc *m) panic("Max # of irq sources exceeded!!\n"); } -struct irq_pin_list { - struct list_head list; - int apic, pin; -}; - -static struct irq_pin_list *alloc_irq_pin_list(int node) -{ - return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); -} - static void alloc_ioapic_saved_registers(int idx) { size_t size; @@ -379,16 +375,17 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { struct irq_pin_list *entry; /* don't allow duplicates */ - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) if (entry->apic == apic && entry->pin == pin) return 0; - entry = alloc_irq_pin_list(node); + entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -396,16 +393,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi } entry->apic = apic; entry->pin = pin; + list_add_tail(&entry->list, &data->irq_2_pin); - list_add_tail(&entry->list, &cfg->irq_2_pin); return 0; } -static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) +static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); @@ -413,22 +410,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) } } -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static void add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { - if (__add_pin_to_irq_node(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(data, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. 
*/ -static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, int oldapic, int oldpin, int newapic, int newpin) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -438,7 +436,7 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, } /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(cfg, node, newapic, newpin); + add_pin_to_irq_node(data, node, newapic, newpin); } static void __io_apic_modify_irq(struct irq_pin_list *entry, @@ -456,13 +454,13 @@ static void __io_apic_modify_irq(struct irq_pin_list *entry, final(entry); } -static void io_apic_modify_irq(struct irq_cfg *cfg, +static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __io_apic_modify_irq(entry, mask_and, mask_or, final); } @@ -478,39 +476,31 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void mask_ioapic(struct irq_cfg *cfg) +static void mask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_ioapic_irq(struct irq_data *data) -{ - mask_ioapic(irqd_cfg(data)); -} - -static void __unmask_ioapic(struct irq_cfg *cfg) +static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, 
~IO_APIC_REDIR_MASKED, 0, NULL); } -static void unmask_ioapic(struct irq_cfg *cfg) +static void unmask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_ioapic(cfg); + __unmask_ioapic(data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_ioapic_irq(struct irq_data *data) -{ - unmask_ioapic(irqd_cfg(data)); -} - /* * IO-APIC versions below 0x20 don't support EOI register. * For the record, here is the information about various versions: @@ -551,13 +541,13 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) } } -void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) +void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { unsigned long flags; struct irq_pin_list *entry; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1068,11 +1058,10 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, * entry. 
The IOAPIC entry */ if (irq_data && irq_data->parent_data) { - struct irq_cfg *cfg = irqd_cfg(irq_data); - if (!mp_check_pin_attr(irq, info)) return -EBUSY; - if (__add_pin_to_irq_node(cfg, node, ioapic, info->ioapic_pin)) + if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, + info->ioapic_pin)) return -ENOMEM; } else { irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); @@ -1394,9 +1383,7 @@ static void __init print_IO_APIC(int ioapic_idx) void __init print_IO_APICs(void) { int ioapic_idx; - struct irq_cfg *cfg; unsigned int irq; - struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for_each_ioapic(ioapic_idx) @@ -1416,18 +1403,20 @@ void __init print_IO_APICs(void) printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; + struct irq_chip *chip; + struct mp_chip_data *data; chip = irq_get_chip(irq); if (chip != &ioapic_chip && chip != &ioapic_ir_chip) continue; - - cfg = irq_cfg(irq); - if (!cfg) + data = irq_get_chip_data(irq); + if (!data) continue; - if (list_empty(&cfg->irq_2_pin)) + if (list_empty(&data->irq_2_pin)) continue; + printk(KERN_DEBUG "IRQ%d ", irq); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); } @@ -1740,7 +1729,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - __unmask_ioapic(irqd_cfg(data)); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -1755,13 +1744,15 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) * races. 
*/ -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, + struct mp_chip_data *data) { int apic, pin; struct irq_pin_list *entry; u8 vector = cfg->vector; + unsigned int dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; apic = entry->apic; @@ -1778,13 +1769,13 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; int pin; @@ -1801,18 +1792,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic(cfg); + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { if (unlikely(masked)) { /* Only migrate the irq if the ack has been received. @@ -1841,31 +1831,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. 
*/ - if (!io_apic_level_ack_pending(cfg)) + if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic(cfg); + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { } #endif -static void ioapic_ack_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; bool masked; int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(data, cfg); + masked = ioapic_irqd_mask(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -1917,10 +1906,10 @@ static void ioapic_ack_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - eoi_ioapic_pin(cfg->vector, cfg); + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); } - ioapic_irqd_unmask(data, cfg, masked); + ioapic_irqd_unmask(irq_data, masked); } static void ioapic_ir_ack_level(struct irq_data *irq_data) @@ -1934,7 +1923,7 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) * EOI we use the pin number. 
*/ ack_APIC_irq(); - eoi_ioapic_pin(data->entry.vector, irqd_cfg(irq_data)); + eoi_ioapic_pin(data->entry.vector, data); } static int ioapic_set_affinity(struct irq_data *irq_data, @@ -1942,7 +1931,6 @@ static int ioapic_set_affinity(struct irq_data *irq_data, { struct irq_data *parent = irq_data->parent_data; struct mp_chip_data *data = irq_data->chip_data; - unsigned int dest, irq = irq_data->irq; struct irq_cfg *cfg; unsigned long flags; int ret; @@ -1953,9 +1941,7 @@ static int ioapic_set_affinity(struct irq_data *irq_data, cfg = irqd_cfg(irq_data); data->entry.dest = cfg->dest_apicid; data->entry.vector = cfg->vector; - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - __target_IO_APIC_irq(irq, dest, cfg); + __target_IO_APIC_irq(irq_data->irq, cfg, irq_data->chip_data); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2116,10 +2102,11 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup); static int mp_alloc_timer_irq(int ioapic, int pin) { int irq = -1; - struct irq_alloc_info info; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); if (domain) { + struct irq_alloc_info info; + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); info.ioapic_id = mpc_ioapic_id(ioapic); info.ioapic_pin = pin; @@ -2141,7 +2128,9 @@ static int mp_alloc_timer_irq(int ioapic, int pin) */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_data *irq_data = irq_get_irq_data(0); + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2205,9 +2194,9 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic(cfg); + unmask_ioapic_irq(irq_get_chip_data(0)); } - irq_domain_activate_irq(irq_get_irq_data(0)); + irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if 
(disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2227,8 +2216,8 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - irq_domain_activate_irq(irq_get_irq_data(0)); + replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -3044,6 +3033,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return ret; } + INIT_LIST_HEAD(&data->irq_2_pin); irq_data->hwirq = info->ioapic_pin; irq_data->chip = (domain->parent == x86_vector_domain) ? &ioapic_chip : &ioapic_ir_chip; @@ -3051,7 +3041,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); cfg = irqd_cfg(irq_data); - add_pin_to_irq_node(cfg, info->ioapic_node, ioapic, pin); + add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); if (info->ioapic_entry) mp_setup_entry(cfg, data, info->ioapic_entry); mp_register_handler(virq, data->trigger); @@ -3069,15 +3059,16 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { - struct irq_cfg *cfg = irq_cfg(virq); struct irq_data *irq_data; + struct mp_chip_data *data; BUG_ON(nr_irqs != 1); irq_data = irq_domain_get_irq_data(domain, virq); if (irq_data && irq_data->chip_data) { - __remove_pin_from_irq(cfg, mp_irqdomain_ioapic_idx(domain), + data = irq_data->chip_data; + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); - WARN_ON(!list_empty(&cfg->irq_2_pin)); + WARN_ON(!list_empty(&data->irq_2_pin)); kfree(irq_data->chip_data); } irq_domain_free_irqs_top(domain, virq, nr_irqs); @@ -3089,10 +3080,9 @@ void mp_irqdomain_activate(struct irq_domain *domain, 
unsigned long flags; struct irq_pin_list *entry; struct mp_chip_data *data = irq_data->chip_data; - struct irq_cfg *cfg = irqd_cfg(irq_data); raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, data->entry); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 37bb9e82b919..af224e6774d8 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -68,9 +68,6 @@ static struct irq_cfg *alloc_irq_cfg(int node) goto out_cfg; if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) goto out_domain; -#ifdef CONFIG_X86_IO_APIC - INIT_LIST_HEAD(&cfg->irq_2_pin); -#endif return cfg; out_domain: free_cpumask_var(cfg->domain); -- cgit v1.2.3 From 50a6ad84b2a2c971e76d57884d61a5a55d7c1601 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:54 +0800 Subject: x86/irq: Remove struct io_apic_irq_attr Now there's no user of struct io_apic_irq_attr anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-18-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a1abdcf2cb5f..76dc9f5bfdbc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2959,16 +2959,6 @@ int mp_ioapic_registered(u32 gsi_base) return 0; } -static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, - int ioapic, int ioapic_pin, - int trigger, int polarity) -{ - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; -} - static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { -- cgit v1.2.3 From 9a93d4736ec5ec322ec8f240a292c1a86cd0876d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:55 +0800 Subject: x86/irq: Remove x86_io_apic_ops.write and x86_io_apic_ops.modify x86_io_apic_ops.write is always set to native_io_apic_write(), and nobody overrides it. So get rid of the indirection by changing native_io_apic_write() as io_apic_write() and removing x86_io_apic_ops.write. Do the same for x86_io_apic_ops.modify and native_io_apic_modify(). Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-19-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 6 ++++-- arch/x86/kernel/x86_init.c | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 76dc9f5bfdbc..d687a10ed3a2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -280,7 +280,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) return readl(&io_apic->data); } -void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_write(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -294,7 +295,8 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu * * Older SiS APIC requires we rewrite the index register */ -void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_modify(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f612dc018fb6..633f07845099 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -141,7 +141,5 @@ void arch_restore_msi_irqs(struct pci_dev *dev) struct x86_io_apic_ops x86_io_apic_ops = { .init = native_io_apic_init_mappings, .read = native_io_apic_read, - .write = native_io_apic_write, - .modify = native_io_apic_modify, .disable = native_disable_io_apic, }; -- cgit v1.2.3 From ca1b88622e9c16df7b1e0a57e9c6c2300321bed4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Apr 2015 13:57:48 +0200 Subject: x86: Remove more unmodified io_apic_ops io_apic_ops.init() 
is either NULL, if IO-APIC support is disabled at compile time or native_io_apic_init_mappings(). No point to have that as we can achieve the same thing with an empty inline. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/setup.c | 3 +-- arch/x86/kernel/x86_init.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d687a10ed3a2..3029502b0a50 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2687,7 +2687,7 @@ static struct resource * __init ioapic_setup_resources(void) return res; } -void __init native_io_apic_init_mappings(void) +void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d74ac33290ae..8d04a7594a03 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1222,8 +1222,7 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - if (x86_io_apic_ops.init) - x86_io_apic_ops.init(); + io_apic_init_mappings(); kvm_guest_init(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 633f07845099..3cee10abf01d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -139,7 +139,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) #endif struct x86_io_apic_ops x86_io_apic_ops = { - .init = native_io_apic_init_mappings, .read = native_io_apic_read, .disable = native_disable_io_apic, }; -- cgit v1.2.3 From 154d9e50e413ee144d48ccd6c402633ffbecbfff Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:56 +0800 Subject: x86/irq: Clean up io_apic.h Clean up io_apic.h by: 1) moving definition of struct mp_ioapic_gsi into io_apic.c 2) changing mp_pin_to_gsi() and mp_ioapic_gsi_routing() as static 3) removing unused MP_MAX_IOAPIC_PIN 4) removing 
useless forward declaration 5) removing useless comments Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-20-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3029502b0a50..4c7da8483398 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -63,7 +63,6 @@ #define for_each_ioapic_pin(idx, pin) \ for_each_ioapic((idx)) \ for_each_pin((idx), (pin)) - #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) @@ -92,6 +91,11 @@ struct mp_chip_data { bool isa_irq; }; +struct mp_ioapic_gsi { + u32 gsi_base; + u32 gsi_end; +}; + static struct ioapic { /* * # of IRQ routing registers @@ -122,7 +126,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx) return ioapics[ioapic_idx].mp_config.apicaddr; } -struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) { return &ioapics[ioapic_idx].gsi_config; } @@ -134,7 +138,7 @@ static inline int mp_ioapic_pin_count(int ioapic) return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; } -u32 mp_pin_to_gsi(int ioapic, int pin) +static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } @@ -1153,8 +1157,7 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, 
- struct irq_alloc_info *info) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) { int ioapic, pin, idx; @@ -1719,7 +1722,6 @@ static int __init timer_irq_works(void) * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... */ - static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; @@ -1737,15 +1739,6 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) return was_pending; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, struct mp_chip_data *data) { -- cgit v1.2.3 From 0be275e3a5607b23f5132121bca22a10ee23aa99 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:57 +0800 Subject: x86/irq: Use cached IOAPIC entry instead of reading from hardware Use cached IOAPIC entry instead of reading data from IOAPIC hardware registers to improve performance. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-21-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 78 ++++++++++++------------------------------ 1 file changed, 21 insertions(+), 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4c7da8483398..4fb347f01653 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -67,8 +67,13 @@ list_for_each_entry(entry, &head, list) /* - * Is the SiS APIC rmw bug present ? + * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes + * When doing a read-modify-write operation on IOAPIC registers, older SiS APIC + * requires we rewrite the index register again where the read already set up + * the index register. + * The code to make use of sis_apic_bug has been removed, but we don't want to + * lose this knowledge. */ int sis_apic_bug = -1; @@ -293,22 +298,6 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. 
- * - * Older SiS APIC requires we rewrite the index register - */ -static void io_apic_modify(unsigned int apic, unsigned int reg, - unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -445,29 +434,23 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void __io_apic_modify_irq(struct irq_pin_list *entry, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) -{ - unsigned int reg, pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); -} - static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { + union entry_union eu; struct irq_pin_list *entry; - for_each_irq_pin(entry, data->irq_2_pin) - __io_apic_modify_irq(entry, mask_and, mask_or, final); + eu.entry = data->entry; + eu.w1 &= mask_and; + eu.w1 |= mask_or; + data->entry = eu.entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + if (final) + final(entry); + } } static void io_apic_sync(struct irq_pin_list *entry) @@ -1739,28 +1722,6 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) return was_pending; } -static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, - struct mp_chip_data *data) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - unsigned int dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - - for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 
0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -1926,6 +1887,7 @@ static int ioapic_set_affinity(struct irq_data *irq_data, { struct irq_data *parent = irq_data->parent_data; struct mp_chip_data *data = irq_data->chip_data; + struct irq_pin_list *entry; struct irq_cfg *cfg; unsigned long flags; int ret; @@ -1936,7 +1898,9 @@ static int ioapic_set_affinity(struct irq_data *irq_data, cfg = irqd_cfg(irq_data); data->entry.dest = cfg->dest_apicid; data->entry.vector = cfg->vector; - __target_IO_APIC_irq(irq_data->irq, cfg, irq_data->chip_data); + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, + data->entry); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.3 From 1f934641294ca2e09016c689862378fbb15da4d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:29:58 +0800 Subject: x86/irq: Remove sis apic bug workaround The SiS apic bug workaround is now obsolete as we cache the register values for performance reasons. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-22-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4fb347f01653..9806f9605bc4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -18,6 +18,16 @@ * and Rolf G. 
Tews * for testing these extensively * Paul Diefenbaugh : Added full ACPI support + * + * Historical information which is worth to be preserved: + * + * - SiS APIC rmw bug: + * + * We used to have a workaround for a bug in SiS chips which + * required to rewrite the index register for a read-modify-write + * operation as the chip lost the index information which was + * setup for the read already. We cache the data now, so that + * workaround has been removed. */ #include @@ -66,17 +76,6 @@ #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - * When doing a read-modify-write operation on IOAPIC registers, older SiS APIC - * requires we rewrite the index register again where the read already set up - * the index register. - * The code to make use of sis_apic_bug has been removed, but we don't want to - * lose this knowledge. - */ -int sis_apic_bug = -1; - static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; @@ -2320,20 +2319,6 @@ void __init setup_IO_APIC(void) ioapic_initialized = 1; } -/* - * Called after all the initialization is done. If we didn't find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - static void resume_ioapic_id(int ioapic_idx) { unsigned long flags; -- cgit v1.2.3 From a2cbbb47fd90ef1161ce22b099de5c6095f8365f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:59 +0800 Subject: x86/irq: Remove unused alloc_irq_and_cfg_at() There's no caller of alloc_irq_and_cfg_at() anymore, so remove it. 
Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-23-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index af224e6774d8..51cd46bfc46e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -76,27 +76,6 @@ out_cfg: return NULL; } -struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) -{ - int res = irq_alloc_desc_at(at, node); - struct irq_cfg *cfg; - - if (res < 0) { - if (res != -EEXIST) - return NULL; - cfg = irq_cfg(at); - if (cfg) - return cfg; - } - - cfg = alloc_irq_cfg(node); - if (cfg) - irq_set_chip_data(at, cfg); - else - irq_free_desc(at); - return cfg; -} - static void free_irq_cfg(struct irq_cfg *cfg) { if (cfg) { -- cgit v1.2.3 From f970510cc55e41d21ca30feb56873aaeb57ec18d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:00 +0800 Subject: x86/irq: Make functions only used in vector.c static Function {assign|clear}_irq_vector() and apic_retrigger_irq() are only used in vector.c, so make them static. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-24-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 51cd46bfc46e..d52af4d805db 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -185,7 +185,8 @@ next: return err; } -int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int assign_irq_vector(int irq, struct irq_cfg *cfg, + const struct cpumask *mask) { int err; unsigned long flags; @@ -196,7 +197,7 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) return err; } -void clear_irq_vector(int irq, struct irq_cfg *cfg) +static void clear_irq_vector(int irq, struct irq_cfg *cfg) { int cpu, vector; unsigned long flags; @@ -441,7 +442,7 @@ void setup_vector_irq(int cpu) __setup_vector_irq(cpu); } -int apic_retrigger_irq(struct irq_data *data) +static int apic_retrigger_irq(struct irq_data *data) { struct irq_cfg *cfg = irqd_cfg(data); unsigned long flags; -- cgit v1.2.3 From 68f9f4404d74f859dc84973db8731b41a51d929a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:01 +0800 Subject: x86/irq: Remove function apic_set_affinity() Now there's no user of apic_set_affinity(), so remove it. Also rename vector_set_affinity() to apic_set_affinity() for consistency. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-25-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 40 +++------------------------------------- 1 file changed, 3 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index d52af4d805db..1aea62d60cf2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -463,42 +463,8 @@ void apic_ack_edge(struct irq_data *data) ack_APIC_irq(); } -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) -{ - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int irq = data->irq; - int err; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } - - cpumask_copy(data->affinity, mask); - - return 0; -} - -static int vector_set_affinity(struct irq_data *irq_data, - const struct cpumask *dest, bool force) +static int apic_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) { struct irq_cfg *cfg = irq_data->chip_data; int err, irq = irq_data->irq; @@ -523,7 +489,7 @@ static int vector_set_affinity(struct irq_data *irq_data, static struct irq_chip lapic_controller = { .irq_ack = apic_ack_edge, - .irq_set_affinity = vector_set_affinity, + .irq_set_affinity = apic_set_affinity, .irq_retrigger = apic_retrigger_irq, }; -- 
cgit v1.2.3 From c6c2002b744215810c770dd73f45da954bcfa9d5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:02 +0800 Subject: x86/irq: Move check of cfg->move_in_progress into send_cleanup_vector() Move check of cfg->move_in_progress into send_cleanup_vector() to prepare for simplifying struct irq_cfg. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-26-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1aea62d60cf2..0092a6e0d5ee 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -494,7 +494,7 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -void send_cleanup_vector(struct irq_cfg *cfg) +static void __send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; @@ -512,6 +512,12 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } +void send_cleanup_vector(struct irq_cfg *cfg) +{ + if (cfg->move_in_progress) + __send_cleanup_vector(cfg); +} + asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -582,7 +588,7 @@ static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - send_cleanup_vector(cfg); + __send_cleanup_vector(cfg); } void irq_complete_move(struct irq_cfg *cfg) -- cgit v1.2.3 From 7f3262edcdf623296b514377d52911b115c7ab49 Mon Sep 17 
00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:03 +0800 Subject: x86/irq: Move private data in struct irq_cfg into dedicated data structure Several fields in struct irq_cfg are private to vector.c, so move them into a dedicated data structure. This helps to hide implementation details. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-27-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1416901802-24211-35-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Tested-by: Joerg Roedel --- arch/x86/kernel/apic/vector.c | 221 +++++++++++++++++++++++------------------- 1 file changed, 119 insertions(+), 102 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 0092a6e0d5ee..60047495041c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -21,11 +21,18 @@ #include #include +struct apic_chip_data { + struct irq_cfg cfg; + cpumask_var_t domain; + cpumask_var_t old_domain; + u8 move_in_progress : 1; +}; + struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); static struct irq_chip lapic_controller; #ifdef CONFIG_X86_IO_APIC -static struct irq_cfg *legacy_irq_cfgs[NR_IRQS_LEGACY]; +static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; #endif void lock_vector_lock(void) @@ -41,12 +48,7 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -struct irq_cfg
*irq_cfg(unsigned int irq) -{ - return irqd_cfg(irq_get_irq_data(irq)); -} - -struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) { if (!irq_data) return NULL; @@ -57,36 +59,48 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data) return irq_data->chip_data; } -static struct irq_cfg *alloc_irq_cfg(int node) +struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +{ + struct apic_chip_data *data = apic_chip_data(irq_data); + + return data ? &data->cfg : NULL; +} + +struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg; + return irqd_cfg(irq_get_irq_data(irq)); +} - cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); - if (!cfg) +static struct apic_chip_data *alloc_apic_chip_data(int node) +{ + struct apic_chip_data *data; + + data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); + if (!data) return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) - goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) + goto out_data; + if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) goto out_domain; - return cfg; + return data; out_domain: - free_cpumask_var(cfg->domain); -out_cfg: - kfree(cfg); + free_cpumask_var(data->domain); +out_data: + kfree(data); return NULL; } -static void free_irq_cfg(struct irq_cfg *cfg) +static void free_apic_chip_data(struct apic_chip_data *data) { - if (cfg) { - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); + if (data) { + free_cpumask_var(data->domain); + free_cpumask_var(data->old_domain); + kfree(data); } } -static int -__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int __assign_irq_vector(int irq, struct apic_chip_data *d, + const struct cpumask *mask) { /* * NOTE! 
The local APIC isn't very good at handling @@ -104,7 +118,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int cpu, err; cpumask_var_t tmp_mask; - if (cfg->move_in_progress) + if (d->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -112,26 +126,26 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - cpumask_clear(cfg->old_domain); + cpumask_clear(d->old_domain); cpu = cpumask_first_and(mask, cpu_online_mask); while (cpu < nr_cpu_ids) { int new_cpu, vector, offset; apic->vector_allocation_domain(cpu, tmp_mask, mask); - if (cpumask_subset(tmp_mask, cfg->domain)) { + if (cpumask_subset(tmp_mask, d->domain)) { err = 0; - if (cpumask_equal(tmp_mask, cfg->domain)) + if (cpumask_equal(tmp_mask, d->domain)) break; /* * New cpumask using the vector is a proper subset of * the current in use mask. So cleanup the vector * allocation for the members that are not used anymore. */ - cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - cpumask_and(cfg->domain, cfg->domain, tmp_mask); + cpumask_andnot(d->old_domain, d->domain, tmp_mask); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); + cpumask_and(d->domain, d->domain, tmp_mask); break; } @@ -145,8 +159,8 @@ next: } if (unlikely(current_vector == vector)) { - cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpumask_or(d->old_domain, d->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, d->old_domain); cpu = cpumask_first_and(tmp_mask, cpu_online_mask); continue; } @@ -162,15 +176,15 @@ next: /* Found one! 
*/ current_vector = vector; current_offset = offset; - if (cfg->vector) { - cpumask_copy(cfg->old_domain, cfg->domain); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); + if (d->cfg.vector) { + cpumask_copy(d->old_domain, d->domain); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cpumask_copy(cfg->domain, tmp_mask); + d->cfg.vector = vector; + cpumask_copy(d->domain, tmp_mask); err = 0; break; } @@ -178,46 +192,46 @@ next: if (!err) { /* cache destination APIC IDs into cfg->dest_apicid */ - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, - &cfg->dest_apicid); + err = apic->cpu_mask_to_apicid_and(mask, d->domain, + &d->cfg.dest_apicid); } return err; } -static int assign_irq_vector(int irq, struct irq_cfg *cfg, +static int assign_irq_vector(int irq, struct apic_chip_data *data, const struct cpumask *mask) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, cfg, mask); + err = __assign_irq_vector(irq, data, mask); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } -static void clear_irq_vector(int irq, struct irq_cfg *cfg) +static void clear_irq_vector(int irq, struct apic_chip_data *data) { int cpu, vector; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - BUG_ON(!cfg->vector); + BUG_ON(!data->cfg.vector); - vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + vector = data->cfg.vector; + for_each_cpu_and(cpu, data->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; - cfg->vector = 0; - cpumask_clear(cfg->domain); + data->cfg.vector = 0; + cpumask_clear(data->domain); - if (likely(!cfg->move_in_progress)) { + if (likely(!data->move_in_progress)) { raw_spin_unlock_irqrestore(&vector_lock, flags); return; } - for_each_cpu_and(cpu, 
cfg->old_domain, cpu_online_mask) { + for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -226,7 +240,7 @@ static void clear_irq_vector(int irq, struct irq_cfg *cfg) break; } } - cfg->move_in_progress = 0; + data->move_in_progress = 0; raw_spin_unlock_irqrestore(&vector_lock, flags); } @@ -261,10 +275,10 @@ static void x86_vector_free_irqs(struct irq_domain *domain, irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); if (irq_data && irq_data->chip_data) { clear_irq_vector(virq + i, irq_data->chip_data); - free_irq_cfg(irq_data->chip_data); + free_apic_chip_data(irq_data->chip_data); #ifdef CONFIG_X86_IO_APIC if (virq + i < nr_legacy_irqs()) - legacy_irq_cfgs[virq + i] = NULL; + legacy_irq_data[virq + i] = NULL; #endif irq_domain_reset_irq_data(irq_data); } @@ -275,9 +289,9 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_alloc_info *info = arg; + struct apic_chip_data *data; const struct cpumask *mask; struct irq_data *irq_data; - struct irq_cfg *cfg; int i, err; if (disable_apic) @@ -292,20 +306,20 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); #ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs() && legacy_irq_cfgs[virq + i]) - cfg = legacy_irq_cfgs[virq + i]; + if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) + data = legacy_irq_data[virq + i]; else #endif - cfg = alloc_irq_cfg(irq_data->node); - if (!cfg) { + data = alloc_apic_chip_data(irq_data->node); + if (!data) { err = -ENOMEM; goto error; } irq_data->chip = &lapic_controller; - irq_data->chip_data = cfg; + irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector(virq, cfg, mask); + err = assign_irq_vector(virq, data, mask); if (err) goto 
error; } @@ -349,22 +363,22 @@ int __init arch_probe_nr_irqs(void) static void init_legacy_irqs(void) { int i, node = cpu_to_node(0); - struct irq_cfg *cfg; + struct apic_chip_data *data; /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = legacy_irq_cfgs[i] = alloc_irq_cfg(node); - BUG_ON(!cfg); + data = legacy_irq_data[i] = alloc_apic_chip_data(node); + BUG_ON(!data); /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - irq_set_chip_data(i, cfg); + data->cfg.vector = IRQ0_VECTOR + i; + cpumask_setall(data->domain); + irq_set_chip_data(i, data); } } #else @@ -390,7 +404,7 @@ static void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ int irq, vector; - struct irq_cfg *cfg; + struct apic_chip_data *data; /* * vector_lock will make sure that we don't run into irq vector @@ -400,13 +414,13 @@ static void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!data) continue; - if (!cpumask_test_cpu(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, data->domain)) continue; - vector = cfg->vector; + vector = data->cfg.vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -415,8 +429,8 @@ static void __setup_vector_irq(int cpu) if (irq <= VECTOR_UNDEFINED) continue; - cfg = irq_cfg(irq); - if (!cpumask_test_cpu(cpu, cfg->domain)) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!cpumask_test_cpu(cpu, data->domain)) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } raw_spin_unlock(&vector_lock); @@ -442,15 +456,15 @@ void setup_vector_irq(int cpu) __setup_vector_irq(cpu); } -static int apic_retrigger_irq(struct irq_data *data) +static int 
apic_retrigger_irq(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct apic_chip_data *data = apic_chip_data(irq_data); unsigned long flags; int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(cfg->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); + cpu = cpumask_first_and(data->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -466,7 +480,7 @@ void apic_ack_edge(struct irq_data *data) static int apic_set_affinity(struct irq_data *irq_data, const struct cpumask *dest, bool force) { - struct irq_cfg *cfg = irq_data->chip_data; + struct apic_chip_data *data = irq_data->chip_data; int err, irq = irq_data->irq; if (!config_enabled(CONFIG_SMP)) @@ -475,11 +489,11 @@ static int apic_set_affinity(struct irq_data *irq_data, if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, cfg, dest); + err = assign_irq_vector(irq, data, dest); if (err) { struct irq_data *top = irq_get_irq_data(irq); - if (assign_irq_vector(irq, cfg, top->affinity)) + if (assign_irq_vector(irq, data, top->affinity)) pr_err("Failed to recover vector for irq %d\n", irq); return err; } @@ -494,28 +508,31 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -static void __send_cleanup_vector(struct irq_cfg *cfg) +static void __send_cleanup_vector(struct apic_chip_data *data) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + for_each_cpu_and(i, data->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); 
} - cfg->move_in_progress = 0; + data->move_in_progress = 0; } void send_cleanup_vector(struct irq_cfg *cfg) { - if (cfg->move_in_progress) - __send_cleanup_vector(cfg); + struct apic_chip_data *data; + + data = container_of(cfg, struct apic_chip_data, cfg); + if (data->move_in_progress) + __send_cleanup_vector(data); } asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) @@ -531,7 +548,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) int irq; unsigned int irr; struct irq_desc *desc; - struct irq_cfg *cfg; + struct apic_chip_data *data; irq = __this_cpu_read(vector_irq[vector]); @@ -542,8 +559,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) if (!desc) continue; - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(&desc->irq_data); + if (!data) continue; raw_spin_lock(&desc->lock); @@ -552,10 +569,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) * Check if the irq migration is in progress. If so, we * haven't received the cleanup request yet for this irq. 
*/ - if (cfg->move_in_progress) + if (data->move_in_progress) goto unlock; - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + if (vector == data->cfg.vector && + cpumask_test_cpu(me, data->domain)) goto unlock; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -581,14 +599,15 @@ unlock: static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { unsigned me; + struct apic_chip_data *data; - if (likely(!cfg->move_in_progress)) + data = container_of(cfg, struct apic_chip_data, cfg); + if (likely(!data->move_in_progress)) return; me = smp_processor_id(); - - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - __send_cleanup_vector(cfg); + if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) + __send_cleanup_vector(data); } void irq_complete_move(struct irq_cfg *cfg) @@ -600,10 +619,8 @@ void irq_force_complete_move(int irq) { struct irq_cfg *cfg = irq_cfg(irq); - if (!cfg) - return; - - __irq_complete_move(cfg, cfg->vector); + if (cfg) + __irq_complete_move(cfg, cfg->vector); } #endif -- cgit v1.2.3 From 46176f39b1a6f457eae78999befbdf58e68555e7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:05 +0800 Subject: x86/irq, ACPI: Remove private function mp_register_gsi()/ mp_unregister_gsi() Function mp_register_gsi() is only called once, so fold it into caller acpi_register_gsi_ioapic(). Do the same for mp_unregister_gsi(). Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Link: http://lkml.kernel.org/r/1428978610-28986-29-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 57 ++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 39 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 21e460b3b360..91a10120ed10 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -400,42 +400,6 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, - int polarity) -{ - int irq, node; - struct irq_alloc_info info; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; - polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; - node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - ioapic_set_alloc_attr(&info, node, trigger, polarity); - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); - if (irq < 0) - return irq; - - /* Don't set up the ACPI SCI because it's already set up */ - if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi) - mp_config_acpi_gsi(dev, gsi, trigger, polarity); - - return irq; -} - -static void mp_unregister_gsi(u32 gsi) -{ - int irq; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return; - - irq = mp_map_gsi_to_irq(gsi, 0, NULL); - if (irq > 0) - mp_unmap_irq(irq); -} - static struct irq_domain_ops acpi_irqdomain_ops = { .alloc = mp_irqdomain_alloc, .free = mp_irqdomain_free, @@ -662,10 +626,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { int irq = gsi; - #ifdef CONFIG_X86_IO_APIC + int node; + struct irq_alloc_info info; + + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + trigger = trigger == ACPI_EDGE_SENSITIVE ? 
0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + ioapic_set_alloc_attr(&info, node, trigger, polarity); + mutex_lock(&acpi_ioapic_lock); - irq = mp_register_gsi(dev, gsi, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + /* Don't set up the ACPI SCI because it's already set up */ + if (irq >= 0 && enable_update_mptable && + acpi_gbl_FADT.sci_interrupt != gsi) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); mutex_unlock(&acpi_ioapic_lock); #endif @@ -675,8 +650,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, static void acpi_unregister_gsi_ioapic(u32 gsi) { #ifdef CONFIG_X86_IO_APIC + int irq; + mutex_lock(&acpi_ioapic_lock); - mp_unregister_gsi(gsi); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); + if (irq > 0) + mp_unmap_irq(irq); mutex_unlock(&acpi_ioapic_lock); #endif } -- cgit v1.2.3 From 335efdf57da39d3949c3ef9338de5737e85cbe52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:06 +0800 Subject: x86, ioapic: Use proper defines for the entry fields While looking at the printout issue, I stumbled more than once over the various 0/1 assignments which are either commented in strange ways or force one to look up the meaning. Use proper constants and fix the misleading comments. While at it remove pointless 0 assignments in native_disable_io_apic() which have no value for understanding the code. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J.
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1428978610-28986-30-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 100 ++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9806f9605bc4..a63167f96126 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -356,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; + union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -517,7 +517,7 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = 1; + entry1.mask = IOAPIC_MASKED; entry1.trigger = IOAPIC_EDGE; __ioapic_write_entry(apic, pin, entry1); @@ -553,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -567,7 +567,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. 
*/ - if (!entry.trigger) { + if (entry.trigger == IOAPIC_EDGE) { entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } @@ -670,8 +670,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); } } @@ -773,11 +773,11 @@ static int EISA_ELCR(unsigned int irq) #endif -/* ISA interrupts are always polarity zero edge triggered, +/* ISA interrupts are always active high edge triggered, * when listed as conforming in the MP table. */ -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) +#define default_ISA_trigger(idx) (IOAPIC_EDGE) +#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) /* EISA interrupts are always polarity zero and can be edge or level * trigger depending on the ELCR value. If an interrupt is listed as @@ -787,11 +787,11 @@ static int EISA_ELCR(unsigned int irq) #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) -/* PCI interrupts are always polarity one level triggered, +/* PCI interrupts are always active low level triggered, * when listed as conforming in the MP table. 
*/ -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) +#define default_PCI_trigger(idx) (IOAPIC_LEVEL) +#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) static int irq_polarity(int idx) { @@ -811,24 +811,24 @@ static int irq_polarity(int idx) break; case 1: /* high active */ { - polarity = 0; + polarity = IOAPIC_POL_HIGH; break; } case 2: /* reserved */ { pr_warn("broken BIOS!!\n"); - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } case 3: /* low active */ { - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } default: /* invalid */ { pr_warn("broken BIOS!!\n"); - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } } @@ -870,7 +870,7 @@ static int irq_trigger(int idx) default: { pr_warn("broken BIOS!!\n"); - trigger = 1; + trigger = IOAPIC_LEVEL; break; } } @@ -878,24 +878,24 @@ static int irq_trigger(int idx) break; case 1: /* edge */ { - trigger = 0; + trigger = IOAPIC_EDGE; break; } case 2: /* reserved */ { pr_warn("broken BIOS!!\n"); - trigger = 1; + trigger = IOAPIC_LEVEL; break; } case 3: /* level */ { - trigger = 1; + trigger = IOAPIC_LEVEL; break; } default: /* invalid */ { pr_warn("broken BIOS!!\n"); - trigger = 0; + trigger = IOAPIC_EDGE; break; } } @@ -939,11 +939,11 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, dst->ioapic_polarity = polarity; } else { /* - * PCI interrupts are always polarity one level + * PCI interrupts are always active low level * triggered. */ - dst->ioapic_trigger = 1; - dst->ioapic_polarity = 1; + dst->ioapic_trigger = IOAPIC_LEVEL; + dst->ioapic_polarity = IOAPIC_POL_LOW; } } } @@ -1296,9 +1296,10 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry = ioapic_read_entry(apic, i); snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", - i, entry.mask ? "disabled" : "enabled ", - entry.trigger ? "level" : "edge ", - entry.polarity ? "low " : "high", + i, + entry.mask == IOAPIC_MASKED ? 
"disabled" : "enabled ", + entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", + entry.polarity == IOAPIC_POL_LOW ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", @@ -1306,7 +1307,9 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", - buf, entry.dest_mode ? "logical " : "physical", + buf, + entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? + "logical " : "physical", entry.dest, entry.delivery_mode); } } @@ -1476,15 +1479,12 @@ void native_disable_io_apic(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); + entry.mask = IOAPIC_UNMASKED; + entry.trigger = IOAPIC_EDGE; + entry.polarity = IOAPIC_POL_HIGH; + entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.delivery_mode = dest_ExtINT; + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1494,7 +1494,6 @@ void native_disable_io_apic(void) if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); - } /* @@ -2018,12 +2017,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ + entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry1.mask = IOAPIC_UNMASKED; entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; - entry1.trigger = 0; + entry1.trigger = IOAPIC_EDGE; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2911,9 +2910,9 @@ static void mp_irqdomain_get_attr(u32 gsi, struct 
mp_chip_data *data, data->polarity = info->ioapic_polarity; } else if (acpi_get_override_irq(gsi, &data->trigger, &data->polarity) < 0) { - /* PCI interrupts are always polarity one level triggered. */ - data->trigger = 1; - data->polarity = 1; + /* PCI interrupts are always active low level triggered. */ + data->trigger = IOAPIC_LEVEL; + data->polarity = IOAPIC_POL_LOW; } } @@ -2925,15 +2924,16 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, entry->dest_mode = apic->irq_dest_mode; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; - entry->mask = 0; /* enable IRQ */ entry->trigger = data->trigger; entry->polarity = data->polarity; /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + * Mask level triggered irqs. Edge triggered irqs are masked + * by the irq core code in case they fire. */ - if (data->trigger) - entry->mask = 1; + if (data->trigger == IOAPIC_LEVEL) + entry->mask = IOAPIC_MASKED; + else + entry->mask = IOAPIC_UNMASKED; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, -- cgit v1.2.3 From ab76085ec0858d4c2707ea0d036db00ef4aee8fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:07 +0800 Subject: x86,ioapic: Cleanup irq_trigger/polarity() These functions are full of pointless indentations, useless comments and even more useless printks. Clean them up. Signed-off-by: Thomas Gleixner Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-31-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: x86@kernel.org Signed-off-by: Jiang Liu --- arch/x86/kernel/apic/io_apic.c | 138 +++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 88 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a63167f96126..9fcca68b183d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -796,45 +796,47 @@ static int EISA_ELCR(unsigned int irq) static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; - int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = IOAPIC_POL_HIGH; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - polarity = IOAPIC_POL_LOW; - break; - } - case 3: /* low active */ - { - polarity = IOAPIC_POL_LOW; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - polarity = IOAPIC_POL_LOW; - break; - } + switch (mp_irqs[idx].irqflag & 0x03) { + case 0: + /* conforms to spec, ie. 
bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + return default_ISA_polarity(idx); + else + return default_PCI_polarity(idx); + case 1: + return IOAPIC_POL_HIGH; + case 2: + pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_POL_LOW; } - return polarity; } +#ifdef CONFIG_EISA +static int eisa_irq_trigger(int idx, int bus, int trigger) +{ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_PCI: + case MP_BUS_ISA: + return trigger; + case MP_BUS_EISA: + return default_EISA_trigger(idx); + } + pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); + return IOAPIC_LEVEL; +} +#else +static inline int eisa_irq_trigger(int idx, int bus, int trigger) +{ + return trigger; +} +#endif + static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; @@ -843,63 +845,23 @@ static int irq_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].irqflag>>2) & 3) - { - case 0: /* conforms, ie. 
bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#ifdef CONFIG_EISA - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - default: - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_LEVEL; - break; - } - } -#endif - break; - case 1: /* edge */ - { - trigger = IOAPIC_EDGE; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_LEVEL; - break; - } - case 3: /* level */ - { - trigger = IOAPIC_LEVEL; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_EDGE; - break; - } + switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent trigger mode */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); + /* Take EISA into account */ + return eisa_irq_trigger(idx, bus, trigger); + case 1: + return IOAPIC_EDGE; + case 2: + pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_LEVEL; } - return trigger; } void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, -- cgit v1.2.3 From f7a0c78669ee79443a91ea89652766c1be8d9e04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:08 +0800 Subject: x86: Cleanup irq_domain ops We have 3 identical copies of the ioapic domain ops for acpi, mpparse, and sfi. Have a global one in the io_apic code and be done with it. To avoid include hell in io_apic.h, create a private irqdomain header and include the generic irqdomain header from there. 
Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: sfi-devel@simplefirmware.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Grant Likely Cc: Rob Herring Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1428978610-28986-32-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 13 +++---------- arch/x86/kernel/apic/io_apic.c | 9 ++++++++- arch/x86/kernel/devicetree.c | 12 ++++++------ arch/x86/kernel/mpparse.c | 9 +-------- 4 files changed, 18 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 91a10120ed10..cb9f6f12246b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,12 +31,12 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -400,20 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static struct irq_domain_ops acpi_irqdomain_ops = { - .alloc = mp_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, -}; - static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic = (struct acpi_madt_io_apic *)header; @@ -764,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) u64 addr; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, 
}; ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9fcca68b183d..845dc0df2002 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -41,13 +41,13 @@ #include #include #include -#include #include #include #include /* time_after() */ #include #include +#include #include #include #include @@ -2995,3 +2995,10 @@ int mp_irqdomain_ioapic_idx(struct irq_domain *domain) { return (int)(long)domain->host_data; } + +const struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 05103d398ed7..5ee771859b6f 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -17,6 +16,7 @@ #include #include +#include #include #include #include @@ -216,11 +216,11 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } -const struct irq_domain_ops ioapic_irq_domain_ops = { - .alloc = dt_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, +static const struct irq_domain_ops ioapic_irq_domain_ops = { + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index aa4feee74dbe..30ca7607cbbb 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,8 +19,8 @@ #include #include #include -#include +#include #include #include #include @@ -113,13 +113,6 @@ static void __init MP_bus_info(struct 
mpc_bus *m) pr_warn("Unknown bustype %s - ignoring\n", str); } -static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .alloc = mp_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, -}; - static void __init MP_ioapic_info(struct mpc_ioapic *m) { struct ioapic_domain_cfg cfg = { -- cgit v1.2.3 From d746d1ebd30c48562a3fb512ab18d5822f137820 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:09 +0800 Subject: x86/irq: Move irqdomain specific code into asm/irqdomain.h Now we have dedicated asm/irqdomain.h, so move irqdomain specific code into it. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1428978610-28986-33-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 2 +- arch/x86/kernel/apic/msi.c | 2 +- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/hpet.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 1cae104415ea..341e99be42b8 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 109584261c4e..58fde664e7c0 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 60047495041c..ad786f8a7cc7 100644 --- 
a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e3bc18080052..e2449cf38b06 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -11,8 +11,8 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From f7fa7aeeecb7a9abdd5f5d069a71ffb3e99a2a07 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:10 +0800 Subject: x86/irq: Avoid memory allocation in __assign_irq_vector() Function __assign_irq_vector() is protected by vector_lock, so use a global temporary cpu_mask to avoid allocating/freeing cpu_mask. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-34-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index ad786f8a7cc7..1c7dd42b98c1 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -30,6 +30,7 @@ struct apic_chip_data { struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static cpumask_var_t vector_cpumask; static struct irq_chip lapic_controller; #ifdef CONFIG_X86_IO_APIC static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; @@ -116,14 +117,10 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int 
current_offset = VECTOR_OFFSET_START % 16; int cpu, err; - cpumask_var_t tmp_mask; if (d->move_in_progress) return -EBUSY; - if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) - return -ENOMEM; - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; cpumask_clear(d->old_domain); @@ -131,21 +128,22 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, while (cpu < nr_cpu_ids) { int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask, mask); + apic->vector_allocation_domain(cpu, vector_cpumask, mask); - if (cpumask_subset(tmp_mask, d->domain)) { + if (cpumask_subset(vector_cpumask, d->domain)) { err = 0; - if (cpumask_equal(tmp_mask, d->domain)) + if (cpumask_equal(vector_cpumask, d->domain)) break; /* * New cpumask using the vector is a proper subset of * the current in use mask. So cleanup the vector * allocation for the members that are not used anymore. */ - cpumask_andnot(d->old_domain, d->domain, tmp_mask); + cpumask_andnot(d->old_domain, d->domain, + vector_cpumask); d->move_in_progress = cpumask_intersects(d->old_domain, cpu_online_mask); - cpumask_and(d->domain, d->domain, tmp_mask); + cpumask_and(d->domain, d->domain, vector_cpumask); break; } @@ -159,16 +157,18 @@ next: } if (unlikely(current_vector == vector)) { - cpumask_or(d->old_domain, d->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, d->old_domain); - cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + cpumask_or(d->old_domain, d->old_domain, + vector_cpumask); + cpumask_andnot(vector_cpumask, mask, d->old_domain); + cpu = cpumask_first_and(vector_cpumask, + cpu_online_mask); continue; } if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) { if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) goto next; @@ -181,14 +181,13 @@ next: d->move_in_progress = cpumask_intersects(d->old_domain, cpu_online_mask); } - 
for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; d->cfg.vector = vector; - cpumask_copy(d->domain, tmp_mask); + cpumask_copy(d->domain, vector_cpumask); err = 0; break; } - free_cpumask_var(tmp_mask); if (!err) { /* cache destination APIC IDs into cfg->dest_apicid */ @@ -397,6 +396,8 @@ int __init arch_early_irq_init(void) arch_init_msi_domain(x86_vector_domain); arch_init_htirq_domain(x86_vector_domain); + BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); + return arch_early_ioapic_init(); } -- cgit v1.2.3 From eb18cf55c299d2ac5c8b5421c58b6c582a044475 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2015 11:10:11 +0200 Subject: x86: Constify irqdomain ops Nothing changes those ops. Make the initializers readable while at it. Reported-by: Krzysztof Kozlowski Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 10 +++++----- arch/x86/kernel/apic/vector.c | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 341e99be42b8..ae50d3454d78 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -143,11 +143,11 @@ static void htirq_domain_deactivate(struct irq_domain *domain, write_ht_irq_msg(irq_data->irq, &msg); } -static struct irq_domain_ops htirq_domain_ops = { - .alloc = htirq_domain_alloc, - .free = htirq_domain_free, - .activate = htirq_domain_activate, - .deactivate = htirq_domain_deactivate, +static const struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, }; void arch_init_htirq_domain(struct irq_domain *parent) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1c7dd42b98c1..426496862be0 100644 --- 
a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -330,9 +330,9 @@ error: return err; } -static struct irq_domain_ops x86_vector_domain_ops = { - .alloc = x86_vector_alloc_irqs, - .free = x86_vector_free_irqs, +static const struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, }; int __init arch_probe_nr_irqs(void) -- cgit v1.2.3 From 781674fc33adf0d975a361e111bb45804356aa23 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 4 May 2015 17:58:00 +0200 Subject: x86/x2apic: Acpi_gbl_FADT existence depends on CONFIG_ACPI If ACPI is disabled, acpi_gbl_FADT is not available, and the build breaks. Signed-off-by: Jan Kiszka Link: http://lkml.kernel.org/r/55479708.2000104@siemens.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/x2apic_phys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6fae733e9194..3ffd925655e0 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode); static bool x2apic_fadt_phys(void) { +#ifdef CONFIG_ACPI if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "System requires x2apic physical mode\n"); return true; } +#endif return false; } -- cgit v1.2.3 From f5d6a52f511157c7476590532a23b5664b1ed877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Mon, 4 May 2015 11:42:34 +0200 Subject: x86/smpboot: Skip delays during SMP initialization similar to Xen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the per-CPU delays during SMP initialization, which seems to be possible on newer architectures with an x2APIC. Xen does this since 2011.
In fact, this commit is basically a combination of the following Xen commits. The first removes the delays, the second fixes an issue with the removal: commit 68fce206f6dba9981e8322269db49692c95ce250 Author: Tim Deegan Date: Tue Jul 19 14:13:01 2011 +0100 x86: Remove timeouts from INIT-SIPI-SIPI sequence when using x2apic. Some of the timeouts are pointless since they're waiting for the ICR to ack the IPI delivery and that doesn't happen on x2apic. The others should be benign (and are suggested in the SDM) but removing them makes AP bringup much more reliable on some test boxes. Signed-off-by: Tim Deegan commit f12ee533150761df5a7099c83f2a5fa6c07d1187 Author: Gang Wei Date: Thu Dec 29 10:07:54 2011 +0000 X86: Add a delay between INIT & SIPIs for tboot AP bring-up in X2APIC case Without this delay, Xen could not bring APs up while working with TXT/tboot, because tboot needs some time in APs to handle INIT before becoming ready for receiving SIPIs (this delay was removed as part of c/s 23724 by Tim Deegan). Signed-off-by: Gang Wei Acked-by: Keir Fraser Acked-by: Tim Deegan Committed-by: Tim Deegan Signed-off-by: Jan H. Schönherr Cc: Anthony Liguori Cc: Borislav Petkov Cc: Gang Wei Cc: H. 
Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Deegan Link: http://lkml.kernel.org/r/1430732554-7294-1-git-send-email-jschoenh@amazon.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 61 +++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..63b46414c80c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -555,7 +555,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { - unsigned long send_status, accept_status = 0; + unsigned long send_status = 0, accept_status = 0; int maxlvt, num_starts, j; maxlvt = lapic_get_maxlvt(); @@ -580,22 +580,34 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + if (!cpu_has_x2apic) { + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mdelay(10); + mdelay(10); - pr_debug("Deasserting INIT\n"); + pr_debug("Deasserting INIT\n"); - /* Target chip */ - /* Send IPI */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + /* Target chip */ + /* Send IPI */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mb(); - atomic_set(&init_deasserted, 1); + mb(); + atomic_set(&init_deasserted, 1); + } else if (tboot_enabled()) { + /* + * With tboot AP is actually spinning in a mini-guest before + * receiving INIT. 
Upon receiving INIT ipi, AP need time to + * VMExit, update VMCS to tracking SIPIs and VMResume. + * + * While AP is in root mode handling the INIT the CPU will drop + * any SIPIs + */ + udelay(10); + } /* * Should we send STARTUP IPIs ? @@ -637,20 +649,23 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid); - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); + if (!cpu_has_x2apic) { + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); - pr_debug("Startup point 1\n"); + pr_debug("Startup point 1\n"); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + } - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); -- cgit v1.2.3 From afdf344e08fbec28ab2204a626fa1f260dcb68be Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:53 -0500 Subject: x86/mce/amd: Factor out logging mechanism Refactor the code here to setup struct mce and call mce_log() to log the error. We're going to reuse this in a later patch as part of the deferred error interrupt enablement. No functional change is introduced. 
Suggested-by: Borislav Petkov Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-2-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 55ad9b37cae8..5f25de20db36 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -264,6 +264,27 @@ init: } } +static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +{ + struct mce m; + u64 status; + + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + if (!(status & MCI_STATUS_VAL)) + return; + + mce_setup(&m); + + m.status = status; + m.bank = bank; + if (threshold_err) + m.misc = misc; + + mce_log(&m); + + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); +} + /* * APIC Interrupt Handler */ @@ -273,12 +294,12 @@ init: * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. 
*/ + static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; int cpu = smp_processor_id(); unsigned int bank, block; - struct mce m; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -321,15 +342,7 @@ static void amd_threshold_interrupt(void) return; log: - mce_setup(&m); - rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - if (!(m.status & MCI_STATUS_VAL)) - return; - m.misc = ((u64)high << 32) | low; - m.bank = bank; - mce_log(&m); - - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + __log_error(bank, true, ((u64)high << 32) | low); } /* -- cgit v1.2.3 From 6e6e746e33e9555a7fce159d25314c9df3bcda93 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:54 -0500 Subject: x86/mce/amd: Collect valid address before logging an error amd_decode_mce() needs value in m->addr so it can report the error address correctly. This should be setup in __log_error() before we call mce_log(). We do this because the error address is an important bit of information which should be conveyed to userspace. The correct output then reports proper address, like this: [Hardware Error]: Corrected error, no action required. 
[Hardware Error]: CPU:0 (15:60:0) MC0_STATUS [-|CE|-|-|AddrV|-|-|CECC]: 0x840041000028017b [Hardware Error]: MC0 Error Address: 0x00001f808f0ff040 Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5f25de20db36..607075726e10 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -277,11 +277,14 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) m.status = status; m.bank = bank; + if (threshold_err) m.misc = misc; - mce_log(&m); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + mce_log(&m); wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } -- cgit v1.2.3 From 7559e13fb4abe7880dfaf985d6a1630ca90a67ce Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:55 -0500 Subject: x86/mce: Add support for deferred errors on AMD Deferred errors indicate error conditions that were not corrected, but those errors have not been consumed yet. They require no action from S/W (or action is optional). These errors provide info about a latent uncorrectable MCE that can occur when a poisoned data is consumed by the processor. Newer AMD processors can generate deferred errors and can be configured to generate APIC interrupts on such events. SUCCOR stands for S/W UnCorrectable error COntainment and Recovery. It indicates support for data poisoning in HW and deferred error interrupts. Add new bitfield to mce_vendor_flags for this. 
We use this to verify presence of deferred error interrupts before we enable them in mce_amd.c. While at it, clarify comments in mce_vendor_flags to provide an indication of usages of the bitfields. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-4-git-send-email-Aravind.Gopalakrishnan@amd.com [ beef up commit message, do CPUID(8000_0007) only once. ] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..521e5016aca6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1637,10 +1637,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); mce_adjust_timer = cmci_intel_adjust_timer; break; - case X86_VENDOR_AMD: + + case X86_VENDOR_AMD: { + u32 ebx = cpuid_ebx(0x80000007); + mce_amd_feature_init(c); - mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; + mce_flags.overflow_recov = !!(ebx & BIT(0)); + mce_flags.succor = !!(ebx & BIT(1)); break; + } + default: break; } -- cgit v1.2.3 From 24fd78a81f6d3fe7f7a440c8629f9c52cd5f830e Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:56 -0500 Subject: x86/mce/amd: Introduce deferred error interrupt handler Deferred errors indicate error conditions that were not corrected, but require no action from S/W (or action is optional). These errors provide info about a latent UC MCE that can occur when a poisoned data is consumed by the processor. Processors that report these errors can be configured to generate APIC interrupts to notify OS about the error. Provide an interrupt handler in this patch so that OS can catch these errors as and when they happen. Currently, we simply log the errors and exit the handler as S/W action is not mandated.
Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-5-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 93 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 ++ arch/x86/kernel/irq.c | 6 +++ arch/x86/kernel/irqinit.c | 4 ++ arch/x86/kernel/traps.c | 5 ++ 5 files changed, 113 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 607075726e10..2e7ebe7e1e80 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -12,6 +12,8 @@ * - added support for AMD Family 0x10 processors * May 2012 * - major scrubbing + * May 2015 + * - add support for deferred error interrupts (Aravind Gopalakrishnan) * * All MC4_MISCi registers are shared between multi-cores */ @@ -32,6 +34,7 @@ #include #include #include +#include #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -47,6 +50,13 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -60,6 +70,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; /* * CPU Initialization @@ -205,6 +222,39 @@ static int setup_APIC_mce(int reserved, int new) return reserved; } +static int 
setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -262,6 +312,9 @@ init: mce_threshold_block_init(&b, offset); } } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); } static void __log_error(unsigned int bank, bool threshold_err, u64 misc) @@ -288,6 +341,46 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } +static inline void __smp_deferred_error_interrupt(void) +{ + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); +} + +asmlinkage __visible void smp_deferred_error_interrupt(void) +{ + entering_irq(); + __smp_deferred_error_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +{ + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + __smp_deferred_error_interrupt(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); +} + +/* APIC interrupt handler for deferred 
errors */ +static void amd_deferred_error_interrupt(void) +{ + u64 status; + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + + if (!(status & MCI_STATUS_VAL) || + !(status & MCI_STATUS_DEFERRED)) + continue; + + __log_error(bank, false, 0); + break; + } +} + /* * APIC Interrupt Handler */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 02c2eff7478d..12aea85fe738 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -935,6 +935,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt #endif +#ifdef CONFIG_X86_MCE_AMD +apicinterrupt DEFERRED_ERROR_VECTOR \ + deferred_error_interrupt smp_deferred_error_interrupt +#endif + #ifdef CONFIG_X86_THERMAL_VECTOR apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e5952c225532..590ed6c1bf51 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -116,6 +116,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_puts(p, " Threshold APIC interrupts\n"); #endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..d7ec6e7b2b5b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -135,6 +135,10 @@ static void __init apic_intr_init(void) alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#ifdef CONFIG_X86_MCE_AMD + alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); +#endif + #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC 
timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 324ab5247687..68b1d5979a46 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -827,6 +827,11 @@ asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) { } +asmlinkage __visible void __attribute__((weak)) +smp_deferred_error_interrupt(void) +{ +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task -- cgit v1.2.3 From 868c00bb5980653c44d931384baa2c7f1bde81ef Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:58 -0500 Subject: x86/mce/amd: Rename setup_APIC_mce 'setup_APIC_mce' doesn't give us an indication of why we are going to program LVT. Make that explicit by renaming it to setup_APIC_mce_threshold so we know. No functional change is introduced. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-7-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 2e7ebe7e1e80..70e1bf6f784d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -213,7 +213,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) threshold_restart_bank(&tr); }; -static int setup_APIC_mce(int reserved, int new) +static int setup_APIC_mce_threshold(int reserved, int new) { if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) @@ -302,7 +302,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, 
new); + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) -- cgit v1.2.3 From 8cd161b1f755decd8b7f6c9c7144119281fe11a4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 May 2015 11:38:08 +0200 Subject: x86/traps: Remove superfluous weak definitions and dead code Those were leftovers of the x86 merge, see 081f75bbdc86 ("traps: x86: make traps_32.c and traps_64.c equal") for example and are not needed now. Signed-off-by: Borislav Petkov --- arch/x86/kernel/traps.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 68b1d5979a46..2768bb663f94 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -813,23 +813,6 @@ dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) -smp_deferred_error_interrupt(void) -{ } /* -- cgit v1.2.3 From 3490c0e45f7e5a5b1e5c62e4c60b0e55b2e75e71 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 May 2015 12:06:43 +0200 Subject: x86/mce/amd: Zap changelog It is useless and git history has it all detailed anyway. Update copyright while at it. 
Signed-off-by: Borislav Petkov Cc: Aravind Gopalakrishnan --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 70e1bf6f784d..e99b15077e94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,21 +1,13 @@ /* - * (c) 2005-2012 Advanced Micro Devices, Inc. + * (c) 2005-2015 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. - * * Maintained by: Borislav Petkov * - * April 2006 - * - added support for AMD Family 0x10 processors - * May 2012 - * - major scrubbing - * May 2015 - * - add support for deferred error interrupts (Aravind Gopalakrishnan) - * - * All MC4_MISCi registers are shared between multi-cores + * All MC4_MISCi registers are shared between cores on a node. */ #include #include -- cgit v1.2.3 From dde74f2e4a4447ef838c57e407f7139de3df68cb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 27 Apr 2015 15:21:51 +0200 Subject: x86/asm/entry/64: Tidy up JZ insns after TESTs After TESTs, use logically correct JZ/JNZ mnemonics instead of JE/JNE. This doesn't change code. Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1430140912-7960-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e952f6bf1d6d..8f8b22a361df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -666,7 +666,7 @@ END(irq_entries_start) leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ testl $3, CS-RBP(%rsp) - je 1f + jz 1f SWAPGS 1: /* @@ -721,7 +721,7 @@ ret_from_intr: CFI_ADJUST_CFA_OFFSET RBP testl $3,CS(%rsp) - je retint_kernel + jz retint_kernel /* Interrupt came from user space */ GET_THREAD_INFO(%rcx) @@ -1310,7 +1310,7 @@ ENTRY(error_entry) SAVE_EXTRA_REGS 8 xorl %ebx,%ebx testl $3,CS+8(%rsp) - je error_kernelspace + jz error_kernelspace error_swapgs: SWAPGS error_sti: @@ -1361,7 +1361,7 @@ ENTRY(error_exit) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax - jne retint_kernel + jnz retint_kernel LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi -- cgit v1.2.3 From 03335e95e27fc1f2b17b05b27342ad76986b3cf0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 27 Apr 2015 15:21:52 +0200 Subject: x86/asm/entry/64: Clean up usage of TEST insns By the nature of TEST operation, it is often possible to test a narrower part of the operand: "testl $3, mem" -> "testb $3, mem" This results in shorter insns, because TEST insn has no sign-extending byte-immediate forms unlike other ALU ops. 
text data bss dec hex filename 11674 0 0 11674 2d9a entry_64.o.before 11658 0 0 11658 2d8a entry_64.o Changes in object code: - f7 84 24 88 00 00 00 03 00 00 00 testl $0x3,0x88(%rsp) + f6 84 24 88 00 00 00 03 testb $0x3,0x88(%rsp) - f7 44 24 68 03 00 00 00 testl $0x3,0x68(%rsp) + f6 44 24 68 03 testb $0x3,0x68(%rsp) - f7 84 24 90 00 00 00 03 00 00 00 testl $0x3,0x90(%rsp) + f6 84 24 90 00 00 00 03 testb $0x3,0x90(%rsp) Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1430140912-7960-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8f8b22a361df..60705b032521 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -601,7 +601,7 @@ ENTRY(ret_from_fork) RESTORE_EXTRA_REGS - testl $3,CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) # from kernel_thread? /* * By the time we get here, we have no idea whether our pt_regs, @@ -665,7 +665,7 @@ END(irq_entries_start) leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - testl $3, CS-RBP(%rsp) + testb $3, CS-RBP(%rsp) jz 1f SWAPGS 1: @@ -720,7 +720,7 @@ ret_from_intr: CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP - testl $3,CS(%rsp) + testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ @@ -968,7 +968,7 @@ ENTRY(\sym) .if \paranoid .if \paranoid == 1 CFI_REMEMBER_STATE - testl $3, CS(%rsp) /* If coming from userspace, switch */ + testb $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. 
*/ .endif call paranoid_entry @@ -1309,7 +1309,7 @@ ENTRY(error_entry) SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 xorl %ebx,%ebx - testl $3,CS+8(%rsp) + testb $3, CS+8(%rsp) jz error_kernelspace error_swapgs: SWAPGS @@ -1606,7 +1606,6 @@ end_repeat_nmi: je 1f movq %r12, %cr2 1: - testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: -- cgit v1.2.3 From 63332a8455d8310b77d38779c6c21a660a8d9feb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:33 +0200 Subject: x86/entry: Stop using PER_CPU_VAR(kernel_stack) PER_CPU_VAR(kernel_stack) is redundant: - On the 64-bit build, we can use PER_CPU_VAR(cpu_tss + TSS_sp0). - On the 32-bit build, we can use PER_CPU_VAR(cpu_current_top_of_stack). PER_CPU_VAR(kernel_stack) will be deleted by a separate change. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7423e3e2f5c5..c13b86b40176 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -216,7 +216,7 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack),%rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp /* Construct struct pt_regs on stack */ pushq_cfi $__USER_DS /* pt_regs->ss */ -- cgit v1.2.3 From fed7c3f0f750f225317828d691e9eb76eec887b3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:34 +0200 Subject: x86/entry: Remove unused 'kernel_stack' per-cpu variable Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: 
Alexei Starovoitov Cc: Andrew Morton Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ---- arch/x86/kernel/process_32.c | 5 +---- arch/x86/kernel/process_64.c | 3 --- arch/x86/kernel/smpboot.c | 2 -- 4 files changed, 1 insertion(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a62cf04dac8a..6bec0b55863e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1155,10 +1155,6 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8ed2106b06da..a99900cedc22 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -302,13 +302,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). 
*/ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ddfdbf74f174..82134506faa8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -409,9 +409,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Reload esp0 and ss1. This changes current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE); - /* * Now maybe reload the debug registers and handle I/O bitmaps */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..023cccf5a4ae 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -792,8 +792,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; } /* -- cgit v1.2.3 From 3a23208e69679597e767cf3547b1a30dd845d9b5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:35 +0200 Subject: x86/entry: Define 'cpu_current_top_of_stack' for 64-bit code 32-bit code has PER_CPU_VAR(cpu_current_top_of_stack). 64-bit code uses somewhat more obscure: PER_CPU_VAR(cpu_tss + TSS_sp0). Define the 'cpu_current_top_of_stack' macro on CONFIG_X86_64 as well so that the PER_CPU_VAR(cpu_current_top_of_stack) expression can be used in both 32-bit and 64-bit code. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c13b86b40176..09c3f9e0e07e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -216,7 +216,7 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp /* Construct struct pt_regs on stack */ pushq_cfi $__USER_DS /* pt_regs->ss */ -- cgit v1.2.3 From c5bde906d2916d214d78cd8b67d665bf09867033 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:50 -0400 Subject: x86/irq: Merge irq_regs & irq_stat Move irq_regs and irq_stat definitions to irq.c. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 6 ++++++ arch/x86/kernel/irq_32.c | 6 ------ arch/x86/kernel/irq_64.c | 6 ------ 3 files changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e5952c225532..fe2ed8bb507b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,12 @@ #define CREATE_TRACE_POINTS #include +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index f9fd86a7fcc7..cd74f5978ab9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -21,12 +21,6 @@ #include -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - #ifdef CONFIG_DEBUG_STACKOVERFLOW int sysctl_panic_on_stackoverflow __read_mostly; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 394e643d7830..bc4604e500a3 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,12 +20,6 @@ #include #include -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - int sysctl_panic_on_stackoverflow; /* -- cgit v1.2.3 From 51bb92843edcba5a58138cad25ced97923048add Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:52 -0400 Subject: x86/asm/entry: Remove SYSCALL_VECTOR Use IA32_SYSCALL_VECTOR for both compat and native. 
Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 324ab5247687..5e0791f9d3dc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -997,8 +997,8 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - set_bit(SYSCALL_VECTOR, used_vectors); + set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif /* -- cgit v1.2.3 From 8b455e6577f325289cf2d1b20f493b2fe5c6c316 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:53 -0400 Subject: x86/asm/entry/irq: Clean up IRQn_VECTOR macros Since the ISA irqs are in a single block, use ISA_IRQ_VECTOR(irq) instead of individual macros. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-5-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/i8259.c | 8 ++++---- arch/x86/kernel/irqinit.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f4dc2462a1ac..e01e4117188a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -258,11 +258,11 @@ int __init arch_early_ioapic_init(void) /* * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + * ISA_IRQ_VECTOR(irq) for all cpu's. */ for (i = 0; i < nr_legacy_irqs(); i++) { cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; + cfg->vector = ISA_IRQ_VECTOR(i); cpumask_setall(cfg->domain); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6cedd7914581..82d44c314a3f 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -314,7 +314,7 @@ void setup_vector_irq(int cpu) * legacy vector to irq mapping: */ for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; + per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq; __setup_vector_irq(cpu); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index e7cc5370cd2f..16cb827a5b27 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi) */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ + outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << 
PIC_CASCADE_IR, PIC_MASTER_IMR); @@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi) outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ + outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..dc1e08d23552 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -86,7 +86,7 @@ void __init init_IRQ(void) int i; /* - * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If * these IRQ's are handled by more mordern controllers like IO-APIC, @@ -94,7 +94,7 @@ void __init init_IRQ(void) * irq's migrate etc. */ for (i = 0; i < nr_legacy_irqs(); i++) - per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i; x86_init.irqs.intr_init(); } -- cgit v1.2.3 From f21262b8e092a770e39fbd405cc18a0247c3af68 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 11 May 2015 10:15:46 +0200 Subject: x86/alternatives: Switch AMD F15h and later to the P6 NOPs Software optimization guides for both F15h and F16h cite those NOPs as the optimal ones. A microbenchmark confirms that actually even older families are better with the single-insn NOPs so switch to them for the alternatives. Cycles count below includes the loop overhead of the measurement but that overhead is the same with all runs. 
F10h, revE: ----------- Running NOP tests, 1000 NOPs x 1000000 repetitions K8: 90 288.212282 cycles 66 90 288.220840 cycles 66 66 90 288.219447 cycles 66 66 66 90 288.223204 cycles 66 66 90 66 90 571.393424 cycles 66 66 90 66 66 90 571.374919 cycles 66 66 66 90 66 66 90 572.249281 cycles 66 66 66 90 66 66 66 90 571.388651 cycles P6: 90 288.214193 cycles 66 90 288.225550 cycles 0f 1f 00 288.224441 cycles 0f 1f 40 00 288.225030 cycles 0f 1f 44 00 00 288.233558 cycles 66 0f 1f 44 00 00 324.792342 cycles 0f 1f 80 00 00 00 00 325.657462 cycles 0f 1f 84 00 00 00 00 00 430.246643 cycles F14h: ---- Running NOP tests, 1000 NOPs x 1000000 repetitions K8: 90 510.404890 cycles 66 90 510.432117 cycles 66 66 90 510.561858 cycles 66 66 66 90 510.541865 cycles 66 66 90 66 90 1014.192782 cycles 66 66 90 66 66 90 1014.226546 cycles 66 66 66 90 66 66 90 1014.334299 cycles 66 66 66 90 66 66 66 90 1014.381205 cycles P6: 90 510.436710 cycles 66 90 510.448229 cycles 0f 1f 00 510.545100 cycles 0f 1f 40 00 510.502792 cycles 0f 1f 44 00 00 510.589517 cycles 66 0f 1f 44 00 00 510.611462 cycles 0f 1f 80 00 00 00 00 511.166794 cycles 0f 1f 84 00 00 00 00 00 511.651641 cycles F15h: ----- Running NOP tests, 1000 NOPs x 1000000 repetitions K8: 90 243.128396 cycles 66 90 243.129883 cycles 66 66 90 243.131631 cycles 66 66 66 90 242.499324 cycles 66 66 90 66 90 481.829083 cycles 66 66 90 66 66 90 481.884413 cycles 66 66 66 90 66 66 90 481.851446 cycles 66 66 66 90 66 66 66 90 481.409220 cycles P6: 90 243.127026 cycles 66 90 243.130711 cycles 0f 1f 00 243.122747 cycles 0f 1f 40 00 242.497617 cycles 0f 1f 44 00 00 245.354461 cycles 66 0f 1f 44 00 00 361.930417 cycles 0f 1f 80 00 00 00 00 362.844944 cycles 0f 1f 84 00 00 00 00 00 480.514948 cycles F16h: ----- Running NOP tests, 1000 NOPs x 1000000 repetitions K8: 90 507.793298 cycles 66 90 507.789636 cycles 66 66 90 507.826490 cycles 66 66 66 90 507.859075 cycles 66 66 90 66 90 1008.663129 cycles 66 66 90 66 66 90 1008.696259 cycles 66 66 66 90 66 66 
90 1008.692517 cycles 66 66 66 90 66 66 66 90 1008.755399 cycles P6: 90 507.795232 cycles 66 90 507.794761 cycles 0f 1f 00 507.834901 cycles 0f 1f 40 00 507.822629 cycles 0f 1f 44 00 00 507.838493 cycles 66 0f 1f 44 00 00 507.908597 cycles 0f 1f 80 00 00 00 00 507.946417 cycles 0f 1f 84 00 00 00 00 00 507.954960 cycles Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431332153-18566-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index aef653193160..b0932c4341b3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -227,6 +227,15 @@ void __init arch_init_ideal_nops(void) #endif } break; + + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 > 0xf) { + ideal_nops = p6_nops; + return; + } + + /* fall through */ + default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; -- cgit v1.2.3 From cd2f6a5a4704a359635eb34919317052e6a96ba7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 11 May 2015 10:15:52 +0200 Subject: x86/mm/mtrr: Remove incorrect address check in __mtrr_type_lookup() __mtrr_type_lookup() checks MTRR fixed ranges when mtrr_state.have_fixed is set and start is less than 0x100000. However, the 'else if (start < 0x1000000)' in the code checks with an incorrect address as it has an extra-zero in the address. The code still runs correctly as this check is meaningless, though. This patch replaces the incorrect address check with 'else' with no condition. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1427234921-19737-4-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1431332153-18566-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d74f7b3c6ba..5b239679cfc9 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -137,7 +137,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) idx = 1 * 8; idx += ((start - 0x80000) >> 14); return mtrr_state.fixed_ranges[idx]; - } else if (start < 0x1000000) { + } else { idx = 3 * 8; idx += ((start - 0xC0000) >> 12); return mtrr_state.fixed_ranges[idx]; -- cgit v1.2.3 From d68921f9bd148359e6d01c84aaa2e32bfbd82970 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 11 May 2015 17:27:09 -0400 Subject: x86/smp/boot: Add cmdline "cpu_init_udelay=N" to specify cpu_up() delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No change to default behavior. Replace the hard-coded mdelay(10) in cpu_up() with a variable udelay, that is set to a defined default -- rather than a magic number. Add a boot-time override, "cpu_init_udelay=N" Signed-off-by: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. 
Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2fe8e6c798e8def271122f62df9bbf58dc283e2a.1431379433.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 51203f60587f..0629a8e513af 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -513,6 +513,27 @@ void __inquire_remote_apic(int apicid) } } +/* + * The Multiprocessor Specification 1.4 (1997) example code suggests + * that there should be a 10ms delay between the BSP asserting INIT + * and de-asserting INIT, when starting a remote processor. + * But that slows boot and resume on modern processors, which include + * many cores and don't require that delay. + * + * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + */ +#define UDELAY_10MS_DEFAULT 10000 + +static unsigned int init_udelay = UDELAY_10MS_DEFAULT; + +static int __init cpu_init_udelay(char *str) +{ + get_option(&str, &init_udelay); + + return 0; +} +early_param("cpu_init_udelay", cpu_init_udelay); + /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this @@ -584,7 +605,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(10); + mdelay(init_udelay); pr_debug("Deasserting INIT\n"); -- cgit v1.2.3 From 1a744cb356c57303fc97eb15a298032170f841fa Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 11 May 2015 17:27:10 -0400 Subject: x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modern processor families do not require the 10ms delay in cpu_up() to de-assert INIT. 
This speeds up boot and resume by 10ms per (application) processor. Signed-off-by: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/021ce30c88f216ad39686646421194dc25671e55.1431379433.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0629a8e513af..85bd6aad8c74 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -521,6 +521,7 @@ void __inquire_remote_apic(int apicid) * many cores and don't require that delay. * * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + * Modern processor families are quirked to remove the delay entirely. */ #define UDELAY_10MS_DEFAULT 10000 @@ -534,6 +535,18 @@ static int __init cpu_init_udelay(char *str) } early_param("cpu_init_udelay", cpu_init_udelay); +static void __init smp_quirk_init_udelay(void) +{ + /* if cmdline changed it from default, leave it alone */ + if (init_udelay != UDELAY_10MS_DEFAULT) + return; + + /* if modern processor, use no delay */ + if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + init_udelay = 0; +} + /* * Poke the other CPU in the eye via NMI to wake it up. 
Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this @@ -1210,6 +1223,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); set_mtrr_aps_delayed_init(); + + smp_quirk_init_udelay(); } void arch_enable_nonboot_cpus_begin(void) -- cgit v1.2.3 From 853b160aaafbe27d6304c8832bb7340d57c6b04e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 May 2015 08:40:49 +0200 Subject: Revert f5d6a52f5111 ("x86/smpboot: Skip delays during SMP initialization similar to Xen") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Huang Ying reported x86 boot hangs due to this commit. Turns out that the change, despite its changelog, does more than just change timeouts: it also changes the way we assert/deassert INIT via the APIC_DM_INIT IPI, in the x2apic case it skips the deassert step. This is historically fragile code and the patch did not improve it, so revert these changes. This commit: 1a744cb356c5 ("x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors") independently removes the worst of the delays (the 10 msec delay). The remaining delays can be addressed one by one, combined with careful testing. Reported-by: Huang Ying Cc: Anthony Liguori Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Gang Wei Cc: H. Peter Anvin Cc: Jan H. 
Schönherr Cc: Len Brown Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Deegan Link: http://lkml.kernel.org/r/1430732554-7294-1-git-send-email-jschoenh@amazon.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 58 ++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85bd6aad8c74..b9aaa3930b2f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -614,34 +614,22 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); - if (!cpu_has_x2apic) { - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mdelay(init_udelay); + mdelay(init_udelay); - pr_debug("Deasserting INIT\n"); + pr_debug("Deasserting INIT\n"); - /* Target chip */ - /* Send IPI */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + /* Target chip */ + /* Send IPI */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mb(); - atomic_set(&init_deasserted, 1); - } else if (tboot_enabled()) { - /* - * With tboot AP is actually spinning in a mini-guest before - * receiving INIT. Upon receiving INIT ipi, AP need time to - * VMExit, update VMCS to tracking SIPIs and VMResume. - * - * While AP is in root mode handling the INIT the CPU will drop - * any SIPIs - */ - udelay(10); - } + mb(); + atomic_set(&init_deasserted, 1); /* * Should we send STARTUP IPIs ? 
@@ -683,22 +671,20 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid); - if (!cpu_has_x2apic) { - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); - pr_debug("Startup point 1\n"); + pr_debug("Startup point 1\n"); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); - } + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); -- cgit v1.2.3 From 4a00c95dcdba45c9592af2e908c0816fd54f5544 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 11 May 2015 18:56:49 +0900 Subject: x86/hpet: Pass proper pointer to irq_alloc_info Fix the following oops: hpet_msi_get_hwirq+0x1f/0x27 msi_domain_alloc+0x35/0xfe ? trace_hardirqs_on_caller+0x16c/0x188 irq_domain_alloc_irqs_recursive+0x51/0x95 __irq_domain_alloc_irqs+0x151/0x223 hpet_assign_irq+0x5d/0x68 hpet_msi_capability_lookup+0x121/0x1cb ? hpet_enable+0x2b4/0x2b4 hpet_late_init+0x5f/0xf2 ? hpet_enable+0x2b4/0x2b4 do_one_initcall+0x184/0x199 kernel_init_freeable+0x1af/0x237 ? rest_init+0x13a/0x13a kernel_init+0xe/0xd4 ret_from_fork+0x3f/0x70 ? rest_init+0x13a/0x13a Since 3cb96f0c9733 ('x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains') hpet_msi_capability_lookup() uses hpet_assign_irq(). The latter initializes irq_alloc_info on the stack, but passes a NULL pointer to irq_domain_alloc_irqs(), which causes a NULL pointer dereference later in hpet_msi_get_hwirq(). Pass the pointer to the irq_alloc_info to irq_domain_alloc_irqs(). 
Fixes: 3cb96f0c9733 'x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains' Signed-off-by: Sergey Senozhatsky Reviewed-by: Jiang Liu Cc: Sergey Senozhatsky Link: http://lkml.kernel.org/r/20150512041444.GA1094@swordfish Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 58fde664e7c0..ef516afa20bb 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -351,6 +351,6 @@ int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, info.hpet_id = hpet_dev_id(domain); info.hpet_index = dev_num; - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, NULL); + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } #endif -- cgit v1.2.3 From 486ca539caa082c7f2929c207af1b3ce2a304489 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 7 May 2015 10:53:56 +0800 Subject: x86, irq: Allocate CPU vectors from device local CPUs if possible On NUMA systems, an IO device may be associated with a NUMA node. It may improve IO performance to allocate resources, such as memory and interrupts, from device local node. This patch introduces a mechanism to support CPU vector allocation policies. It tries to allocate CPU vectors from CPUs on device local node first, and then fallback to all online(global) CPUs. This mechanism may be used to support NumaConnect systems to allocate CPU vectors from device local node. Signed-off-by: Jiang Liu Tested-by: Daniel J Blueman Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1430967244-28905-1-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 2766747e1a3b..b590c9d6736a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -210,6 +210,18 @@ static int assign_irq_vector(int irq, struct apic_chip_data *data, return err; } +static int assign_irq_vector_policy(int irq, int node, + struct apic_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->mask) + return assign_irq_vector(irq, data, info->mask); + if (node != NUMA_NO_NODE && + assign_irq_vector(irq, data, cpumask_of_node(node)) == 0) + return 0; + return assign_irq_vector(irq, data, apic->target_cpus()); +} + static void clear_irq_vector(int irq, struct apic_chip_data *data) { int cpu, vector; @@ -258,12 +270,6 @@ void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) memset(dst, 0, sizeof(*dst)); } -static inline const struct cpumask * -irq_alloc_info_get_mask(struct irq_alloc_info *info) -{ - return (!info || !info->mask) ? 
apic->target_cpus() : info->mask; -} - static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { @@ -289,7 +295,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, { struct irq_alloc_info *info = arg; struct apic_chip_data *data; - const struct cpumask *mask; struct irq_data *irq_data; int i, err; @@ -300,7 +305,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) return -ENOSYS; - mask = irq_alloc_info_get_mask(info); for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); @@ -318,7 +322,8 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector(virq, data, mask); + err = assign_irq_vector_policy(virq, irq_data->node, data, + info); if (err) goto error; } -- cgit v1.2.3 From 6af7faf6076697a39438cf38e21b4035e2ebdac9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 May 2015 15:48:25 +0200 Subject: x86: Use entering[_ack]_irq() instead of open coding it Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 6 ++---- arch/x86/kernel/cpu/mshyperv.c | 6 ++---- arch/x86/kernel/irq.c | 16 ++++------------ 3 files changed, 8 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b590c9d6736a..28eba2d38b15 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -542,9 +542,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; - ack_APIC_irq(); - irq_enter(); - exit_idle(); + entering_ack_irq(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -596,7 +594,7 @@ unlock: 
raw_spin_unlock(&desc->lock); } - irq_exit(); + exiting_irq(); } static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 939155ffdece..aad4bd84b475 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - irq_enter(); - exit_idle(); - + entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index fe2ed8bb507b..be3894512820 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -198,8 +198,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - irq_enter(); - exit_idle(); + entering_irq(); irq = __this_cpu_read(vector_irq[vector]); @@ -215,7 +214,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } } - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); return 1; @@ -250,16 +249,9 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); - - irq_enter(); - - exit_idle(); - + entering_ack_irq(); inc_irq_stat(kvm_posted_intr_ipis); - - irq_exit(); - + exiting_irq(); set_irq_regs(old_regs); } #endif -- cgit v1.2.3 From 6dc178760553605c58d78bd403dfcb4e042c5b72 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 May 2015 15:50:45 +0200 Subject: x86: Consolidate irq entering inlines smp.c and irq_work.c implement the same inline helper. Move it to apic.h and use it everywhere. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra --- arch/x86/kernel/irq_work.c | 10 ++-------- arch/x86/kernel/smp.c | 19 ++++++------------- 2 files changed, 8 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 15d741ddfeeb..dc5fa6a1e8d6 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -10,12 +10,6 @@ #include #include -static inline void irq_work_entering_irq(void) -{ - irq_enter(); - ack_APIC_irq(); -} - static inline void __smp_irq_work_interrupt(void) { inc_irq_stat(apic_irq_work_irqs); @@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void) __visible void smp_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); __smp_irq_work_interrupt(); trace_irq_work_exit(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index be8e1bde07aa..15aaa69bbb5e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { - ack_APIC_irq(); - irq_enter(); + ipi_entering_ack_irq(); stop_this_cpu(NULL); irq_exit(); } @@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -static inline void smp_entering_irq(void) -{ - ack_APIC_irq(); - irq_enter(); -} - __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* @@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) * scheduler_ipi(). This is OK, since those functions are allowed * to nest. 
*/ - smp_entering_irq(); + ipi_entering_ack_irq(); trace_reschedule_entry(RESCHEDULE_VECTOR); __smp_reschedule_interrupt(); trace_reschedule_exit(RESCHEDULE_VECTOR); @@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void) __visible void smp_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); __smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); @@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void) __visible void smp_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); __smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); -- cgit v1.2.3 From e839004b49c571e20006092cbe9da8f2c95d2e71 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 16 May 2015 18:17:59 +0200 Subject: x86/asm/head*.S: Change global labels to local Make the disassembly look less confusing: -- head_64.o.before.asm ++ head_64.o.after.asm 0000000000000120 : 120: fc cld 121: 83 3c 24 02 cmpl $0x2,(%rsp) - 125: 0f 84 9d 00 00 00 je 1c8 + 125: 0f 84 9d 00 00 00 je 1c8 12b: 83 3d 00 00 00 00 02 cmpl $0x2,0x0(%rip) # 132 132: 74 7e je 1b2 134: ff 05 00 00 00 00 incl 0x0(%rip) # 13a @@ -1198,9 +1198,7 @@ Disassembly of section .init.text: 1bf: 5a pop %rdx 1c0: 59 pop %rcx 1c1: 58 pop %rax - 1c2: ff 0d 00 00 00 00 decl 0x0(%rip) # 1c8 - -00000000000001c8 : + 1c2: ff 0d 00 00 00 00 decl 0x0(%rip) # 1c8 1c8: 48 83 c4 10 add 
$0x10,%rsp 1cc: 48 cf iretq -- head_32.o.before.asm ++ head_32.o.after.asm 0000016c : 16c: fc cld 16d: 83 3c 24 02 cmpl $0x2,(%esp) - 171: 74 73 je 1e6 + 171: 74 73 je 1e6 173: 36 83 3d 00 00 00 00 cmpl $0x2,%ss:0x0 17a: 02 17b: 74 5a je 1d7 @@ -483,8 +483,6 @@ Disassembly of section .init.text: 1dd: 59 pop %ecx 1de: 58 pop %eax 1df: 36 ff 0d 00 00 00 00 decl %ss:0x0 - -000001e6 : 1e6: 83 c4 08 add $0x8,%esp 1e9: cf iret 1ea: 66 90 xchg %ax,%ax No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431793079-11153-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 4 ++-- arch/x86/kernel/head_64.S | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d031bad9e07e..02d257256200 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -547,7 +547,7 @@ ENTRY(early_idt_handler) cld cmpl $2,(%esp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,%ss:early_recursion_flag je hlt_loop @@ -600,7 +600,7 @@ ex_entry: pop %ecx pop %eax decl %ss:early_recursion_flag -is_nmi: +.Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret ENDPROC(early_idt_handler) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ae6588b301c2..43eafc8afb69 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -344,7 +344,7 @@ ENTRY(early_idt_handler) cld cmpl $2,(%rsp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,early_recursion_flag(%rip) jz 1f @@ -409,7 +409,7 @@ ENTRY(early_idt_handler) popq %rcx popq %rax decl early_recursion_flag(%rip) -is_nmi: +.Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN 
ENDPROC(early_idt_handler) -- cgit v1.2.3 From adeb5537849d9db428fe0ddc3562e5a765a347e2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 15 May 2015 22:39:06 +0200 Subject: x86/asm/entry/64: Use shorter MOVs from segment registers The "movw %ds,%cx" instruction needs a 0x66 prefix, while "movl %ds,%ecx" does not. The difference is that latter form (on 64-bit CPUs) overwrites the entire %ecx, not only its lower half. But subsequent code doesn't depend on the value of upper half of %ecx, so we can safely use the shorter instruction. The new code is also faster than the old one - now we don't depend on the old value of %ecx, but this code fragment is not performance-critical so it does not matter much. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1431722346-26585-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 09c3f9e0e07e..47b95813dc37 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1190,17 +1190,17 @@ ENTRY(xen_failsafe_callback) /*CFI_REL_OFFSET ds,DS*/ CFI_REL_OFFSET r11,8 CFI_REL_OFFSET rcx,0 - movw %ds,%cx + movl %ds,%ecx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE jne 1f - movw %es,%cx + movl %es,%ecx cmpw %cx,0x18(%rsp) jne 1f - movw %fs,%cx + movl %fs,%ecx cmpw %cx,0x20(%rsp) jne 1f - movw %gs,%cx + movl %gs,%ecx cmpw %cx,0x28(%rsp) jne 1f /* All segments match their saved values => Category 2 (Bad IRET). 
*/ -- cgit v1.2.3 From 7cb685982157567dcc55eb92d1c38d237465203b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 May 2015 12:05:13 +0200 Subject: x86/smp/boot: Fix legacy SMP bootup slow-boot bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So while testing kernels using tools/kvm/ (kvmtool) I noticed that it booted super slow: [ 0.142991] Performance Events: no PMU driver, software events only. [ 0.149265] x86: Booting SMP configuration: [ 0.149765] .... node #0, CPUs: #1 [ 0.148304] kvm-clock: cpu 1, msr 2:1bfe9041, secondary cpu clock [ 10.158813] KVM setup async PF for cpu 1 [ 10.159000] #2 [ 10.159000] kvm-stealtime: cpu 1, msr 211a4d400 [ 10.158829] kvm-clock: cpu 2, msr 2:1bfe9081, secondary cpu clock [ 20.167805] KVM setup async PF for cpu 2 [ 20.168000] #3 [ 20.168000] kvm-stealtime: cpu 2, msr 211a8d400 [ 20.167818] kvm-clock: cpu 3, msr 2:1bfe90c1, secondary cpu clock [ 30.176902] KVM setup async PF for cpu 3 [ 30.177000] #4 [ 30.177000] kvm-stealtime: cpu 3, msr 211acd400 One CPU booted up per 10 seconds. With 120 CPUs that takes a while. Bisection pinpointed this commit: 853b160aaafb ("Revert f5d6a52f5111 ("x86/smpboot: Skip delays during SMP initialization similar to Xen")") But that commit just restores previous behavior, so it cannot cause the problem. After some head scratching it turns out that these two commits: 1a744cb356c5 ("x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors") d68921f9bd14 ("x86/smp/boot: Add cmdline "cpu_init_udelay=N" to specify cpu_up() delay") added the following code to smpboot.c: - mdelay(10); + mdelay(init_udelay); Note the mismatch in the units: the delay is called 'udelay' and is set to microseconds - while the function used here is actually 'mdelay', which counts in milliseconds ... So the delay for legacy systems is off by a factor of 1,000, so instead of 10 msecs we waited for 10 seconds ... 
The reason bisection pointed to 853b160aaafb was that 853b160aaafb removed a (broken) boot-time speedup patch, which masked the factor 1,000 bug. Fix it by using udelay(). This fixes my bootup problems. Cc: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b9aaa3930b2f..fd6291c921b6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -617,7 +617,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(init_udelay); + udelay(init_udelay); pr_debug("Deasserting INIT\n"); -- cgit v1.2.3 From ea6cd25058f39ac69623efdcbd94a7fc7d4d13f0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 9 May 2015 20:27:37 -0400 Subject: x86: Rename eisa_set_level_irq to elcr_set_level_irq This routine has been around for over a decade, but with EISA being dead and abandoned for about twice that long, the name can be kind of confusing. The function is going at the PIC Edge/Level Configuration Registers (ELCR), so rename it as such and mentally decouple it from the long since dead EISA bus. Signed-off-by: Paul Gortmaker Reviewed-by: Maciej W. Rozycki Acked-by: Pavel Machek Cc: Rafael J. 
Wysocki Cc: Len Brown Cc: Bjorn Helgaas Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1431217657-934-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 271293ad89d7..e49ee24da85e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -608,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); + elcr_set_level_irq(gsi); #endif return gsi; -- cgit v1.2.3 From a2f1c8bdc02bfcaa5a658283b883fdb54e328b36 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:15 +0800 Subject: x86/irq/msi: Implement irq_set_vcpu_affinity for remapped MSI irqs Implement irq_set_vcpu_affinity for pci_msi_ir_controller. 
Signed-off-by: Feng Wu Reviewed-by: Jiang Liu Link: http://lkml.kernel.org/r/1432026437-16560-3-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ef516afa20bb..1a9d735e09c6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -152,6 +152,7 @@ static struct irq_chip pci_msi_ir_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- cgit v1.2.3 From f6b3c72c23661e5534cd2eede16e9bac7ebb761c Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:16 +0800 Subject: x86/irq: Define a global vector for VT-d Posted-Interrupts Currently, we use a global vector as the Posted-Interrupts Notification Event for all the vCPUs in the system. We need to introduce another global vector for VT-d Posted-Interrupts, which will be used to wake up the sleeping vCPU when an external interrupt from a direct-assigned device happens for that vCPU. [ tglx: Removed a gazillion of extra newlines ] Signed-off-by: Feng Wu Cc: jiang.liu@linux.intel.com Link: http://lkml.kernel.org/r/1432026437-16560-4-git-send-email-feng.wu@intel.com Suggested-by: Yang Zhang Acked-by: H. 
Peter Anvin Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 26 ++++++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 2 ++ 3 files changed, 30 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 47b95813dc37..22aadc917868 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -916,6 +916,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR \ kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \ + kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index be3894512820..90b2f7052f5b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -242,6 +242,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs) } #ifdef CONFIG_HAVE_KVM +static void dummy_handler(void) {} +static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; + +void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) +{ + if (handler) + kvm_posted_intr_wakeup_handler = handler; + else + kvm_posted_intr_wakeup_handler = dummy_handler; +} +EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); + /* * Handler for POSTED_INTERRUPT_VECTOR. */ @@ -254,6 +266,20 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); } + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
+ */ +__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); + inc_irq_stat(kvm_posted_intr_wakeup_ipis); + kvm_posted_intr_wakeup_handler(); + exiting_irq(); + set_irq_regs(old_regs); +} #endif __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index dc1e08d23552..680723a8e4b6 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -144,6 +144,8 @@ static void __init apic_intr_init(void) #ifdef CONFIG_HAVE_KVM /* IPI for KVM to deliver posted interrupt */ alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); + /* IPI for KVM to deliver interrupt to wake up tasks */ + alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); #endif /* IPI vectors for APIC spurious and error interrupts */ -- cgit v1.2.3 From 501b32653ebf49114cccb9afbf9150cf18fd8700 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:17 +0800 Subject: x86/irq: Show statistics information for posted-interrupts Show the statistics information for notification event and wakeup event for posted-interrupt in /proc/interrupts. 
[ tglx: Named the short identifiers PIN and PIW to match the long identifiers ] Signed-off-by: Feng Wu Cc: jiang.liu@linux.intel.com Link: http://lkml.kernel.org/r/1432026437-16560-5-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 90b2f7052f5b..7e10c8b4b318 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -141,6 +141,18 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#ifdef CONFIG_HAVE_KVM + seq_printf(p, "%*s: ", prec, "PIN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); + seq_puts(p, " Posted-interrupt notification event\n"); + + seq_printf(p, "%*s: ", prec, "PIW"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_wakeup_ipis); + seq_puts(p, " Posted-interrupt wakeup event\n"); #endif return 0; } -- cgit v1.2.3 From cdeb6048940fa4bfb429e2f1cba0d28a11e20cd5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 22 May 2015 16:15:47 -0700 Subject: x86/asm/irq: Stop relying on magic JMP behavior for early_idt_handlers The early_idt_handlers asm code generates an array of entry points spaced nine bytes apart. It's not really clear from that code or from the places that reference it what's going on, and the code only works in the first place because GAS never generates two-byte JMP instructions when jumping to global labels. Clean up the code to generate the correct array stride (member size) explicitly. This should be considerably more robust against screw-ups, as GAS will warn if a .fill directive has a negative count. Using '. 
=' to advance would have been even more robust (it would generate an actual error if it tried to move backwards), but it would pad with nulls, confusing anyone who tries to disassemble the code. The new scheme should be much clearer to future readers. While we're at it, improve the comments and rename the array and common code. Binutils may start relaxing jumps to non-weak labels. If so, this change will fix our build, and we may need to backport this change. Before, on x86_64: 0000000000000000 : 0: 6a 00 pushq $0x0 2: 6a 00 pushq $0x0 4: e9 00 00 00 00 jmpq 9 5: R_X86_64_PC32 early_idt_handler-0x4 ... 48: 66 90 xchg %ax,%ax 4a: 6a 08 pushq $0x8 4c: e9 00 00 00 00 jmpq 51 4d: R_X86_64_PC32 early_idt_handler-0x4 ... 117: 6a 00 pushq $0x0 119: 6a 1f pushq $0x1f 11b: e9 00 00 00 00 jmpq 120 11c: R_X86_64_PC32 early_idt_handler-0x4 After: 0000000000000000 : 0: 6a 00 pushq $0x0 2: 6a 00 pushq $0x0 4: e9 14 01 00 00 jmpq 11d ... 48: 6a 08 pushq $0x8 4a: e9 d1 00 00 00 jmpq 120 4f: cc int3 50: cc int3 ... 117: 6a 00 pushq $0x0 119: 6a 1f pushq $0x1f 11b: eb 03 jmp 120 11d: cc int3 11e: cc int3 11f: cc int3 Signed-off-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Binutils Cc: Borislav Petkov Cc: H.J. 
Lu Cc: Jan Beulich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ac027962af343b0c599cbfcf50b945ad2ef3d7a8.1432336324.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/head_32.S | 33 ++++++++++++++++++--------------- arch/x86/kernel/head_64.S | 20 +++++++++++--------- 3 files changed, 30 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2b55ee6db053..5a4668136e98 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 02d257256200..544dec4cc605 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -478,21 +478,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. 
*/ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -524,26 +525,28 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. 
+ */ cld cmpl $2,(%esp) # X86_TRAP_NMI @@ -603,7 +606,7 @@ ex_entry: .Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 43eafc8afb69..e5c27f729a38 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,26 +321,28 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI @@ -412,7 +414,7 @@ ENTRY(early_idt_handler) .Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA -- cgit v1.2.3 From 5c31b2800d8d3e735e5ecac8fc13d1cf862fd330 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 26 May 2015 10:28:21 +0200 Subject: x86/mce: Fix monarch timeout setting through the mce= cmdline option Using "mce=1,10000000" on the kernel cmdline to change the monarch timeout does not work. The cause is that get_option() does parse a subsequent comma in the option string and signals that with a return value. So we don't need to check for a second comma ourselves. 
Signed-off-by: Xie XiuQi Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1432120943-25028-1-git-send-email-xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1432628901-18044-19-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 521e5016aca6..0cbcd3183acf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2014,11 +2014,8 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &(cfg->tolerant)); - if (*str == ',') { - ++str; + if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); - } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; -- cgit v1.2.3 From 7f0431e3dc8953f41e9433581c1fdd7ee45860b0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:05 +0200 Subject: x86/mm/mtrr: Fix MTRR lookup to handle an inclusive entry When an MTRR entry is inclusive to a requested range, i.e. the start and end of the request are not within the MTRR entry range but the range contains the MTRR entry entirely: range_start ... [mtrr_start ... mtrr_end] ... range_end __mtrr_type_lookup() ignores such a case because both start_state and end_state are set to zero. This bug can cause the following issues: 1) reserve_memtype() tracks an effective memory type in case a request type is WB (ex. /dev/mem blindly uses WB). Missing to track with its effective type causes a subsequent request to map the same range with the effective type to fail. 
2) pud_set_huge() and pmd_set_huge() check if a requested range has any overlap with MTRRs. Missing to detect an overlap may cause a performance penalty or undefined behavior. This patch fixes the bug by adding a new flag, 'inclusive', to detect the inclusive case. This case is then handled in the same way as end_state:1 since the first region is the same. With this fix, __mtrr_type_lookup() handles the inclusive case properly. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-3-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 5b239679cfc9..e202d26f64a2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -154,7 +154,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; + unsigned short start_state, end_state, inclusive; if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) continue; @@ -166,19 +166,27 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); + inclusive = ((start < base) && (end > base)); - if (start_state != end_state) { + if ((start_state != end_state) || inclusive) { /* * We have start:end spanning across an MTRR. 
- * We split the region into - * either - * (start:mtrr_end) (mtrr_end:end) - * or - * (start:mtrr_start) (mtrr_start:end) + * We split the region into either + * + * - start_state:1 + * (start:mtrr_end)(mtrr_end:end) + * - end_state:1 + * (start:mtrr_start)(mtrr_start:end) + * - inclusive:1 + * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) + * * depending on kind of overlap. - * Return the type for first region and a pointer to - * the start of second region so that caller will - * lookup again on the second region. + * + * Return the type of the first region and a pointer + * to the start of next region so that caller will be + * advised to lookup again after having adjusted start + * and end. + * * Note: This way we handle multiple overlaps as well. */ if (start_state) -- cgit v1.2.3 From 9b3aca620883fc06636737c82a4d024b22182281 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:06 +0200 Subject: x86/mm/mtrr: Fix MTRR state checks in mtrr_type_lookup() 'mtrr_state.enabled' contains the FE (fixed MTRRs enabled) and E (MTRRs enabled) flags in MSR_MTRRdefType. Intel SDM, section 11.11.2.1, defines these flags as follows: - All MTRRs are disabled when the E flag is clear. The FE flag has no effect when the E flag is clear. - The default type is enabled when the E flag is set. - MTRR variable ranges are enabled when the E flag is set. - MTRR fixed ranges are enabled when both E and FE flags are set. MTRR state checks in __mtrr_type_lookup() do not match with SDM. Hence, this patch makes the following changes: - The current code detects MTRRs disabled when both E and FE flags are clear in mtrr_state.enabled. Fix to detect MTRRs disabled when the E flag is clear. - The current code does not check if the FE bit is set in mtrr_state.enabled when looking at the fixed entries. Fix to check the FE flag. - The current code returns the default type when the E flag is clear in mtrr_state.enabled.
However, the default type is UC when the E flag is clear. Remove the code as this case is handled as MTRR disabled with the 1st change. In addition, this patch defines the E and FE flags in mtrr_state.enabled as follows. - FE flag: MTRR_STATE_MTRR_FIXED_ENABLED - E flag: MTRR_STATE_MTRR_ENABLED print_mtrr_state() and x86_get_mtrr_mem_range() are also updated accordingly. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-4-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 3 ++- arch/x86/kernel/cpu/mtrr/generic.c | 15 ++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5f90b85ff22e..70d7c93f4550 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, continue; base = range_state[i].base_pfn; if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && - (mtrr_state.enabled & 1)) { + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? 
Skip it: */ printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e202d26f64a2..b0599dbb899a 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -119,14 +119,16 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) if (!mtrr_state_set) return 0xFF; - if (!mtrr_state.enabled) + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) return 0xFF; /* Make end inclusive end, instead of exclusive */ end--; /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state.have_fixed && (start < 0x100000)) { + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { int idx; if (start < 0x80000) { @@ -149,9 +151,6 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state.enabled & 2)) - return mtrr_state.def_type; - prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -355,7 +354,9 @@ static void __init print_mtrr_state(void) mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_debug("MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? + "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -368,7 +369,7 @@ static void __init print_mtrr_state(void) print_fixed_last(); } pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? 
"en" : "dis"); high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { -- cgit v1.2.3 From 3d3ca416d9b0784cfcf244eeeba1bcaf421bc64d Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:07 +0200 Subject: x86/mm/mtrr: Use symbolic define as a retval for disabled MTRRs mtrr_type_lookup() returns verbatim 0xFF when MTRRs are disabled. This patch defines MTRR_TYPE_INVALID to clarify the meaning of this value, and documents its usage. Document the return values of the kernel virtual address mapping helpers pud_set_huge(), pmd_set_huge, pud_clear_huge() and pmd_clear_huge(). There is no functional change in this patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-5-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index b0599dbb899a..7b1491c6232d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -104,7 +104,7 @@ static int check_type_overlap(u8 *prev, u8 *curr) /* * Error/Semi-error returns: - * 0xFF - when MTRR is not enabled + * MTRR_TYPE_INVALID - when MTRR is not enabled * *repeat == 1 implies [start:end] spanned across MTRR range and type returned * corresponds only to [start:*partial_end]. * Caller has to lookup again for [*partial_end:end]. 
@@ -117,10 +117,10 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) *repeat = 0; if (!mtrr_state_set) - return 0xFF; + return MTRR_TYPE_INVALID; if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) - return 0xFF; + return MTRR_TYPE_INVALID; /* Make end inclusive end, instead of exclusive */ end--; @@ -151,7 +151,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - prev_match = 0xFF; + prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -206,7 +206,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) continue; curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { + if (prev_match == MTRR_TYPE_INVALID) { prev_match = curr_match; continue; } @@ -220,7 +220,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) return MTRR_TYPE_WRBACK; } - if (prev_match != 0xFF) + if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; @@ -229,7 +229,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) /* * Returns the effective MTRR type for the region * Error return: - * 0xFF - when MTRR is not enabled + * MTRR_TYPE_INVALID - when MTRR is not enabled */ u8 mtrr_type_lookup(u64 start, u64 end) { -- cgit v1.2.3 From 0cc705f56e400764a171055f727d28a48260bb4b Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:08 +0200 Subject: x86/mm/mtrr: Clean up mtrr_type_lookup() MTRRs contain fixed and variable entries. mtrr_type_lookup() may repeatedly call __mtrr_type_lookup() to handle a request that overlaps with variable entries. However, __mtrr_type_lookup() also handles the fixed entries, which do not have to be repeated. 
Therefore, this patch creates separate functions, mtrr_type_lookup_fixed() and mtrr_type_lookup_variable(), to handle the fixed and variable ranges respectively. The patch also updates the function headers to clarify the return values and output argument. It updates comments to clarify that the repeating is necessary to handle overlaps with the default type, since overlaps with multiple entries alone can be handled without such repeating. There is no functional change in this patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-6-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 138 +++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7b1491c6232d..e51100c49eea 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -102,55 +102,68 @@ static int check_type_overlap(u8 *prev, u8 *curr) return 0; } -/* - * Error/Semi-error returns: - * MTRR_TYPE_INVALID - when MTRR is not enabled - * *repeat == 1 implies [start:end] spanned across MTRR range and type returned - * corresponds only to [start:*partial_end]. - * Caller has to lookup again for [*partial_end:end]. +/** + * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries + * + * Return the MTRR fixed memory type of 'start'. 
+ * + * MTRR fixed entries are divided into the following ways: + * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges + * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges + * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges + * + * Return Values: + * MTRR_TYPE_(type) - Matched memory type + * MTRR_TYPE_INVALID - Unmatched + */ +static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +{ + int idx; + + if (start >= 0x100000) + return MTRR_TYPE_INVALID; + + /* 0x0 - 0x7FFFF */ + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + /* 0x80000 - 0xBFFFF */ + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } + + /* 0xC0000 - 0xFFFFF */ + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; +} + +/** + * mtrr_type_lookup_variable - look up memory type in MTRR variable entries + * + * Return Value: + * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) + * + * Output Argument: + * repeat - Set to 1 when [start:end] spanned across MTRR range and type + * returned corresponds only to [start:*partial_end]. Caller has + * to lookup again for [*partial_end:end]. */ -static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) +static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, + int *repeat) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; - if (!mtrr_state_set) - return MTRR_TYPE_INVALID; - - if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) - return MTRR_TYPE_INVALID; - /* Make end inclusive end, instead of exclusive */ + /* Make end inclusive instead of exclusive */ end--; - /* Look in fixed ranges. 
Just return the type as per start */ - if ((start < 0x100000) && - (mtrr_state.have_fixed) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; - } else { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -186,7 +199,8 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * advised to lookup again after having adjusted start * and end. * - * Note: This way we handle multiple overlaps as well. + * Note: This way we handle overlaps with multiple + * entries and the default type properly. 
*/ if (start_state) *partial_end = base + get_mtrr_size(mask); @@ -215,21 +229,18 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) return curr_match; } - if (mtrr_tom2) { - if (start >= (1ULL<<32) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; - } - if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; } -/* - * Returns the effective MTRR type for the region - * Error return: - * MTRR_TYPE_INVALID - when MTRR is not enabled +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled */ u8 mtrr_type_lookup(u64 start, u64 end) { @@ -237,22 +248,45 @@ u8 mtrr_type_lookup(u64 start, u64 end) int repeat; u64 partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + if (!mtrr_state_set) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_INVALID; + + /* + * Look up the fixed ranges first, which take priority over + * the variable ranges. + */ + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) + return mtrr_type_lookup_fixed(start, end); + + /* + * Look up the variable ranges. Look of multiple ranges matching + * this address and pick type as per MTRR precedence. + */ + type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); /* * Common path is with repeat = 0. * However, we can have cases where [start:end] spans across some - * MTRR range. Do repeated lookups for that case here. + * MTRR ranges and/or the default type. Do repeated lookups for + * that case here. 
*/ while (repeat) { prev_type = type; start = partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); if (check_type_overlap(&prev_type, &type)) return type; } + if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) + return MTRR_TYPE_WRBACK; + return type; } -- cgit v1.2.3 From b73522e0c1be58d3c69b124985b8ccf94e3677f7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:10 +0200 Subject: x86/mm/mtrr: Enhance MTRR checks in kernel mapping helpers This patch adds the argument 'uniform' to mtrr_type_lookup(), which gets set to 1 when a given range is covered uniformly by MTRRs, i.e. the range is fully covered by a single MTRR entry or the default type. Change pud_set_huge() and pmd_set_huge() to honor the 'uniform' flag to see if it is safe to create a huge page mapping in the range. This allows them to create a huge page mapping in a range covered by a single MTRR entry of any memory type. It also detects a non-optimal request properly. They continue to check with the WB type since it does not effectively change the uniform mapping even if a request spans multiple MTRR entries. pmd_set_huge() logs a warning message to a non-optimal request so that driver writers will be aware of such a case. Drivers should make a mapping request aligned to a single MTRR entry when the range is covered by MTRRs. Signed-off-by: Toshi Kani [ Realign, flesh out comments, improve warning message. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-7-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 40 ++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e51100c49eea..f782d9b62cb3 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -147,19 +147,24 @@ static u8 mtrr_type_lookup_fixed(u64 start, u64 end) * Return Value: * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) * - * Output Argument: + * Output Arguments: * repeat - Set to 1 when [start:end] spanned across MTRR range and type * returned corresponds only to [start:*partial_end]. Caller has * to lookup again for [*partial_end:end]. + * + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. 
*/ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, - int *repeat) + int *repeat, u8 *uniform) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; + *uniform = 1; /* Make end inclusive instead of exclusive */ end--; @@ -214,6 +219,7 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, end = *partial_end - 1; /* end is inclusive */ *repeat = 1; + *uniform = 0; } if ((start & mask) != (base & mask)) @@ -225,6 +231,7 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, continue; } + *uniform = 0; if (check_type_overlap(&prev_match, &curr_match)) return curr_match; } @@ -241,10 +248,15 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, * Return Values: * MTRR_TYPE_(type) - The effective MTRR type for the region * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. */ -u8 mtrr_type_lookup(u64 start, u64 end) +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type; + u8 type, prev_type, is_uniform = 1, dummy; int repeat; u64 partial_end; @@ -260,14 +272,18 @@ u8 mtrr_type_lookup(u64 start, u64 end) */ if ((start < 0x100000) && (mtrr_state.have_fixed) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) - return mtrr_type_lookup_fixed(start, end); + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + is_uniform = 0; + type = mtrr_type_lookup_fixed(start, end); + goto out; + } /* * Look up the variable ranges. Look of multiple ranges matching * this address and pick type as per MTRR precedence. */ - type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &is_uniform); /* * Common path is with repeat = 0. 
@@ -278,15 +294,19 @@ u8 mtrr_type_lookup(u64 start, u64 end) while (repeat) { prev_type = type; start = partial_end; - type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); + is_uniform = 0; + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &dummy); if (check_type_overlap(&prev_type, &type)) - return type; + goto out; } if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; + type = MTRR_TYPE_WRBACK; +out: + *uniform = is_uniform; return type; } -- cgit v1.2.3 From 2f9e897353fcb99effd6eff22f7b464f8e2a659a Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:12 +0200 Subject: x86/mm/mtrr, pat: Document Write Combining MTRR type effects on PAT / non-PAT pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the effort to phase out MTRR use document write-combining MTRR effects on pages with different non-PAT page attributes flags and different PAT entry values. Extend arch_phys_wc_add() documentation to clarify power of two sizes / boundary requirements as we phase out mtrr_add() use. Lastly hint towards ioremap_uc() for corner cases on device drivers working with devices with mixed regions where MTRR size requirements would otherwise not enable write-combining effective memory types. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Jonathan Corbet Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: linux-fbdev@vger.kernel.org Link: http://lkml.kernel.org/r/1430343851-967-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index ea5f363a1948..04aceb7e6443 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -538,6 +538,9 @@ EXPORT_SYMBOL(mtrr_del); * attempts to add a WC MTRR covering size bytes starting at base and * logs an error if this fails. * + * The called should provide a power of two size on an equivalent + * power of two boundary. + * * Drivers must store the return value to pass to mtrr_del_wc_if_needed, * but drivers should not try to interpret that return value. */ -- cgit v1.2.3 From 7d010fdf299929f9583ce5e17da629dcd83c36ef Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:13 +0200 Subject: x86/mm/mtrr: Avoid #ifdeffery with phys_wc_to_mtrr_index() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is only one user but since we're going to bury MTRR next out of access to drivers, expose this last piece of API to drivers in a general fashion only needing io.h for access to helpers. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Abhilash Kesavan Cc: Andrew Morton Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Cristian Stoica Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. 
Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Juergen Gross Cc: Linus Torvalds Cc: Matthias Brugger Cc: Mel Gorman Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Thierry Reding Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Toshi Kani Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: Will Deacon Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1429722736-4473-1-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 04aceb7e6443..81baf5fee0e1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -580,7 +580,7 @@ void arch_phys_wc_del(int handle) EXPORT_SYMBOL(arch_phys_wc_del); /* - * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * arch_phys_wc_index - translates arch_phys_wc_add's return value * @handle: Return value from arch_phys_wc_add * * This will turn the return value from arch_phys_wc_add into an mtrr @@ -590,14 +590,14 @@ EXPORT_SYMBOL(arch_phys_wc_del); * in printk line. Alas there is an illegitimate use in some ancient * drm ioctls. */ -int phys_wc_to_mtrr_index(int handle) +int arch_phys_wc_index(int handle) { if (handle < MTRR_TO_PHYS_WC_OFFSET) return -1; else return handle - MTRR_TO_PHYS_WC_OFFSET; } -EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); +EXPORT_SYMBOL_GPL(arch_phys_wc_index); /* * HACK ALERT! -- cgit v1.2.3 From f9626104a5b6815ec7d65789dfb900af5fa51e64 Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Tue, 26 May 2015 10:28:14 +0200 Subject: x86/mm/mtrr: Generalize runtime disabling of MTRRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible to enable CONFIG_MTRR and CONFIG_X86_PAT and end up with a system with MTRR functionality disabled but PAT functionality enabled. This can happen, for instance, when the Xen hypervisor is used where MTRRs are not supported but PAT is. This can happen on Linux as of commit 47591df50512 ("xen: Support Xen pv-domains using PAT") by Juergen, introduced in v3.19. Technically, we should assume the proper CPU bits would be set to disable MTRRs but we can't always rely on this. At least on the Xen Hypervisor, for instance, only X86_FEATURE_MTRR was disabled as of Xen 4.4 through Xen commit 586ab6a [0], but not X86_FEATURE_K6_MTRR, X86_FEATURE_CENTAUR_MCR, or X86_FEATURE_CYRIX_ARR for instance. Roger Pau Monné has clarified though that although this is technically true we will never support PVH on these CPU types so Xen has no need to disable these bits on those systems. As per Roger, AMD K6, Centaur and VIA chips don't have the necessary hardware extensions to allow running PVH guests [1]. As per Toshi it is also possible for the BIOS to disable MTRR support, in such cases get_mtrr_state() would update the MTRR state as per the BIOS, we need to propagate this information as well. x86 MTRR code relies on quite a bit of checks for mtrr_if being set to check to see if MTRRs did get set up. Instead, let's provide a generic getter for that. This also adds a few checks where they were not before which could potentially safeguard ourselves against incorrect usage of MTRR where this was not desirable. Where possible match error codes as if MTRRs were disabled on arch/x86/include/asm/mtrr.h. Lastly, since disabling MTRRs can happen at run time and we could end up with PAT enabled, best record now in our logs when MTRRs are disabled. 
[0] ~/devel/xen (git::stable-4.5)$ git describe --contains 586ab6a 4.4.0-rc1~18 [1] http://lists.xenproject.org/archives/html/xen-devel/2015-03/msg03460.html Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Roger Pau Monné Cc: Stefan Bader Cc: Suresh Siddha Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Toshi Kani Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: bhelgaas@google.com Cc: david.vrabel@citrix.com Cc: jbeulich@suse.com Cc: konrad.wilk@oracle.com Cc: venkatesh.pallipadi@intel.com Cc: ville.syrjala@linux.intel.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1426893517-2511-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 4 +++- arch/x86/kernel/cpu/mtrr/main.c | 39 ++++++++++++++++++++++++++++++-------- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 +- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index f782d9b62cb3..3b533cf37c74 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -445,7 +445,7 @@ static void __init print_mtrr_state(void) } /* Grab all of the MTRR state for this CPU into *state */ -void __init get_mtrr_state(void) +bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; unsigned long flags; @@ -489,6 +489,8 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); + + return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } /* Some BIOS's are messed up and don't set all MTRRs the same! 
*/ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 81baf5fee0e1..383efb26e516 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -59,6 +59,12 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; +static bool __mtrr_enabled; + +static bool mtrr_enabled(void) +{ + return __mtrr_enabled; +} unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); @@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, int i, replace, error; mtrr_type ltype; - if (!mtrr_if) + if (!mtrr_enabled()) return -ENXIO; error = mtrr_if->validate_add_page(base, size, type); @@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, bool increment) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, @@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) unsigned long lbase, lsize; int error = -EINVAL; - if (!mtrr_if) - return -ENXIO; + if (!mtrr_enabled()) + return -ENODEV; max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ @@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) */ int mtrr_del(int reg, unsigned long base, unsigned long size) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); @@ -548,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled) + if (pat_enabled || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) 
*/ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); @@ -737,10 +747,12 @@ void __init mtrr_bp_init(void) } if (mtrr_if) { + __mtrr_enabled = true; set_num_var_ranges(); init_table(); if (use_intel()) { - get_mtrr_state(); + /* BIOS may override */ + __mtrr_enabled = get_mtrr_state(); if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; @@ -748,10 +760,16 @@ void __init mtrr_bp_init(void) } } } + + if (!mtrr_enabled()) + pr_info("MTRR: Disabled\n"); } void mtrr_ap_init(void) { + if (!mtrr_enabled()) + return; + if (!use_intel() || mtrr_aps_delayed_init) return; /* @@ -777,6 +795,9 @@ void mtrr_save_state(void) { int first_cpu; + if (!mtrr_enabled()) + return; + get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); @@ -785,6 +806,8 @@ void mtrr_save_state(void) void set_mtrr_aps_delayed_init(void) { + if (!mtrr_enabled()) + return; if (!use_intel()) return; @@ -796,7 +819,7 @@ void set_mtrr_aps_delayed_init(void) */ void mtrr_aps_init(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; /* @@ -813,7 +836,7 @@ void mtrr_aps_init(void) void mtrr_bp_restore(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; mtrr_if->set_all(); @@ -821,7 +844,7 @@ void mtrr_bp_restore(void) static int __init mtrr_init_finialize(void) { - if (!mtrr_if) + if (!mtrr_enabled()) return 0; if (use_intel()) { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index df5e41f31a27..951884dcc433 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); -void get_mtrr_state(void); +bool get_mtrr_state(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); -- cgit v1.2.3 From cb32edf65bf2197a2d2226e94c7602dc92e295bb Mon Sep 17 00:00:00 
2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:15 +0200 Subject: x86/mm/pat: Wrap pat_enabled into a function API We use pat_enabled in x86-specific code to see if PAT is enabled or not but we're granting full access to it even though readers do not need to set it. If, for instance, we granted access to it to modules later they then could override the variable setting... no bueno. This renames pat_enabled to a new static variable __pat_enabled. Folks are redirected to use pat_enabled() now. Code that sets this can only be internal to pat.c. Apart from the early kernel parameter "nopat" to disable PAT, we also have a few cases that disable it later and make use of a helper pat_disable(). It is wrapped under an ifdef but since that code cannot run unless PAT was enabled it's not required to wrap it with ifdefs, unwrap that. Likewise, since "nopat" doesn't really change non-PAT systems just remove that ifdef as well. Although we could add and use an early_param_off(), these helpers don't use __read_mostly but we want to keep __read_mostly for __pat_enabled as this is a hot path -- upon boot, for instance, a simple guest may see ~4k accesses to pat_enabled(). Since __read_mostly early boot params are not that common we don't add a helper for them just yet. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Andy Walls Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Christoph Lameter Cc: Daniel Vetter Cc: Dave Airlie Cc: Denys Vlasenko Cc: Doug Ledford Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kyle McMartin Cc: Linus Torvalds Cc: Michael S. 
Tsirkin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430425520-22275-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 383efb26e516..e7ed0d8ebacb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -558,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled || !mtrr_enabled()) + if (pat_enabled() || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) */ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); -- cgit v1.2.3 From 131484c8da97ed600c18dd9d03b661e8ae052df6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 May 2015 12:21:47 +0200 Subject: x86/debug: Remove perpetually broken, unmaintainable dwarf annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So the dwarf2 annotations in low level assembly code have become an increasing hindrance: unreadable, messy macros mixed into some of the most security sensitive code paths of the Linux kernel. These debug info annotations don't even buy the upstream kernel anything: dwarf driven stack unwinding has caused problems in the past so it's out of tree, and the upstream kernel only uses the much more robust framepointers based stack unwinding method. In addition to that there's a steady, slow bitrot going on with these annotations, requiring frequent fixups. There's no tooling and no functionality upstream that keeps it correct. 
So burn down the sick forest, allowing new, healthier growth: 27 files changed, 350 insertions(+), 1101 deletions(-) Someone who has the willingness and time to do this properly can attempt to reintroduce dwarf debuginfo in x86 assembly code plus dwarf unwinding from first principles, with the following conditions: - it should be maximally readable, and maximally low-key to 'ordinary' code reading and maintenance. - find a build time method to insert dwarf annotations automatically in the most common cases, for pop/push instructions that manipulate the stack pointer. This could be done for example via a preprocessing step that just looks for common patterns - plus special annotations for the few cases where we want to depart from the default. We have hundreds of CFI annotations, so automating most of that makes sense. - it should come with build tooling checks that ensure that CFI annotations are sensible. We've seen such efforts from the framepointer side, and there's no reason it couldn't be done on the dwarf side. 
Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: Jan Beulich Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 368 +++++++++++++-------------------------------- arch/x86/kernel/entry_64.S | 288 ++++++----------------------------- 2 files changed, 156 insertions(+), 500 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 1c309763e321..0ac73de925d1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -113,11 +112,10 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl_cfi $0 + pushl $0 .endm .macro POP_GS pop=0 addl $(4 + \pop), %esp - CFI_ADJUST_CFA_OFFSET -(4 + \pop) .endm .macro POP_GS_EX .endm @@ -137,16 +135,13 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl_cfi %gs - /*CFI_REL_OFFSET gs, 0*/ + pushl %gs .endm .macro POP_GS pop=0 -98: popl_cfi %gs - /*CFI_RESTORE gs*/ +98: popl %gs .if \pop <> 0 add $\pop, %esp - CFI_ADJUST_CFA_OFFSET -\pop .endif .endm .macro POP_GS_EX @@ -170,11 +165,9 @@ .macro GS_TO_REG reg movl %gs, \reg - /*CFI_REGISTER gs, \reg*/ .endm .macro REG_TO_PTGS reg movl \reg, PT_GS(%esp) - /*CFI_REL_OFFSET gs, PT_GS*/ .endm .macro SET_KERNEL_GS reg movl $(__KERNEL_STACK_CANARY), \reg @@ -186,26 +179,16 @@ .macro SAVE_ALL cld PUSH_GS - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0;*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0;*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0;*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl %fs + pushl %es + pushl 
%ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es @@ -215,30 +198,20 @@ .endm .macro RESTORE_INT_REGS - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl_cfi %ds - /*CFI_RESTORE ds;*/ -2: popl_cfi %es - /*CFI_RESTORE es;*/ -3: popl_cfi %fs - /*CFI_RESTORE fs;*/ +1: popl %ds +2: popl %es +3: popl %fs POP_GS \pop .pushsection .fixup, "ax" 4: movl $0, (%esp) @@ -254,64 +227,27 @@ POP_GS_EX .endm -.macro RING0_INT_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 3*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_EC_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 4*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_PTREGS_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, PT_OLDESP-PT_EBX - /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ - CFI_OFFSET eip, PT_EIP-PT_OLDESP - /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ - /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ - CFI_OFFSET eax, PT_EAX-PT_OLDESP - CFI_OFFSET ebp, PT_EBP-PT_OLDESP - CFI_OFFSET edi, PT_EDI-PT_OLDESP - CFI_OFFSET esi, PT_ESI-PT_OLDESP - CFI_OFFSET edx, PT_EDX-PT_OLDESP - CFI_OFFSET ecx, PT_ECX-PT_OLDESP - CFI_OFFSET ebx, PT_EBX-PT_OLDESP -.endm - ENTRY(ret_from_fork) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl jmp syscall_exit - CFI_ENDPROC END(ret_from_fork) ENTRY(ret_from_kernel_thread) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax call 
schedule_tail GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl movl PT_EBP(%esp),%eax call *PT_EBX(%esp) movl $0,PT_EAX(%esp) jmp syscall_exit - CFI_ENDPROC ENDPROC(ret_from_kernel_thread) /* @@ -323,7 +259,6 @@ ENDPROC(ret_from_kernel_thread) # userspace resumption stub bypassing syscall exit tracing ALIGN - RING0_PTREGS_FRAME ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: @@ -367,17 +302,12 @@ need_resched: jmp need_resched END(resume_kernel) #endif - CFI_ENDPROC /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub ENTRY(ia32_sysenter_target) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 0 - CFI_REGISTER esp, ebp movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* @@ -385,14 +315,11 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl_cfi $__USER_DS - /*CFI_REL_OFFSET ss, 0*/ - pushl_cfi %ebp - CFI_REL_OFFSET esp, 0 - pushfl_cfi + pushl $__USER_DS + pushl %ebp + pushfl orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $__USER_CS - /*CFI_REL_OFFSET cs, 0*/ + pushl $__USER_CS /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary: TI_sysenter_return @@ -401,10 +328,9 @@ sysenter_past_esp: * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; * and THREAD_SIZE takes us to the bottom. 
*/ - pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - CFI_REL_OFFSET eip, 0 + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - pushl_cfi %eax + pushl %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -453,11 +379,11 @@ sysenter_audit: /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl_cfi PT_ESI(%esp) /* a3: 5th arg */ - pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ call __audit_syscall_entry - popl_cfi %ecx /* get that remapped edx off the stack */ - popl_cfi %ecx /* get that remapped esi off the stack */ + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -480,7 +406,6 @@ sysexit_audit: jmp sysenter_exit #endif - CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) jmp 1b @@ -491,9 +416,8 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) - RING0_INT_FRAME # can't unwind into user space anyway ASM_CLAC - pushl_cfi %eax # save orig_eax + pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation @@ -527,7 +451,6 @@ restore_all_notrace: movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS #endif restore_nocheck: @@ -543,7 +466,6 @@ ENTRY(iret_exc) _ASM_EXTABLE(irq_return,iret_exc) #ifdef CONFIG_X86_ESPFIX32 - CFI_RESTORE_STATE ldt_ss: #ifdef CONFIG_PARAVIRT /* @@ -577,22 +499,19 @@ ldt_ss: shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl_cfi 
$__ESPFIX_SS - pushl_cfi %eax /* new kernel esp */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) lss (%esp), %esp /* switch to espfix segment */ - CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck #endif - CFI_ENDPROC ENDPROC(system_call) # perform work that needs to be done immediately before resumption ALIGN - RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig @@ -634,9 +553,9 @@ work_notifysig: # deal with pending signals and #ifdef CONFIG_VM86 ALIGN work_notifysig_v86: - pushl_cfi %ecx # save ti_flags for do_notify_resume + pushl %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer - popl_cfi %ecx + popl %ecx movl %eax, %esp jmp 1b #endif @@ -666,9 +585,7 @@ syscall_exit_work: call syscall_trace_leave jmp resume_userspace END(syscall_exit_work) - CFI_ENDPROC - RING0_INT_FRAME # can't unwind into user space anyway syscall_fault: ASM_CLAC GET_THREAD_INFO(%ebp) @@ -685,7 +602,6 @@ sysenter_badsys: movl $-ENOSYS,%eax jmp sysenter_after_call END(sysenter_badsys) - CFI_ENDPROC .macro FIXUP_ESPFIX_STACK /* @@ -701,10 +617,9 @@ END(sysenter_badsys) mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ - pushl_cfi $__KERNEL_DS - pushl_cfi %eax + pushl $__KERNEL_DS + pushl %eax lss (%esp), %esp /* switch to the normal stack segment */ - CFI_ADJUST_CFA_OFFSET -8 #endif .endm .macro UNWIND_ESPFIX_STACK @@ -728,13 +643,11 @@ END(sysenter_badsys) */ .align 8 ENTRY(irq_entries_start) - RING0_INT_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushl $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp 
common_interrupt - CFI_ADJUST_CFA_OFFSET -4 .align 8 .endr END(irq_entries_start) @@ -753,19 +666,16 @@ common_interrupt: call do_IRQ jmp ret_from_intr ENDPROC(common_interrupt) - CFI_ENDPROC #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ - RING0_INT_FRAME; \ ASM_CLAC; \ - pushl_cfi $~(nr); \ + pushl $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ call fn; \ jmp ret_from_intr; \ - CFI_ENDPROC; \ ENDPROC(name) @@ -784,37 +694,31 @@ ENDPROC(name) #include ENTRY(coprocessor_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_error + pushl $0 + pushl $do_coprocessor_error jmp error_code - CFI_ENDPROC END(coprocessor_error) ENTRY(simd_coprocessor_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 + pushl $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl_cfi $do_general_protection", \ + ALTERNATIVE "pushl $do_general_protection", \ "pushl $do_simd_coprocessor_error", \ X86_FEATURE_XMM #else - pushl_cfi $do_simd_coprocessor_error + pushl $do_simd_coprocessor_error #endif jmp error_code - CFI_ENDPROC END(simd_coprocessor_error) ENTRY(device_not_available) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $-1 # mark this as an int - pushl_cfi $do_device_not_available + pushl $-1 # mark this as an int + pushl $do_device_not_available jmp error_code - CFI_ENDPROC END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -830,115 +734,89 @@ END(native_irq_enable_sysexit) #endif ENTRY(overflow) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_overflow + pushl $0 + pushl $do_overflow jmp error_code - CFI_ENDPROC END(overflow) ENTRY(bounds) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_bounds + pushl $0 + pushl $do_bounds jmp error_code - CFI_ENDPROC END(bounds) ENTRY(invalid_op) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_invalid_op + pushl $0 + pushl $do_invalid_op jmp error_code - CFI_ENDPROC END(invalid_op) ENTRY(coprocessor_segment_overrun) - 
RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_segment_overrun + pushl $0 + pushl $do_coprocessor_segment_overrun jmp error_code - CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_invalid_TSS + pushl $do_invalid_TSS jmp error_code - CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_segment_not_present + pushl $do_segment_not_present jmp error_code - CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_stack_segment + pushl $do_stack_segment jmp error_code - CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_alignment_check + pushl $do_alignment_check jmp error_code - CFI_ENDPROC END(alignment_check) ENTRY(divide_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 # no error code - pushl_cfi $do_divide_error + pushl $0 # no error code + pushl $do_divide_error jmp error_code - CFI_ENDPROC END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi machine_check_vector + pushl $0 + pushl machine_check_vector jmp error_code - CFI_ENDPROC END(machine_check) #endif ENTRY(spurious_interrupt_bug) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_spurious_interrupt_bug + pushl $0 + pushl $do_spurious_interrupt_bug jmp error_code - CFI_ENDPROC END(spurious_interrupt_bug) #ifdef CONFIG_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter entrypoint expects, so fix it up before using the normal path. 
*/ ENTRY(xen_sysenter_target) - RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ - CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp - CFI_ENDPROC ENTRY(xen_hypervisor_callback) - CFI_STARTPROC - pushl_cfi $-1 /* orig_ax = -1 => not a system call */ + pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL TRACE_IRQS_OFF @@ -962,7 +840,6 @@ ENTRY(xen_do_upcall) call xen_maybe_preempt_hcall #endif jmp ret_from_intr - CFI_ENDPROC ENDPROC(xen_hypervisor_callback) # Hypervisor uses this for application faults while it executes. @@ -976,8 +853,7 @@ ENDPROC(xen_hypervisor_callback) # to pop the stack frame we end up in an infinite loop of failsafe callbacks. # We distinguish between categories by maintaining a status value in EAX. ENTRY(xen_failsafe_callback) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax movl $1,%eax 1: mov 4(%esp),%ds 2: mov 8(%esp),%es @@ -986,15 +862,13 @@ ENTRY(xen_failsafe_callback) /* EAX == 0 => Category 1 (Bad segment) EAX != 0 => Category 2 (Bad IRET) */ testl %eax,%eax - popl_cfi %eax + popl %eax lea 16(%esp),%esp - CFI_ADJUST_CFA_OFFSET -16 jz 5f jmp iret_exc -5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ +5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL jmp ret_from_exception - CFI_ENDPROC .section .fixup,"ax" 6: xorl %eax,%eax @@ -1195,34 +1069,28 @@ return_to_handler: #ifdef CONFIG_TRACING ENTRY(trace_page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $trace_do_page_fault + pushl $trace_do_page_fault jmp error_code - CFI_ENDPROC END(trace_page_fault) #endif ENTRY(page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_page_fault + pushl $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx + pushl %fs + 
pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs @@ -1240,7 +1108,6 @@ error_code: movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception - CFI_ENDPROC END(page_fault) /* @@ -1261,29 +1128,24 @@ END(page_fault) jne \ok \label: movl TSS_sysenter_sp0 + \offset(%esp), %esp - CFI_DEF_CFA esp, 0 - CFI_UNDEFINED eip - pushfl_cfi - pushl_cfi $__KERNEL_CS - pushl_cfi $sysenter_past_esp - CFI_REL_OFFSET eip, 0 + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp .endm ENTRY(debug) - RING0_INT_FRAME ASM_CLAC cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl_cfi $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug jmp ret_from_exception - CFI_ENDPROC END(debug) /* @@ -1295,45 +1157,40 @@ END(debug) * fault happened on the sysenter path. */ ENTRY(nmi) - RING0_INT_FRAME ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 - pushl_cfi %eax + pushl %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax - popl_cfi %eax + popl %eax je nmi_espfix_stack #endif cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup - pushl_cfi %eax + pushl %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. 
*/ andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax - popl_cfi %eax + popl %eax jae nmi_stack_correct cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl_cfi %eax + pushl %eax SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi jmp restore_all_notrace - CFI_ENDPROC nmi_stack_fixup: - RING0_INT_FRAME FIX_STACK 12, nmi_stack_correct, 1 jmp nmi_stack_correct nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ cmpw $__KERNEL_CS,16(%esp) jne nmi_stack_correct cmpl $debug,(%esp) @@ -1345,57 +1202,48 @@ nmi_debug_stack_check: #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. - * + /* * create the pointer to lss back */ - pushl_cfi %ss - pushl_cfi %esp + pushl %ss + pushl %esp addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl_cfi 16(%esp) + pushl 16(%esp) .endr - pushl_cfi %eax + pushl %eax SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 jmp irq_return #endif - CFI_ENDPROC END(nmi) ENTRY(int3) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 jmp ret_from_exception - CFI_ENDPROC END(int3) ENTRY(general_protection) - RING0_EC_FRAME - pushl_cfi $do_general_protection + pushl $do_general_protection jmp error_code - CFI_ENDPROC END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_async_page_fault + pushl $do_async_page_fault jmp error_code - CFI_ENDPROC END(async_page_fault) #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 47b95813dc37..b84cec50c8cf 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -19,8 +19,6 @@ 
* at the top of the kernel process stack. * * Some macro usage: - * - CFI macros are used to generate dwarf2 unwind information for better - * backtraces. They don't change any code. * - ENTRY/END Define functions in the symbol table. * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. * - idtentry - Define exception entry points. @@ -30,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -112,61 +109,6 @@ ENDPROC(native_usergs_sysret64) # define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif -/* - * empty frame - */ - .macro EMPTY_FRAME start=1 offset=0 - .if \start - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,8+\offset - .else - CFI_DEF_CFA_OFFSET 8+\offset - .endif - .endm - -/* - * initial frame state for interrupts (and exceptions without error code) - */ - .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, 5*8+\offset - /*CFI_REL_OFFSET ss, 4*8+\offset*/ - CFI_REL_OFFSET rsp, 3*8+\offset - /*CFI_REL_OFFSET rflags, 2*8+\offset*/ - /*CFI_REL_OFFSET cs, 1*8+\offset*/ - CFI_REL_OFFSET rip, 0*8+\offset - .endm - -/* - * initial frame state for exceptions with error code (and interrupts - * with vector already pushed) - */ - .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, 1*8+\offset - .endm - -/* - * frame that enables passing a complete pt_regs to a C function. - */ - .macro DEFAULT_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset - CFI_REL_OFFSET rdi, RDI+\offset - CFI_REL_OFFSET rsi, RSI+\offset - CFI_REL_OFFSET rdx, RDX+\offset - CFI_REL_OFFSET rcx, RCX+\offset - CFI_REL_OFFSET rax, RAX+\offset - CFI_REL_OFFSET r8, R8+\offset - CFI_REL_OFFSET r9, R9+\offset - CFI_REL_OFFSET r10, R10+\offset - CFI_REL_OFFSET r11, R11+\offset - CFI_REL_OFFSET rbx, RBX+\offset - CFI_REL_OFFSET rbp, RBP+\offset - CFI_REL_OFFSET r12, R12+\offset - CFI_REL_OFFSET r13, R13+\offset - CFI_REL_OFFSET r14, R14+\offset - CFI_REL_OFFSET r15, R15+\offset - .endm - /* * 64bit SYSCALL instruction entry. 
Up to 6 arguments in registers. * @@ -196,12 +138,6 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(system_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -219,8 +155,8 @@ GLOBAL(system_call_after_swapgs) movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp /* Construct struct pt_regs on stack */ - pushq_cfi $__USER_DS /* pt_regs->ss */ - pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* * Re-enable interrupts. * We use 'rsp_scratch' as a scratch space, hence irq-off block above @@ -229,22 +165,20 @@ GLOBAL(system_call_after_swapgs) * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ - pushq_cfi_reg r8 /* pt_regs->r8 */ - pushq_cfi_reg r9 /* pt_regs->r9 */ - pushq_cfi_reg r10 /* pt_regs->r10 */ - pushq_cfi_reg r11 /* pt_regs->r11 */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 6*8 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys @@ -282,13 +216,9 @@ 
system_call_fastpath: testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - CFI_REMEMBER_STATE - RESTORE_C_REGS_EXCEPT_RCX_R11 movq RIP(%rsp),%rcx - CFI_REGISTER rip,rcx movq EFLAGS(%rsp),%r11 - /*CFI_REGISTER rflags,r11*/ movq RSP(%rsp),%rsp /* * 64bit SYSRET restores rip from rcx, @@ -307,8 +237,6 @@ system_call_fastpath: */ USERGS_SYSRET64 - CFI_RESTORE_STATE - /* Do syscall entry tracing */ tracesys: movq %rsp, %rdi @@ -374,9 +302,9 @@ int_careful: jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -389,10 +317,10 @@ int_very_careful: /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal - pushq_cfi %rdi + pushq %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave - popq_cfi %rdi + popq %rdi andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest @@ -475,27 +403,21 @@ syscall_return: * perf profiles. Nothing jumps here. 
*/ syscall_return_via_sysret: - CFI_REMEMBER_STATE /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp),%rsp USERGS_SYSRET64 - CFI_RESTORE_STATE opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret - CFI_ENDPROC END(system_call) .macro FORK_LIKE func ENTRY(stub_\func) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* offset 8: return address */ SAVE_EXTRA_REGS 8 jmp sys_\func - CFI_ENDPROC END(stub_\func) .endm @@ -504,8 +426,6 @@ END(stub_\func) FORK_LIKE vfork ENTRY(stub_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execve return_from_execve: testl %eax, %eax @@ -515,11 +435,9 @@ return_from_execve: 1: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 ZERO_EXTRA_REGS movq %rax,RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_execve) /* * Remaining execve stubs are only 7 bytes long. @@ -527,32 +445,23 @@ END(stub_execve) */ .align 8 GLOBAL(stub_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub_execveat) #if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) GLOBAL(stub32_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve - CFI_ENDPROC END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) GLOBAL(stub32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub32_execveat) END(stub_x32_execveat) #endif @@ -562,8 +471,6 @@ END(stub_x32_execveat) * This cannot be done with SYSRET, so use the IRET return path instead. */ ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* * SAVE_EXTRA_REGS result is not normally needed: * sigreturn overwrites all pt_regs->GPREGS. 
@@ -575,21 +482,16 @@ ENTRY(stub_rt_sigreturn) call sys_rt_sigreturn return_from_stub: addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 RESTORE_EXTRA_REGS movq %rax,RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 SAVE_EXTRA_REGS 8 call sys32_x32_rt_sigreturn jmp return_from_stub - CFI_ENDPROC END(stub_x32_rt_sigreturn) #endif @@ -599,12 +501,11 @@ END(stub_x32_rt_sigreturn) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - DEFAULT_FRAME LOCK ; btr $TIF_FORK,TI_flags(%r8) - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags + pushq $0x0002 + popfq # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -628,7 +529,6 @@ ENTRY(ret_from_fork) movl $0, RAX(%rsp) RESTORE_EXTRA_REGS jmp int_ret_from_sys_call - CFI_ENDPROC END(ret_from_fork) /* @@ -637,16 +537,13 @@ END(ret_from_fork) */ .align 8 ENTRY(irq_entries_start) - INTR_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushq $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -8 .align 8 .endr - CFI_ENDPROC END(irq_entries_start) /* @@ -688,17 +585,7 @@ END(irq_entries_start) movq %rsp, %rsi incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - CFI_DEF_CFA_REGISTER rsi pushq %rsi - /* - * For debugger: - * "CFA (Current Frame Address) is the value on stack + offset" - */ - CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 (rsp) */, 0, \ - 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ - 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -711,7 +598,6 @@ END(irq_entries_start) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - XCPT_FRAME ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] 
range */ interrupt do_IRQ @@ -723,11 +609,8 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ /* return code expects complete pt_regs - adjust rsp accordingly: */ leaq -RBP(%rsi),%rsp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP testb $3, CS(%rsp) jz retint_kernel @@ -743,7 +626,6 @@ retint_check: LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx andl %edi,%edx - CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: /* return to user-space */ @@ -807,8 +689,8 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq_cfi %rax - pushq_cfi %rdi + pushq %rax + pushq %rdi SWAPGS movq PER_CPU_VAR(espfix_waddr),%rdi movq %rax,(0*8)(%rdi) /* RAX */ @@ -823,24 +705,23 @@ native_irq_return_ldt: movq (5*8)(%rsp),%rax /* RSP */ movq %rax,(4*8)(%rdi) andl $0xffff0000,%eax - popq_cfi %rdi + popq %rdi orq PER_CPU_VAR(espfix_stack),%rax SWAPGS movq %rax,%rsp - popq_cfi %rax + popq %rax jmp native_irq_return_iret #endif /* edi: workmask, edx: work */ retint_careful: - CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -862,7 +743,6 @@ retint_signal: GET_THREAD_INFO(%rcx) jmp retint_with_reschedule - CFI_ENDPROC END(common_interrupt) /* @@ -870,13 +750,11 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) - INTR_FRAME ASM_CLAC - pushq_cfi $~(\num) + pushq $~(\num) .Lcommon_\sym: interrupt \do_sym jmp ret_from_intr - CFI_ENDPROC END(\sym) .endm @@ -959,24 +837,17 @@ ENTRY(\sym) .error "using shift_ist requires paranoid=1" .endif - .if \has_error_code - XCPT_FRAME - .else - INTR_FRAME - .endif - ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to 
restart */ .endif ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 - CFI_REMEMBER_STATE testb $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif @@ -986,8 +857,6 @@ ENTRY(\sym) .endif /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - DEFAULT_FRAME 0 - .if \paranoid .if \shift_ist != -1 TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ @@ -1023,7 +892,6 @@ ENTRY(\sym) .endif .if \paranoid == 1 - CFI_RESTORE_STATE /* * Paranoid entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers @@ -1032,7 +900,6 @@ ENTRY(\sym) 1: call error_entry - DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ call sync_regs @@ -1051,8 +918,6 @@ ENTRY(\sym) jmp error_exit /* %ebx: no swapgs flag */ .endif - - CFI_ENDPROC END(\sym) .endm @@ -1085,17 +950,15 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 /* Reload gs selector with exception handling */ /* edi: new selector */ ENTRY(native_load_gs_index) - CFI_STARTPROC - pushfq_cfi + pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popfq_cfi + popfq ret - CFI_ENDPROC END(native_load_gs_index) _ASM_EXTABLE(gs_change,bad_gs) @@ -1110,22 +973,15 @@ bad_gs: /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(do_softirq_own_stack) - CFI_STARTPROC - pushq_cfi %rbp - CFI_REL_OFFSET rbp,0 + pushq %rbp mov %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp incl PER_CPU_VAR(irq_count) cmove PER_CPU_VAR(irq_stack_ptr),%rsp push %rbp # backlink for old unwinder call __do_softirq leaveq - CFI_RESTORE rbp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) ret - CFI_ENDPROC END(do_softirq_own_stack) #ifdef CONFIG_XEN @@ -1145,28 +1001,22 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * activation and restart the handler using the previous one. 
*/ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) - CFI_STARTPROC /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ movq %rdi, %rsp # we don't return, adjust the stack frame - CFI_ENDPROC - DEFAULT_FRAME 11: incl PER_CPU_VAR(irq_count) movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp pushq %rbp # backlink for old unwinder call xen_evtchn_do_upcall popq %rsp - CFI_DEF_CFA_REGISTER rsp decl PER_CPU_VAR(irq_count) #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif jmp error_exit - CFI_ENDPROC END(xen_do_hypervisor_callback) /* @@ -1183,16 +1033,8 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) - INTR_FRAME 1 (6*8) - /*CFI_REL_OFFSET gs,GS*/ - /*CFI_REL_OFFSET fs,FS*/ - /*CFI_REL_OFFSET es,ES*/ - /*CFI_REL_OFFSET ds,DS*/ - CFI_REL_OFFSET r11,8 - CFI_REL_OFFSET rcx,0 movl %ds,%ecx cmpw %cx,0x10(%rsp) - CFI_REMEMBER_STATE jne 1f movl %es,%ecx cmpw %cx,0x18(%rsp) @@ -1205,29 +1047,21 @@ ENTRY(xen_failsafe_callback) jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ movq (%rsp),%rcx - CFI_RESTORE rcx movq 8(%rsp),%r11 - CFI_RESTORE r11 addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $0 /* RIP */ - pushq_cfi %r11 - pushq_cfi %rcx + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx jmp general_protection - CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ movq (%rsp),%rcx - CFI_RESTORE rcx movq 8(%rsp),%r11 - CFI_RESTORE r11 addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $-1 /* orig_ax = -1 => not a system call */ + pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS jmp error_exit - CFI_ENDPROC END(xen_failsafe_callback) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1263,7 +1097,6 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1275,7 +1108,6 @@ ENTRY(paranoid_entry) SWAPGS xorl %ebx,%ebx 1: ret - CFI_ENDPROC END(paranoid_entry) /* @@ -1290,7 +1122,6 @@ END(paranoid_entry) */ /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) - DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ @@ -1305,7 +1136,6 @@ paranoid_exit_restore: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN - CFI_ENDPROC END(paranoid_exit) /* @@ -1313,7 +1143,6 @@ END(paranoid_exit) * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1333,7 +1162,6 @@ error_sti: * for these here too. 
*/ error_kernelspace: - CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) @@ -1357,13 +1185,11 @@ error_bad_iret: mov %rax,%rsp decl %ebx /* Return to usergs */ jmp error_sti - CFI_ENDPROC END(error_entry) /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) - DEFAULT_FRAME movl %ebx,%eax RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) @@ -1377,12 +1203,10 @@ ENTRY(error_exit) andl %edi,%edx jnz retint_careful jmp retint_swapgs - CFI_ENDPROC END(error_exit) /* Runs on exception stack */ ENTRY(nmi) - INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME /* * We allow breakpoints in NMIs. If a breakpoint occurs, then @@ -1417,8 +1241,7 @@ ENTRY(nmi) */ /* Use %rdx as our temp variable throughout */ - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 + pushq %rdx /* * If %cs was not the kernel segment, then the NMI triggered in user @@ -1452,8 +1275,6 @@ ENTRY(nmi) jb first_nmi /* Ah, it is within the NMI stack, treat it as nested */ - CFI_REMEMBER_STATE - nested_nmi: /* * Do nothing if we interrupted the fixup in repeat_nmi. @@ -1471,26 +1292,22 @@ nested_nmi: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ leaq -1*8(%rsp), %rdx movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 1*8 leaq -10*8(%rsp), %rdx - pushq_cfi $__KERNEL_DS - pushq_cfi %rdx - pushfq_cfi - pushq_cfi $__KERNEL_CS - pushq_cfi $repeat_nmi + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi /* Put stack back */ addq $(6*8), %rsp - CFI_ADJUST_CFA_OFFSET -6*8 nested_nmi_out: - popq_cfi %rdx - CFI_RESTORE rdx + popq %rdx /* No need to check faults here */ INTERRUPT_RETURN - CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1529,22 +1346,19 @@ first_nmi: */ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ movq (%rsp), %rdx - CFI_RESTORE rdx /* Set the NMI executing variable on the stack. 
*/ - pushq_cfi $1 + pushq $1 /* * Leave room for the "copied" frame */ subq $(5*8), %rsp - CFI_ADJUST_CFA_OFFSET 5*8 /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 11*8(%rsp) + pushq 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1567,12 +1381,10 @@ repeat_nmi: /* Make another copy, this one may be modified by nested NMIs */ addq $(10*8), %rsp - CFI_ADJUST_CFA_OFFSET -10*8 .rept 5 - pushq_cfi -6*8(%rsp) + pushq -6*8(%rsp) .endr subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET 5*8 end_repeat_nmi: /* @@ -1580,7 +1392,7 @@ end_repeat_nmi: * NMI if the first NMI took an exception and reset our iret stack * so that we repeat another NMI. */ - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK /* @@ -1591,7 +1403,6 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry - DEFAULT_FRAME 0 /* * Save off the CR2 register. If we take a page fault in the NMI then @@ -1628,13 +1439,10 @@ nmi_restore: /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) jmp irq_return - CFI_ENDPROC END(nmi) ENTRY(ignore_sysret) - CFI_STARTPROC mov $-ENOSYS,%eax sysret - CFI_ENDPROC END(ignore_sysret) -- cgit v1.2.3 From 2f63b9db7260beba3c19d66d6c11b0b78ea84a8c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 1 Jun 2015 13:03:59 +0100 Subject: x86/asm/entry/64: Fold identical code paths retint_kernel doesn't require %rcx to be pointing to thread info (anymore?), and the code on the two alternative paths is - not really surprisingly - identical. Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/556C664F020000780007FB64@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b84cec50c8cf..4ad79e946f5a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -615,7 +615,7 @@ ret_from_intr: testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ - +retint_user: GET_THREAD_INFO(%rcx) /* * %rcx: thread info. Interrupts off. @@ -1194,15 +1194,9 @@ ENTRY(error_exit) RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) testl %eax,%eax jnz retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_swapgs + jmp retint_user END(error_exit) /* Runs on exception stack */ -- cgit v1.2.3 From d6472302f242559d45dcf4ebace62508dc4d8aeb Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 2 Jun 2015 19:01:38 +1000 Subject: x86/mm: Decouple <linux/vmalloc.h> from <asm/io.h> Nothing in <asm/io.h> uses anything from <linux/vmalloc.h>, so remove it from there and fix up the resulting build problems triggered on x86 {64|32}-bit {def|allmod|allno}configs. The breakages were triggering in places where x86 builds relied on vmalloc() facilities but did not include <linux/vmalloc.h> explicitly and relied on the implicit inclusion via <asm/io.h>. Also add: - <linux/init.h> to <linux/io.h> - <asm/pgtable_types> to <asm/io.h> ... which were two other implicit header file dependencies. Suggested-by: David Miller Signed-off-by: Stephen Rothwell [ Tidied up the changelog. ] Acked-by: David Miller Acked-by: Takashi Iwai Acked-by: Viresh Kumar Acked-by: Vinod Koul Cc: Andrew Morton Cc: Anton Vorontsov Cc: Boris Ostrovsky Cc: Colin Cross Cc: David Vrabel Cc: H. Peter Anvin Cc: Haiyang Zhang Cc: James E.J. Bottomley Cc: Jaroslav Kysela Cc: K. Y. 
Srinivasan Cc: Kees Cook Cc: Konrad Rzeszutek Wilk Cc: Kristen Carlson Accardi Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Suma Ramars Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 1 + arch/x86/kernel/machine_kexec_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c76d3e37c6e1..e068d6683dba 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 415480d3ea84..11546b462fa6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 905a36a2851838bca5a424fb758e201990234e6e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 13:37:36 +0200 Subject: x86/asm/entry: Move entry_64.S and entry_32.S to arch/x86/entry/ Create a new directory hierarchy for the low level x86 entry code: arch/x86/entry/* This will host all the low level glue that is currently scattered all across arch/x86/. Start with entry_64.S and entry_32.S. Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/entry_32.S | 1249 -------------------------------------- arch/x86/kernel/entry_64.S | 1442 -------------------------------------------- 3 files changed, 1 insertion(+), 2692 deletions(-) delete mode 100644 arch/x86/kernel/entry_32.S delete mode 100644 arch/x86/kernel/entry_64.S (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9bcd0b56ca17..9d3ee054453d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace -obj-y := process_$(BITS).o signal.o entry_$(BITS).o +obj-y := process_$(BITS).o signal.o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S deleted file mode 100644 index 0ac73de925d1..000000000000 --- a/arch/x86/kernel/entry_32.S +++ /dev/null @@ -1,1249 +0,0 @@ -/* - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. - * - * Stack layout in 'syscall_exit': - * ptrace needs to have all regs on the stack. 
- * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, - * ptrace.c and ptrace.h - * - * 0(%esp) - %ebx - * 4(%esp) - %ecx - * 8(%esp) - %edx - * C(%esp) - %esi - * 10(%esp) - %edi - * 14(%esp) - %ebp - * 18(%esp) - %eax - * 1C(%esp) - %ds - * 20(%esp) - %es - * 24(%esp) - %fs - * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS - * 2C(%esp) - orig_eax - * 30(%esp) - %eip - * 34(%esp) - %cs - * 38(%esp) - %eflags - * 3C(%esp) - %oldesp - * 40(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysenter_audit syscall_trace_entry -#define sysexit_audit syscall_exit_work -#endif - - .section .entry.text, "ax" - -/* - * We use macros for low-level operations which need to be overridden - * for paravirtualization. The following will never clobber any registers: - * INTERRUPT_RETURN (aka. "iret") - * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). - * - * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must - * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). - * Allowing a register to be clobbered can shrink the paravirt replacement - * enough to patch inline, increasing performance. - */ - -#ifdef CONFIG_PREEMPT -#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -#else -#define preempt_stop(clobbers) -#define resume_kernel restore_all -#endif - -.macro TRACE_IRQS_IRET -#ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? 
- jz 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * User gs save/restore - * - * %gs is used for userland TLS and kernel only uses it for stack - * canary which is required to be at %gs:20 by gcc. Read the comment - * at the top of stackprotector.h for more info. - * - * Local labels 98 and 99 are used. - */ -#ifdef CONFIG_X86_32_LAZY_GS - - /* unfortunately push/pop can't be no-op */ -.macro PUSH_GS - pushl $0 -.endm -.macro POP_GS pop=0 - addl $(4 + \pop), %esp -.endm -.macro POP_GS_EX -.endm - - /* all the rest are no-op */ -.macro PTGS_TO_GS -.endm -.macro PTGS_TO_GS_EX -.endm -.macro GS_TO_REG reg -.endm -.macro REG_TO_PTGS reg -.endm -.macro SET_KERNEL_GS reg -.endm - -#else /* CONFIG_X86_32_LAZY_GS */ - -.macro PUSH_GS - pushl %gs -.endm - -.macro POP_GS pop=0 -98: popl %gs - .if \pop <> 0 - add $\pop, %esp - .endif -.endm -.macro POP_GS_EX -.pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs -.endm -.macro PTGS_TO_GS_EX -.pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro GS_TO_REG reg - movl %gs, \reg -.endm -.macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) -.endm -.macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs -.endm - -#endif /* CONFIG_X86_32_LAZY_GS */ - -.macro SAVE_ALL - cld - PUSH_GS - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - movl $(__USER_DS), %edx - movl %edx, %ds - movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs - SET_KERNEL_GS %edx -.endm - -.macro RESTORE_INT_REGS - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax -.endm - -.macro RESTORE_REGS pop=0 - RESTORE_INT_REGS -1: popl %ds -2: popl %es -3: popl %fs - POP_GS \pop -.pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) 
- jmp 3b -.popsection - _ASM_EXTABLE(1b,4b) - _ASM_EXTABLE(2b,5b) - _ASM_EXTABLE(3b,6b) - POP_GS_EX -.endm - -ENTRY(ret_from_fork) - pushl %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags - popfl - jmp syscall_exit -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - pushl %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags - popfl - movl PT_EBP(%esp),%eax - call *PT_EBX(%esp) - movl $0,PT_EAX(%esp) - jmp syscall_exit -ENDPROC(ret_from_kernel_thread) - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing - ALIGN -ret_from_exception: - preempt_stop(CLBR_ANY) -ret_from_intr: - GET_THREAD_INFO(%ebp) -#ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax -#else - /* - * We can be coming here from child spawned by kernel_thread(). - */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax -#endif - cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace - -ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending - jmp restore_all -END(ret_from_exception) - -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? 
- jz restore_all - call preempt_schedule_irq - jmp need_resched -END(resume_kernel) -#endif - -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ - - # sysenter call handler stub -ENTRY(ia32_sysenter_target) - movl TSS_sysenter_sp0(%esp),%esp -sysenter_past_esp: - /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl $__USER_DS - pushl %ebp - pushfl - orl $X86_EFLAGS_IF, (%esp) - pushl $__USER_CS - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. - */ - pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - - pushl %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. 
- */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp),%ebp - ASM_CLAC - movl %ebp,PT_EBP(%esp) - _ASM_EXTABLE(1b,syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz sysenter_audit -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(,%eax,4) -sysenter_after_call: - movl %eax,PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp,%ebp - TRACE_IRQS_ON -1: mov PT_FS(%esp), %fs - PTGS_TO_GS - ENABLE_INTERRUPTS_SYSEXIT - -#ifdef CONFIG_AUDITSYSCALL -sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl PT_ESI(%esp) /* a3: 5th arg */ - pushl PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl %ecx /* get that remapped edx off the stack */ - popl %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp),%eax /* reload syscall number */ - jmp sysenter_do_call - -sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp),%eax /* reload syscall return value */ - jmp sysenter_exit -#endif - -.pushsection .fixup,"ax" -2: movl $0,PT_FS(%esp) - jmp 1b -.popsection - _ASM_EXTABLE(1b,2b) - PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) - - # system call handler stub -ENTRY(system_call) - ASM_CLAC - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(,%eax,4) -syscall_after_call: - movl %eax,PT_EAX(%esp) # store the return value -syscall_exit: - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work - -restore_all: - TRACE_IRQS_IRET -restore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: PT_OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. 
- movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS -#endif -restore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: - INTERRUPT_RETURN -.section .fixup,"ax" -ENTRY(iret_exc) - pushl $0 # no error code - pushl $do_iret_error - jmp error_code -.previous - _ASM_EXTABLE(irq_return,iret_exc) - -#ifdef CONFIG_X86_ESPFIX32 -ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. 
- */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - pushl %eax /* new kernel esp */ - /* Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the iret */ - DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck -#endif -ENDPROC(system_call) - - # perform work that needs to be done immediately before resumption - ALIGN -work_pending: - testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig -work_resched: - call schedule - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? 
- jz restore_all - testb $_TIF_NEED_RESCHED, %cl - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests -#ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space -1: -#else - movl %esp, %eax -#endif - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace - -#ifdef CONFIG_VM86 - ALIGN -work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl %ecx - movl %eax, %esp - jmp 1b -#endif -END(work_pending) - - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS,PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead - movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace -END(syscall_exit_work) - -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT,PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -syscall_badsys: - movl $-ENOSYS,%eax - jmp syscall_after_call -END(syscall_badsys) - -sysenter_badsys: - movl $-ENOSYS,%eax - jmp sysenter_after_call -END(sysenter_badsys) - -.macro FIXUP_ESPFIX_STACK -/* - * Switch back for ESPFIX stack to the normal zerobased stack - * - * We can't call C functions using the ESPFIX stack. This code reads - * the high word of the segment base from the GDT and swiches to the - * normal stack and adjusts ESP with the matching offset. 
- */ -#ifdef CONFIG_X86_ESPFIX32 - /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ - shl $16, %eax - addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - pushl %eax - lss (%esp), %esp /* switch to the normal stack segment */ -#endif -.endm -.macro UNWIND_ESPFIX_STACK -#ifdef CONFIG_X86_ESPFIX32 - movl %ss, %eax - /* see if on espfix stack */ - cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - /* switch to normal stack */ - FIXUP_ESPFIX_STACK -27: -#endif -.endm - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - .align 8 - .endr -END(irq_entries_start) - -/* - * the CPU automatically disables interrupts when executing an IRQ vector, - * so IRQ-flags tracing has to follow that: - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ - SAVE_ALL - TRACE_IRQS_OFF - movl %esp,%eax - call do_IRQ - jmp ret_from_intr -ENDPROC(common_interrupt) - -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL; \ - TRACE_IRQS_OFF \ - movl %esp,%eax; \ - call fn; \ - jmp ret_from_intr; \ -ENDPROC(name) - - -#ifdef CONFIG_TRACING -#define TRACE_BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -#define TRACE_BUILD_INTERRUPT(name, nr) -#endif - -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) - -/* The include is where all of the SMP etc. 
interrupts come from */ -#include - -ENTRY(coprocessor_error) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_error - jmp error_code -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - ASM_CLAC - pushl $0 -#ifdef CONFIG_X86_INVD_BUG - /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ - X86_FEATURE_XMM -#else - pushl $do_simd_coprocessor_error -#endif - jmp error_code -END(simd_coprocessor_error) - -ENTRY(device_not_available) - ASM_CLAC - pushl $-1 # mark this as an int - pushl $do_device_not_available - jmp error_code -END(device_not_available) - -#ifdef CONFIG_PARAVIRT -ENTRY(native_iret) - iret - _ASM_EXTABLE(native_iret, iret_exc) -END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) -#endif - -ENTRY(overflow) - ASM_CLAC - pushl $0 - pushl $do_overflow - jmp error_code -END(overflow) - -ENTRY(bounds) - ASM_CLAC - pushl $0 - pushl $do_bounds - jmp error_code -END(bounds) - -ENTRY(invalid_op) - ASM_CLAC - pushl $0 - pushl $do_invalid_op - jmp error_code -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_segment_overrun - jmp error_code -END(coprocessor_segment_overrun) - -ENTRY(invalid_TSS) - ASM_CLAC - pushl $do_invalid_TSS - jmp error_code -END(invalid_TSS) - -ENTRY(segment_not_present) - ASM_CLAC - pushl $do_segment_not_present - jmp error_code -END(segment_not_present) - -ENTRY(stack_segment) - ASM_CLAC - pushl $do_stack_segment - jmp error_code -END(stack_segment) - -ENTRY(alignment_check) - ASM_CLAC - pushl $do_alignment_check - jmp error_code -END(alignment_check) - -ENTRY(divide_error) - ASM_CLAC - pushl $0 # no error code - pushl $do_divide_error - jmp error_code -END(divide_error) - -#ifdef CONFIG_X86_MCE -ENTRY(machine_check) - ASM_CLAC - pushl $0 - pushl machine_check_vector - jmp error_code -END(machine_check) -#endif - 
-ENTRY(spurious_interrupt_bug) - ASM_CLAC - pushl $0 - pushl $do_spurious_interrupt_bug - jmp error_code -END(spurious_interrupt_bug) - -#ifdef CONFIG_XEN -/* Xen doesn't set %esp to be precisely what the normal sysenter - entrypoint expects, so fix it up before using the normal path. */ -ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp - -ENTRY(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - TRACE_IRQS_OFF - - /* Check to see if we got the event in the critical - region in xen_iret_direct, after we've reenabled - events and checked for pending events. This simulates - iret instruction's behaviour where it delivers a - pending interrupt when enabling interrupts. */ - movl PT_EIP(%esp),%eax - cmpl $xen_iret_start_crit,%eax - jb 1f - cmpl $xen_iret_end_crit,%eax - jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr -ENDPROC(xen_hypervisor_callback) - -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we fix up by reattempting the load, and zeroing the segment -# register if the load fails. -# Category 2 we fix up by jumping to do_iret_error. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by maintaining a status value in EAX. 
-ENTRY(xen_failsafe_callback) - pushl %eax - movl $1,%eax -1: mov 4(%esp),%ds -2: mov 8(%esp),%es -3: mov 12(%esp),%fs -4: mov 16(%esp),%gs - /* EAX == 0 => Category 1 (Bad segment) - EAX != 0 => Category 2 (Bad IRET) */ - testl %eax,%eax - popl %eax - lea 16(%esp),%esp - jz 5f - jmp iret_exc -5: pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - jmp ret_from_exception - -.section .fixup,"ax" -6: xorl %eax,%eax - movl %eax,4(%esp) - jmp 1b -7: xorl %eax,%eax - movl %eax,8(%esp) - jmp 2b -8: xorl %eax,%eax - movl %eax,12(%esp) - jmp 3b -9: xorl %eax,%eax - movl %eax,16(%esp) - jmp 4b -.previous - _ASM_EXTABLE(1b,6b) - _ASM_EXTABLE(2b,7b) - _ASM_EXTABLE(3b,8b) - _ASM_EXTABLE(4b,9b) -ENDPROC(xen_failsafe_callback) - -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -#endif /* CONFIG_HYPERV */ - -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(mcount) - ret -END(mcount) - -ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax - -.globl ftrace_call -ftrace_call: - call ftrace_stub - - addl $4,%esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax -ftrace_ret: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - jmp ftrace_stub -#endif - -.globl ftrace_stub -ftrace_stub: - ret -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - - /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, &regs->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is.
We move the return ip into the - * ip location, and move flags into the return ip location. - */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS,13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret - - popf - jmp ftrace_stub -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? 
*/ - - cmpl $ftrace_stub, ftrace_trace_function - jnz trace -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif -.globl ftrace_stub -ftrace_stub: - ret - - /* taken from glibc */ -trace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax - ret -END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx -#endif - -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp error_code -END(trace_page_fault) -#endif - -ENTRY(page_fault) - ASM_CLAC - pushl $do_page_fault - ALIGN -error_code: - /* the function address is in %gs's slot on the stack */ - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception -END(page_fault) - -/* - * Debug traps and NMI can 
happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp -.endm - -ENTRY(debug) - ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: - pushl $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception -END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -ENTRY(nmi) - ASM_CLAC -#ifdef CONFIG_X86_ESPFIX32 - pushl %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - je nmi_espfix_stack -#endif - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. 
- */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_all_notrace - -nmi_stack_fixup: - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct - -#ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: - /* - * create the pointer to lss back - */ - pushl %ss - pushl %esp - addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - .endr - pushl %eax - SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return -#endif -END(nmi) - -ENTRY(int3) - ASM_CLAC - pushl $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception -END(int3) - -ENTRY(general_protection) - pushl $do_general_protection - jmp error_code -END(general_protection) - -#ifdef CONFIG_KVM_GUEST -ENTRY(async_page_fault) - ASM_CLAC - pushl $do_async_page_fault - jmp error_code -END(async_page_fault) -#endif - diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S deleted file mode 100644 index 4ad79e946f5a..000000000000 --- a/arch/x86/kernel/entry_64.S +++ /dev/null @@ -1,1442 +0,0 @@ -/* - * linux/arch/x86_64/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * Copyright (C) 2000 Pavel Machek - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. 
- * - * Some of this is documented in Documentation/x86/entry_64.txt - * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * - * A note on terminology: - * - iret frame: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. - * - * Some macro usage: - * - ENTRY/END Define functions in the symbol table. - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - idtentry - Define exception entry points. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - - .code64 - .section .entry.text, "ax" - - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret64) - swapgs - sysretq -ENDPROC(native_usergs_sysret64) -#endif /* CONFIG_PARAVIRT */ - - -.macro TRACE_IRQS_IRETQ -#ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * When dynamic function tracer is enabled it will add a breakpoint - * to all locations that it is about to modify, sync CPUs, update - * all the code, sync CPUs, then remove the breakpoints. In this time - * if lockdep is enabled, it might jump back into the debug handler - * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). - * - * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to - * make sure the stack pointer does not get reset back to the top - * of the debug stack, and instead just reuses the current stack. 
- */ -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) - -.macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero - TRACE_IRQS_OFF - call debug_stack_reset -.endm - -.macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero - TRACE_IRQS_ON - call debug_stack_reset -.endm - -.macro TRACE_IRQS_IRETQ_DEBUG - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON_DEBUG -1: -.endm - -#else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ -#endif - -/* - * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. - * - * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Registers on entry: - * rax system call number - * rcx return address - * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) - * rdi arg0 - * rsi arg1 - * rdx arg2 - * r10 arg3 (needs to be moved to rcx to conform to C ABI) - * r8 arg4 - * r9 arg5 - * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) - * - * Only called from user space. - * - * When user can change pt_regs->foo always force IRET. That is because - * it deals with uncanonical addresses better. SYSRET has trouble - * with them due to bugs in both AMD and Intel CPUs. - */ - -ENTRY(system_call) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. 
- */ -GLOBAL(system_call_after_swapgs) - - movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp - - /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - /* - * Re-enable interrupts. - * We use 'rsp_scratch' as a scratch space, hence irq-off block above - * must execute atomically in the face of possible interrupt-driven - * task preemption. We must enable interrupts only after we're done - * with using rsp_scratch: - */ - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - pushq %r9 /* pt_regs->r9 */ - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys -system_call_fastpath: -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax -#else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: -/* - * Syscall return path ending with SYSRET (fast path). - * Has incompletely filled pt_regs. - */ - LOCKDEP_SYS_EXIT - /* - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - DISABLE_INTERRUPTS(CLBR_NONE) - - /* - * We must check ti flags with interrupts (or at least preemption) - * off because we must *never* return to userspace without - * processing exit work that is enqueued if we're preempted here. 
- * In particular, returning to userspace with any of the one-shot - * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is - * very bad. - */ - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp),%rcx - movq EFLAGS(%rsp),%r11 - movq RSP(%rsp),%rsp - /* - * 64bit SYSRET restores rip from rcx, - * rflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * Restoration of rflags re-enables interrupts. - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we should - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. (Actually - * detecting the failure in 64-bit userspace is tricky but can be - * done.) - */ - USERGS_SYSRET64 - - /* Do syscall entry tracing */ -tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ - -tracesys_phase2: - SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax,%rdx - call syscall_trace_enter_phase2 - - /* - * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_entry_phase2() returned - * the value it wants us to use in the table lookup. 
- */ - RESTORE_C_REGS_EXCEPT_RAX - RESTORE_EXTRA_REGS -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax -#else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - /* Use IRET because user could have changed pt_regs->foo */ - -/* - * Syscall return path ending with IRET. - * Has correct iret frame. - */ -GLOBAL(int_ret_from_sys_call) - DISABLE_INTERRUPTS(CLBR_NONE) -int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ - TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK,%edi - /* edi: mask to check */ -GLOBAL(int_with_check) - LOCKDEP_SYS_EXIT_IRQ - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) - jmp syscall_return - - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ -int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - - /* handle signals and tracing -- both require a full pt_regs */ -int_very_careful: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT,%edx - jz int_signal - pushq %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi - jmp int_restore_rest - -int_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_WORK_MASK,%edi -int_restore_rest: - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - -syscall_return: - /* The IRETQ could re-enable 
interrupts: */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ - - /* - * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. - */ - movq RCX(%rsp),%rcx - movq RIP(%rsp),%r11 - cmpq %rcx,%r11 /* RCX == RIP */ - jne opportunistic_sysret_failed - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. - */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- SYSRET checks need update" - .endif - /* Change top 16 bits to be the sign-extension of 47th bit */ - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne opportunistic_sysret_failed - - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed - - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed - - /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: - * - * movq $stuck_here,%rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed - - /* nothing to check for RSP */ - - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed - - /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. 
- */ -syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp),%rsp - USERGS_SYSRET64 - -opportunistic_sysret_failed: - SWAPGS - jmp restore_c_regs_and_iret -END(system_call) - - - .macro FORK_LIKE func -ENTRY(stub_\func) - SAVE_EXTRA_REGS 8 - jmp sys_\func -END(stub_\func) - .endm - - FORK_LIKE clone - FORK_LIKE fork - FORK_LIKE vfork - -ENTRY(stub_execve) - call sys_execve -return_from_execve: - testl %eax, %eax - jz 1f - /* exec failed, can use fast SYSRET code path in this case */ - ret -1: - /* must use IRET code path (pt_regs->cs may have changed) */ - addq $8, %rsp - ZERO_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_execve) -/* - * Remaining execve stubs are only 7 bytes long. - * ENTRY() often aligns to 16 bytes, which in this case has no benefits. - */ - .align 8 -GLOBAL(stub_execveat) - call sys_execveat - jmp return_from_execve -END(stub_execveat) - -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) - .align 8 -GLOBAL(stub_x32_execve) -GLOBAL(stub32_execve) - call compat_sys_execve - jmp return_from_execve -END(stub32_execve) -END(stub_x32_execve) - .align 8 -GLOBAL(stub_x32_execveat) -GLOBAL(stub32_execveat) - call compat_sys_execveat - jmp return_from_execve -END(stub32_execveat) -END(stub_x32_execveat) -#endif - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - /* - * SAVE_EXTRA_REGS result is not normally needed: - * sigreturn overwrites all pt_regs->GPREGS. - * But sigreturn can fail (!), and there is no easy way to detect that. - * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, - * we SAVE_EXTRA_REGS here. 
- */ - SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn -return_from_stub: - addq $8, %rsp - RESTORE_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_rt_sigreturn) - -#ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_rt_sigreturn) - SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub -END(stub_x32_rt_sigreturn) -#endif - -/* - * A newly forked process directly context switches into this address. - * - * rdi: prev task we switched from - */ -ENTRY(ret_from_fork) - - LOCK ; btr $TIF_FORK,TI_flags(%r8) - - pushq $0x0002 - popfq # reset kernel eflags - - call schedule_tail # rdi: 'prev' task parameter - - RESTORE_EXTRA_REGS - - testb $3, CS(%rsp) # from kernel_thread? - - /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. - * Use IRET code path to return, since it can safely handle - * all of the above. - */ - jnz int_ret_from_sys_call - - /* We came from kernel_thread */ - /* nb: we depend on RESTORE_EXTRA_REGS above */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call -END(ret_from_fork) - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - .align 8 - .endr -END(irq_entries_start) - -/* - * Interrupt entry/exit. - * - * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. 
- */ - -/* 0(%rsp): ~(interrupt number) */ - .macro interrupt func - cld - /* - * Since nothing in interrupt handling code touches r12...r15 members - * of "struct pt_regs", and since interrupts can nest, we can save - * four stack slots and simultaneously provide - * an unwind-friendly stack layout by saving "truncated" pt_regs - * exactly up to rbp slot, without these members. - */ - ALLOC_PT_GPREGS_ON_STACK -RBP - SAVE_C_REGS -RBP - /* this goes to 0(%rsp) for unwinder, not for saving the value: */ - SAVE_EXTRA_REGS_RBP -RBP - - leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - - testb $3, CS-RBP(%rsp) - jz 1f - SWAPGS -1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rsi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rsi - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF - - call \func - .endm - - /* - * The interrupt stubs push (~vector+0x80) onto the stack and - * then jump to common_interrupt. - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ - interrupt do_IRQ - /* 0(%rsp): old RSP */ -ret_from_intr: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - - /* Restore saved previous stack */ - popq %rsi - /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi),%rsp - - testb $3, CS(%rsp) - jz retint_kernel - /* Interrupt came from user space */ -retint_user: - GET_THREAD_INFO(%rcx) - /* - * %rcx: thread info. Interrupts off. 
- */ -retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi -retint_check: - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz retint_careful - -retint_swapgs: /* return to user-space */ - /* - * The iretq could re-enable interrupts: - */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ - - SWAPGS - jmp restore_c_regs_and_iret - -/* Returning to kernel space */ -retint_kernel: -#ifdef CONFIG_PREEMPT - /* Interrupts are off */ - /* Check if we need preemption */ - bt $9,EFLAGS(%rsp) /* interrupts were off? */ - jnc 1f -0: cmpl $0,PER_CPU_VAR(__preempt_count) - jnz 1f - call preempt_schedule_irq - jmp 0b -1: -#endif - /* - * The iretq could re-enable interrupts: - */ - TRACE_IRQS_IRETQ - -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -restore_c_regs_and_iret: - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - -irq_return: - INTERRUPT_RETURN - -ENTRY(native_iret) - /* - * Are we returning to a stack segment from the LDT? Note: in - * 64-bit mode SS:RSP on the exception stack is always valid. - */ -#ifdef CONFIG_X86_ESPFIX64 - testb $4,(SS-RIP)(%rsp) - jnz native_irq_return_ldt -#endif - -.global native_irq_return_iret -native_irq_return_iret: - /* - * This may fault. Non-paranoid faults on return to userspace are - * handled by fixup_bad_iret. These include #SS, #GP, and #NP. - * Double-faults due to espfix64 are handled in do_double_fault. - * Other faults here are fatal. 
- */ - iretq - -#ifdef CONFIG_X86_ESPFIX64 -native_irq_return_ldt: - pushq %rax - pushq %rdi - SWAPGS - movq PER_CPU_VAR(espfix_waddr),%rdi - movq %rax,(0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp),%rax /* RIP */ - movq %rax,(1*8)(%rdi) - movq (3*8)(%rsp),%rax /* CS */ - movq %rax,(2*8)(%rdi) - movq (4*8)(%rsp),%rax /* RFLAGS */ - movq %rax,(3*8)(%rdi) - movq (6*8)(%rsp),%rax /* SS */ - movq %rax,(5*8)(%rdi) - movq (5*8)(%rsp),%rax /* RSP */ - movq %rax,(4*8)(%rdi) - andl $0xffff0000,%eax - popq %rdi - orq PER_CPU_VAR(espfix_stack),%rax - SWAPGS - movq %rax,%rsp - popq %rax - jmp native_irq_return_iret -#endif - - /* edi: workmask, edx: work */ -retint_careful: - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - GET_THREAD_INFO(%rcx) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp retint_check - -retint_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz retint_swapgs - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule - -END(common_interrupt) - -/* - * APIC interrupts. 
- */ -.macro apicinterrupt3 num sym do_sym -ENTRY(\sym) - ASM_CLAC - pushq $~(\num) -.Lcommon_\sym: - interrupt \do_sym - jmp ret_from_intr -END(\sym) -.endm - -#ifdef CONFIG_TRACING -#define trace(sym) trace_##sym -#define smp_trace(sym) smp_trace_##sym - -.macro trace_apicinterrupt num sym -apicinterrupt3 \num trace(\sym) smp_trace(\sym) -.endm -#else -.macro trace_apicinterrupt num sym do_sym -.endm -#endif - -.macro apicinterrupt num sym do_sym -apicinterrupt3 \num \sym \do_sym -trace_apicinterrupt \num \sym -.endm - -#ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ - irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR \ - reboot_interrupt smp_reboot_interrupt -#endif - -#ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE \ - uv_bau_message_intr1 uv_bau_message_interrupt -#endif -apicinterrupt LOCAL_TIMER_VECTOR \ - apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR \ - x86_platform_ipi smp_x86_platform_ipi - -#ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR \ - kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -#endif - -#ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt smp_threshold_interrupt -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR \ - thermal_interrupt smp_thermal_interrupt -#endif - -#ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ - call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR \ - call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR \ - reschedule_interrupt smp_reschedule_interrupt -#endif - -apicinterrupt ERROR_APIC_VECTOR \ - error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR \ - spurious_interrupt smp_spurious_interrupt - -#ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR \ - irq_work_interrupt smp_irq_work_interrupt -#endif - -/* - * Exception entry points. 
- */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) - -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 -ENTRY(\sym) - /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 - .error "using shift_ist requires paranoid=1" - .endif - - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - - .ifeq \has_error_code - pushq $-1 /* ORIG_RAX: no syscall to restart */ - .endif - - ALLOC_PT_GPREGS_ON_STACK - - .if \paranoid - .if \paranoid == 1 - testb $3, CS(%rsp) /* If coming from userspace, switch */ - jnz 1f /* stacks. */ - .endif - call paranoid_entry - .else - call error_entry - .endif - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - .endif - - movq %rsp,%rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi,%esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) - .endif - - /* these procedures expect "no swapgs" flag in ebx */ - .if \paranoid - jmp paranoid_exit - .else - jmp error_exit - .endif - - .if \paranoid == 1 - /* - * Paranoid entry from userspace. Switch stacks and treat it - * as a normal entry. This means that paranoid handlers - * run in real process context if user_mode(regs). 
- */ -1: - call error_entry - - - movq %rsp,%rdi /* pt_regs pointer */ - call sync_regs - movq %rax,%rsp /* switch stack */ - - movq %rsp,%rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi,%esi /* no error code */ - .endif - - call \do_sym - - jmp error_exit /* %ebx: no swapgs flag */ - .endif -END(\sym) -.endm - -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* Reload gs selector with exception handling */ - /* edi: new selector */ -ENTRY(native_load_gs_index) - pushfq - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) - SWAPGS -gs_change: - movl %edi,%gs -2: mfence /* workaround */ - SWAPGS - popfq - ret -END(native_load_gs_index) - - _ASM_EXTABLE(gs_change,bad_gs) - .section .fixup,"ax" - /* running with kernelgs */ -bad_gs: - SWAPGS /* switch back 
to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous - -/* Call softirq on interrupt stack. Interrupts are off. */ -ENTRY(do_softirq_own_stack) - pushq %rbp - mov %rsp,%rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr),%rsp - push %rbp # backlink for old unwinder - call __do_softirq - leaveq - decl PER_CPU_VAR(irq_count) - ret -END(do_softirq_own_stack) - -#ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 - -/* - * A note on the "critical region" in our callback handler. - * We want to avoid stacking callback handlers due to events occurring - * during handling of the last event. To do this, we keep events disabled - * until we've done all processing. HOWEVER, we must enable events before - * popping the stack frame (can't be done atomically) and so it would still - * be possible to get enough handler activations to overflow the stack. - * Although unlikely, bugs of that kind are hard to track down, so we'd - * like to avoid the possibility. - * So, on entry to the handler we detect whether we interrupted an - * existing activation in its critical region -- if so, we pop the current - * activation and restart the handler using the previous one. - */ -ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) -/* - * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will - * see the correct pointer to the pt_regs - */ - movq %rdi, %rsp # we don't return, adjust the stack frame -11: incl PER_CPU_VAR(irq_count) - movq %rsp,%rbp - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rbp # backlink for old unwinder - call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp error_exit -END(xen_do_hypervisor_callback) - -/* - * Hypervisor uses this for application faults while it executes. - * We get here for two reasons: - * 1. Fault while reloading DS, ES, FS or GS - * 2. 
Fault while executing IRET - * Category 1 we do not need to fix up as Xen has already reloaded all segment - * registers that could be reloaded and zeroed the others. - * Category 2 we fix up by killing the current process. We cannot use the - * normal Linux return path in this case because if we use the IRET hypercall - * to pop the stack frame we end up in an infinite loop of failsafe callbacks. - * We distinguish between categories by comparing each saved segment register - * with its current contents: any discrepancy means we in category 1. - */ -ENTRY(xen_failsafe_callback) - movl %ds,%ecx - cmpw %cx,0x10(%rsp) - jne 1f - movl %es,%ecx - cmpw %cx,0x18(%rsp) - jne 1f - movl %fs,%ecx - cmpw %cx,0x20(%rsp) - jne 1f - movl %gs,%ecx - cmpw %cx,0x28(%rsp) - jne 1f - /* All segments match their saved values => Category 2 (Bad IRET). */ - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x30,%rsp - pushq $0 /* RIP */ - pushq %r11 - pushq %rcx - jmp general_protection -1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x30,%rsp - pushq $-1 /* orig_ax = -1 => not a system call */ - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS - SAVE_EXTRA_REGS - jmp error_exit -END(xen_failsafe_callback) - -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - xen_hvm_callback_vector xen_evtchn_do_upcall - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - hyperv_callback_vector hyperv_vector_handler -#endif /* CONFIG_HYPERV */ - -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 -#ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 -#endif -idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 -#ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 -#endif -#ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) -#endif - -/* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise - */ -ENTRY(paranoid_entry) - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx,%ebx -1: ret -END(paranoid_entry) - -/* - * "Paranoid" exit path from exception stack. This is invoked - * only on return from non-NMI IST interrupts that came - * from kernel space. - * - * We may be returning to very strange contexts (e.g. very early - * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. 
- */ -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -ENTRY(paranoid_exit) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF_DEBUG - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs - TRACE_IRQS_IRETQ - SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore -paranoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN -END(paranoid_exit) - -/* - * Save all registers in pt_regs, and switch gs if needed. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise - */ -ENTRY(error_entry) - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 - xorl %ebx,%ebx - testb $3, CS+8(%rsp) - jz error_kernelspace -error_swapgs: - SWAPGS -error_sti: - TRACE_IRQS_OFF - ret - - /* - * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. B stepping K8s sometimes report a - * truncated RIP for IRET exceptions returning to compat mode. Check - * for these here too. - */ -error_kernelspace: - incl %ebx - leaq native_irq_return_iret(%rip),%rcx - cmpq %rcx,RIP+8(%rsp) - je error_bad_iret - movl %ecx,%eax /* zero extend */ - cmpq %rax,RIP+8(%rsp) - je bstep_iret - cmpq $gs_change,RIP+8(%rsp) - je error_swapgs - jmp error_sti - -bstep_iret: - /* Fix truncated RIP */ - movq %rcx,RIP+8(%rsp) - /* fall through */ - -error_bad_iret: - SWAPGS - mov %rsp,%rdi - call fixup_bad_iret - mov %rax,%rsp - decl %ebx /* Return to usergs */ - jmp error_sti -END(error_entry) - - -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -ENTRY(error_exit) - movl %ebx,%eax - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %eax,%eax - jnz retint_kernel - jmp retint_user -END(error_exit) - -/* Runs on exception stack */ -ENTRY(nmi) - PARAVIRT_ADJUST_EXCEPTION_FRAME - /* - * We allow breakpoints in NMIs. If a breakpoint occurs, then - * the iretq it performs will take us out of NMI context. 
- * This means that we can have nested NMIs where the next - * NMI is using the top of the stack of the previous NMI. We - * can't let it execute because the nested NMI will corrupt the - * stack of the previous NMI. NMI handlers are not re-entrant - * anyway. - * - * To handle this case we do the following: - * Check the a special location on the stack that contains - * a variable that is set when NMIs are executing. - * The interrupted task's stack is also checked to see if it - * is an NMI stack. - * If the variable is not set and the stack is not the NMI - * stack then: - * o Set the special variable on the stack - * o Copy the interrupt frame into a "saved" location on the stack - * o Copy the interrupt frame into a "copy" location on the stack - * o Continue processing the NMI - * If the variable is set or the previous stack is the NMI stack: - * o Modify the "copy" location to jump to the repeate_nmi - * o return back to the first NMI - * - * Now on exit of the first NMI, we first clear the stack variable - * The NMI stack will tell any nested NMIs at that point that it is - * nested. Then we pop the stack normally with iret, and if there was - * a nested NMI that updated the copy interrupt stack frame, a - * jump will be made to the repeat_nmi code that will handle the second - * NMI. - */ - - /* Use %rdx as our temp variable throughout */ - pushq %rdx - - /* - * If %cs was not the kernel segment, then the NMI triggered in user - * space, which means it is definitely not nested. - */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi - - /* - * Check the special variable on the stack to see if NMIs are - * executing. - */ - cmpl $1, -8(%rsp) - je nested_nmi - - /* - * Now test if the previous stack was an NMI stack. - * We need the double check. We check the NMI stack to satisfy the - * race when the first NMI clears the variable before returning. - * We check the variable because the first NMI could be in a - * breakpoint routine using a breakpoint stack. 
- */ - lea 6*8(%rsp), %rdx - /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ - cmpq %rdx, 4*8(%rsp) - /* If the stack pointer is above the NMI stack, this is a normal NMI */ - ja first_nmi - subq $EXCEPTION_STKSZ, %rdx - cmpq %rdx, 4*8(%rsp) - /* If it is below the NMI stack, it is a normal NMI */ - jb first_nmi - /* Ah, it is within the NMI stack, treat it as nested */ - -nested_nmi: - /* - * Do nothing if we interrupted the fixup in repeat_nmi. - * It's about to repeat the NMI handler, so we are fine - * with ignoring this one. - */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out - -1: - /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp - leaq -10*8(%rsp), %rdx - pushq $__KERNEL_DS - pushq %rdx - pushfq - pushq $__KERNEL_CS - pushq $repeat_nmi - - /* Put stack back */ - addq $(6*8), %rsp - -nested_nmi_out: - popq %rdx - - /* No need to check faults here */ - INTERRUPT_RETURN - -first_nmi: - /* - * Because nested NMIs will use the pushed location that we - * stored in rdx, we must keep that space available. 
- * Here's what our stack frame will look like: - * +-------------------------+ - * | original SS | - * | original Return RSP | - * | original RFLAGS | - * | original CS | - * | original RIP | - * +-------------------------+ - * | temp storage for rdx | - * +-------------------------+ - * | NMI executing variable | - * +-------------------------+ - * | copied SS | - * | copied Return RSP | - * | copied RFLAGS | - * | copied CS | - * | copied RIP | - * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ - * | pt_regs | - * +-------------------------+ - * - * The saved stack frame is used to fix up the copied stack frame - * that a nested NMI may change to make the interrupted NMI iret jump - * to the repeat_nmi. The original stack frame and the temp storage - * is also used by nested NMIs and can not be trusted on exit. - */ - /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ - movq (%rsp), %rdx - - /* Set the NMI executing variable on the stack. */ - pushq $1 - - /* - * Leave room for the "copied" frame - */ - subq $(5*8), %rsp - - /* Copy the stack frame to the Saved frame */ - .rept 5 - pushq 11*8(%rsp) - .endr - - /* Everything up to here is safe from nested NMIs */ - - /* - * If there was a nested NMI, the first NMI's iret will return - * here. But NMIs are still enabled and we can take another - * nested NMI. The nested NMI checks the interrupted RIP to see - * if it is between repeat_nmi and end_repeat_nmi, and if so - * it will just return, as we are about to repeat an NMI anyway. - * This makes it safe to copy to the stack frame that a nested - * NMI will update. - */ -repeat_nmi: - /* - * Update the stack variable to say we are still in NMI (the update - * is benign for the non-repeat case, where 1 was pushed just above - * to this very stack slot). 
- */ - movq $1, 10*8(%rsp) - - /* Make another copy, this one may be modified by nested NMIs */ - addq $(10*8), %rsp - .rept 5 - pushq -6*8(%rsp) - .endr - subq $(5*8), %rsp -end_repeat_nmi: - - /* - * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception and reset our iret stack - * so that we repeat another NMI. - */ - pushq $-1 /* ORIG_RAX: no syscall to restart */ - ALLOC_PT_GPREGS_ON_STACK - - /* - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit - * as we should not be calling schedule in NMI context. - * Even with normal interrupts enabled. An NMI should not be - * setting NEED_RESCHED or anything that normal interrupts and - * exceptions might do. - */ - call paranoid_entry - - /* - * Save off the CR2 register. If we take a page fault in the NMI then - * it could corrupt the CR2 value. If the NMI preempts a page fault - * handler before it was able to read the CR2 register, and then the - * NMI itself takes a page fault, the page fault that was preempted - * will read the information from the NMI page fault and not the - * origin fault. Save it off and restore it if it changes. - * Use the r12 callee-saved register. - */ - movq %cr2, %r12 - - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp,%rdi - movq $-1,%rsi - call do_nmi - - /* Did the NMI take a page fault? Restore cr2 if it did */ - movq %cr2, %rcx - cmpq %rcx, %r12 - je 1f - movq %r12, %cr2 -1: - testl %ebx,%ebx /* swapgs needed? 
*/ - jnz nmi_restore -nmi_swapgs: - SWAPGS_UNSAFE_STACK -nmi_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - /* Pop the extra iret frame at once */ - REMOVE_PT_GPREGS_FROM_STACK 6*8 - - /* Clear the NMI executing stack variable */ - movq $0, 5*8(%rsp) - jmp irq_return -END(nmi) - -ENTRY(ignore_sysret) - mov $-ENOSYS,%eax - sysret -END(ignore_sysret) - -- cgit v1.2.3 From 00398a0018d1334fedabfeaabd0fa563121de612 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:41:06 +0200 Subject: x86/asm/entry: Move the vsyscall code to arch/x86/entry/vsyscall/ The vsyscall code is entry code too, so move it to arch/x86/entry/vsyscall/. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 - arch/x86/kernel/syscall_32.c | 33 ---- arch/x86/kernel/syscall_64.c | 32 ---- arch/x86/kernel/vsyscall_64.c | 335 -------------------------------------- arch/x86/kernel/vsyscall_emu_64.S | 37 ----- arch/x86/kernel/vsyscall_gtod.c | 70 -------- arch/x86/kernel/vsyscall_trace.h | 29 ---- 7 files changed, 539 deletions(-) delete mode 100644 arch/x86/kernel/syscall_32.c delete mode 100644 arch/x86/kernel/syscall_64.c delete mode 100644 arch/x86/kernel/vsyscall_64.c delete mode 100644 arch/x86/kernel/vsyscall_emu_64.S delete mode 100644 arch/x86/kernel/vsyscall_gtod.c delete mode 100644 arch/x86/kernel/vsyscall_trace.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9d3ee054453d..01663ee5f1b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,9 +31,6 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o -obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_IA32_EMULATION) += syscall_32.o 
-obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c deleted file mode 100644 index 3777189c4a19..000000000000 --- a/arch/x86/kernel/syscall_32.c +++ /dev/null @@ -1,33 +0,0 @@ -/* System call table for i386. */ - -#include -#include -#include -#include - -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; -#include -#undef __SYSCALL_I386 - -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), - -typedef asmlinkage void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); - -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c deleted file mode 100644 index 4ac730b37f0b..000000000000 --- a/arch/x86/kernel/syscall_64.c +++ /dev/null @@ -1,32 +0,0 @@ -/* System call table for x86-64. 
*/ - -#include -#include -#include -#include -#include - -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; -#include -#undef __SYSCALL_64 - -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, - -extern void sys_ni_syscall(void); - -asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c deleted file mode 100644 index 2dcc6ff6fdcc..000000000000 --- a/arch/x86/kernel/vsyscall_64.c +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2012-2014 Andy Lutomirski - * - * Based on the original implementation which is: - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Parts of the original code have been moved to arch/x86/vdso/vma.c - * - * This file implements vsyscall emulation. vsyscalls are a legacy ABI: - * Userspace can request certain kernel services by calling fixed - * addresses. This concept is problematic: - * - * - It interferes with ASLR. - * - It's awkward to write code that lives in kernel addresses but is - * callable by userspace at fixed addresses. - * - The whole concept is impossible for 32-bit compat userspace. - * - UML cannot easily virtualize a vsyscall. - * - * As of mid-2014, I believe that there is no new userspace code that - * will use a vsyscall if the vDSO is present. I hope that there will - * soon be no new userspace code that will ever use a vsyscall. - * - * The code in this file emulates vsyscalls when notified of a page - * fault to a vsyscall address. 
- */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include "vsyscall_trace.h" - -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; - -static int __init vsyscall_setup(char *str) -{ - if (str) { - if (!strcmp("emulate", str)) - vsyscall_mode = EMULATE; - else if (!strcmp("native", str)) - vsyscall_mode = NATIVE; - else if (!strcmp("none", str)) - vsyscall_mode = NONE; - else - return -EINVAL; - - return 0; - } - - return -EINVAL; -} -early_param("vsyscall", vsyscall_setup); - -static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, - const char *message) -{ - if (!show_unhandled_signals) - return; - - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, current->comm, task_pid_nr(current), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); -} - -static int addr_to_vsyscall_nr(unsigned long addr) -{ - int nr; - - if ((addr & ~0xC00UL) != VSYSCALL_ADDR) - return -EINVAL; - - nr = (addr & 0xC00UL) >> 10; - if (nr >= 3) - return -EINVAL; - - return nr; -} - -static bool write_ok_or_segv(unsigned long ptr, size_t size) -{ - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_error, this could go away. 
- */ - - if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { - siginfo_t info; - struct thread_struct *thread = &current->thread; - - thread->error_code = 6; /* user fault, no page, write */ - thread->cr2 = ptr; - thread->trap_nr = X86_TRAP_PF; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = (void __user *)ptr; - - force_sig_info(SIGSEGV, &info, current); - return false; - } else { - return true; - } -} - -bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) -{ - struct task_struct *tsk; - unsigned long caller; - int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_error; - long ret; - - /* - * No point in checking CS -- the only way to get here is a user mode - * trap to a high address, which means that we're in 64-bit user code. - */ - - WARN_ON_ONCE(address != regs->ip); - - if (vsyscall_mode == NONE) { - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall attempted with vsyscall=none"); - return false; - } - - vsyscall_nr = addr_to_vsyscall_nr(address); - - trace_emulate_vsyscall(vsyscall_nr); - - if (vsyscall_nr < 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); - goto sigsegv; - } - - if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "vsyscall with bad stack (exploit attempt?)"); - goto sigsegv; - } - - tsk = current; - - /* - * Check for access_ok violations and find the syscall nr. - * - * NULL is a valid user pointer (in the access_ok sense) on 32-bit and - * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, NULL means "don't write anything" not "write it at - * address 0". 
- */ - switch (vsyscall_nr) { - case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_gettimeofday; - break; - - case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_time; - break; - - case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_getcpu; - break; - } - - /* - * Handle seccomp. regs->ip must be the original value. - * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. - * - * We could optimize the seccomp disabled case, but performance - * here doesn't matter. - */ - regs->orig_ax = syscall_nr; - regs->ax = -ENOSYS; - tmp = secure_computing(); - if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { - warn_bad_vsyscall(KERN_DEBUG, regs, - "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); - } - regs->orig_ax = -1; - if (tmp) - goto do_ret; /* skip requested */ - - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; - - ret = -EFAULT; - switch (vsyscall_nr) { - case 0: - ret = sys_gettimeofday( - (struct timeval __user *)regs->di, - (struct timezone __user *)regs->si); - break; - - case 1: - ret = sys_time((time_t __user *)regs->di); - break; - - case 2: - ret = sys_getcpu((unsigned __user *)regs->di, - (unsigned __user *)regs->si, - NULL); - break; - } - - current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - -check_fault: - if (ret == -EFAULT) { - /* Bad news -- userspace fed a bad pointer to a vsyscall. 
*/ - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ - } - - regs->ax = ret; - -do_ret: - /* Emulate a ret instruction. */ - regs->ip = caller; - regs->sp += 8; - return true; - -sigsegv: - force_sig(SIGSEGV, current); - return true; -} - -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - if (vsyscall_mode == NONE) - return NULL; - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. 
- */ -int in_gate_area_no_mm(unsigned long addr) -{ - return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - -void __init map_vsyscall(void) -{ - extern char __vsyscall_page; - unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - - if (vsyscall_mode != NONE) - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); - - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != - (unsigned long)VSYSCALL_ADDR); -} diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S deleted file mode 100644 index c9596a9af159..000000000000 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ /dev/null @@ -1,37 +0,0 @@ -/* - * vsyscall_emu_64.S: Vsyscall emulation page - * - * Copyright (c) 2011 Andy Lutomirski - * - * Subject to the GNU General Public License, version 2 - */ - -#include - -#include -#include -#include - -__PAGE_ALIGNED_DATA - .globl __vsyscall_page - .balign PAGE_SIZE, 0xcc - .type __vsyscall_page, @object -__vsyscall_page: - - mov $__NR_gettimeofday, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_time, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_getcpu, %rax - syscall - ret - - .balign 4096, 0xcc - - .size __vsyscall_page, 4096 diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c deleted file mode 100644 index 51e330416995..000000000000 --- a/arch/x86/kernel/vsyscall_gtod.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Modified for x86 32 bit architecture by - * Stefani Seibold - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. 
- * - */ - -#include -#include -#include - -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); - -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; - vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; -} - -void update_vsyscall(struct timekeeper *tk) -{ - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - - gtod_write_begin(vdata); - - /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - vdata->cycle_last = tk->tkr_mono.cycle_last; - vdata->mask = tk->tkr_mono.mask; - vdata->mult = tk->tkr_mono.mult; - vdata->shift = tk->tkr_mono.shift; - - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec - + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr_mono.shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - vdata->monotonic_time_sec++; - } - - vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift); - - vdata->monotonic_time_coarse_sec = - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_coarse_nsec = - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; - vdata->monotonic_time_coarse_sec++; - } - - gtod_write_end(vdata); -} diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h deleted file mode 100644 index a8b2edec54fe..000000000000 --- a/arch/x86/kernel/vsyscall_trace.h +++ /dev/null @@ -1,29 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM vsyscall - -#if !defined(__VSYSCALL_TRACE_H) || 
defined(TRACE_HEADER_MULTI_READ) -#define __VSYSCALL_TRACE_H - -#include - -TRACE_EVENT(emulate_vsyscall, - - TP_PROTO(int nr), - - TP_ARGS(nr), - - TP_STRUCT__entry(__field(int, nr)), - - TP_fast_assign( - __entry->nr = nr; - ), - - TP_printk("nr = %d", __entry->nr) -); - -#endif - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../arch/x86/kernel -#define TRACE_INCLUDE_FILE vsyscall_trace -#include -- cgit v1.2.3 From 88d538672ea26223bca08225bc49f4e65e71683d Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:23 +0200 Subject: x86/mce: Add infrastructure to support Local MCE Initialize and prepare for handling LMCEs. Add a boot-time option to disable LMCEs. Signed-off-by: Ashok Raj [ Simplify stuff, align statements for better readability, reflow comments; kill unused lmce_clear(); save us an MSR write if LMCE is already enabled. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 43 ++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0cbcd3183acf..c8c6577b4ada 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1982,6 +1982,7 @@ void mce_disable_bank(int bank) /* * mce=off Disables machine check * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) @@ -2005,6 +2006,8 @@ static int __init mcheck_enable(char *str) cfg->disabled = true; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = true; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b4a41cf030ed..2d872deb2c50 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -91,6 +91,36 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. + */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will + * generate a #GP fault. 
+ */ + rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); + if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) + return true; + + return false; +} + bool mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) @@ -405,6 +435,19 @@ static void intel_init_cmci(void) cmci_recheck(); } +void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); -- cgit v1.2.3 From 243d657eaf540db882f73497060da5a4f7d86a90 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:24 +0200 Subject: x86/mce: Handle Local MCE events Add the necessary changes to do_machine_check() to be able to process MCEs signaled as local MCEs. Typically, only recoverable errors (SRAR type) will be Signaled as LMCE. The architecture does not restrict to only those errors, however. When errors are signaled as LMCE, there is no need for the MCE handler to perform rendezvous with other logical processors unlike earlier processors that would broadcast machine check errors. Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++------ arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c8c6577b4ada..ddc46d67d93e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1047,6 +1047,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) char *msg = "Unknown"; u64 recover_paddr = ~0ull; int flags = MF_ACTION_REQUIRED; + int lmce = 0; prev_state = ist_enter(regs); @@ -1074,11 +1075,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. + * Check if this MCE is signaled to only this logical processor */ - order = mce_start(&no_way_out); + if (m.mcgstatus & MCG_STATUS_LMCES) + lmce = 1; + else { + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + * If this is a Local MCE, then no need to perform rendezvous. + */ + order = mce_start(&no_way_out); + } + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) @@ -1155,8 +1165,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Do most of the synchronization with other CPUs. * When there's any problem use only local no_way_out state. 
*/ - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * Local MCE skipped calling mce_reign() + * If we found a fatal error, we need to panic here. + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Machine check from unknown source", + NULL, NULL); + } /* * At insane "tolerant" levels we take no action. Otherwise diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2d872deb2c50..844f56c5616d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -452,4 +452,5 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); + intel_init_lmce(); } -- cgit v1.2.3 From c8e56d20f2d190d54c0615775dcb6a23c1091681 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jun 2015 18:55:25 +0200 Subject: x86: Kill CONFIG_X86_HT In talking to Aravind recently about making certain AMD topology attributes available to the MCE injection module, it seemed like that CONFIG_X86_HT thing is more or less superfluous. It is def_bool y, depends on SMP and gets enabled in the majority of .configs - distro and otherwise - out there. So let's kill it and make code behind it depend directly on SMP. Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Bartosz Golaszewski Cc: Catalin Marinas Cc: Daniel Walter Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Igor Mammedov Cc: Jacob Shin Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433436928-31903-18-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 6 +++--- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e4cf63301ff4..eb4f01269b5d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -288,7 +288,7 @@ static int nearby_node(int apicid) * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u32 nodes, cores_per_cu = 1; @@ -341,7 +341,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) */ static void amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); @@ -420,7 +420,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) static void early_init_amd_mc(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits, ecx; /* Multi core CPU? 
*/ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bec0b55863e..b6fe2e47f7f1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -508,7 +508,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) void detect_ht(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; static bool printed; @@ -844,7 +844,7 @@ static void generic_identify(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); # else c->apicid = c->initial_apicid; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index edcb0e28c336..be4febc58b94 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned int cpu = c->cpu_index; #endif @@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; #endif } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l3_id; #endif } -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP /* * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in -- cgit v1.2.3 From 2cd23553b488589f287457b7396470f5e3c40698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:28:07 +0200 Subject: x86/asm/entry: Rename compat syscall entry points Rename the 
following system call entry points: ia32_cstar_target -> entry_SYSCALL_compat ia32_syscall -> entry_INT80_compat The generic naming scheme for x86 system call entry points is: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/traps.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index dcaab87da629..599afcf0005f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,7 +66,7 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_entry_INT80_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bec0b55863e..f0b85c401014 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1207,7 +1207,7 @@ void syscall_init(void) wrmsrl(MSR_LSTAR, system_call); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, ia32_cstar_target); + wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5e0791f9d3dc..edf97986a53d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -992,7 +992,7 @@ void __init trap_init(void) set_bit(i, used_vectors); #ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif -- cgit v1.2.3 From 4c8cd0c50d0b1559727bf0ec7ff27caeba2dfe09 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:33:56 +0200 Subject: x86/asm/entry: Untangle 'ia32_sysenter_target' into two entry points: entry_SYSENTER_32 and entry_SYSENTER_compat So the SYSENTER instruction is pretty quirky and it has different behavior depending on bitness and CPU maker. Yet we create a false sense of coherency by naming it 'ia32_sysenter_target' in both of the cases. Split the name into its two uses: ia32_sysenter_target (32) -> entry_SYSENTER_32 ia32_sysenter_target (64) -> entry_SYSENTER_compat As per the generic naming scheme for x86 system call entry points: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f0b85c401014..b2ae7cec33ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1026,7 +1026,7 @@ void enable_sep_cpu(void) (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); out: put_cpu(); @@ -1216,7 +1216,7 @@ void syscall_init(void) */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); -- cgit v1.2.3 From b2502b418e63fcde0fe1857732a476b5aa3789b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:42:03 +0200 Subject: x86/asm/entry: Untangle 'system_call' into two entry points: entry_SYSCALL_64 and entry_INT80_32 The 'system_call' entry points differ starkly between native 32-bit and 64-bit kernels: on 32-bit kernels it defines the INT 0x80 entry point, while on 64-bit it's the SYSCALL entry point. This is pretty confusing when looking at generic code, and it also obscures the nature of the entry point at the assembly level. So untangle this by splitting the name into its two uses: system_call (32) -> entry_INT80_32 system_call (64) -> entry_SYSCALL_64 As per the generic naming scheme for x86 system call entry points: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. 
Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/traps.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b2ae7cec33ca..914be4bbc2e5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1204,7 +1204,7 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_LSTAR, entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index edf97986a53d..001ddac221a1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -72,8 +72,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include #include - -asmlinkage int system_call(void); +#include #endif /* Must be page-aligned because the real IDT is used in a fixmap. */ @@ -997,7 +996,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call); + set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif -- cgit v1.2.3 From bace7117d3fb59a6ed7ea1aa6c8994df6a28a72a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 21:20:26 +0200 Subject: x86/asm/entry: (Re-)rename __NR_entry_INT80_compat_max to __NR_syscall_compat_max Brian Gerst noticed that I did a weird rename in the following commit: b2502b418e63 ("x86/asm/entry: Untangle 'system_call' into two entry points: entry_SYSCALL_64 and entry_INT80_32") which renamed __NR_ia32_syscall_max to __NR_entry_INT80_compat_max. 
Now the original name was a misnomer, but the new one is a misnomer as well, as all the 32-bit compat syscall entry points (sysenter, syscall) share the system call table, not just the INT80 based one. Rename it to __NR_syscall_compat_max. Reported-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 599afcf0005f..d8f42f902a0f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,7 +66,7 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_entry_INT80_compat_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; -- cgit v1.2.3 From b58d930750135d6c5b8e5aa084c0e9303c78c286 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 15 Jun 2015 17:40:01 +0800 Subject: x86/platform/intel/baytrail: Add comments about why we disabled HPET on Baytrail This question has been asked many times, and finally I found the official document which explains the problem of HPET on Baytrail, that it will halt in deep idle states. Signed-off-by: Feng Tang Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: john.stultz@linaro.org Cc: len.brown@intel.com Cc: matthew.lee@intel.com Link: http://lkml.kernel.org/r/1434361201-31743-1-git-send-email-feng.tang@intel.com [ Prettified things a bit. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index fe9f0b79a18b..5cb9a4d6f623 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -627,8 +627,12 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, /* - * HPET on current version of Baytrail platform has accuracy - * problems, disable it for now: + * HPET on the current version of the Baytrail platform has accuracy + * problems: it will halt in deep idle state - so we disable it. + * + * More details can be found in section 18.10.1.3 of the datasheet: + * + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, -- cgit v1.2.3 From bafac298fb20e9ae1305c710d4fd8d20c5911afa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Sat, 20 Jun 2015 11:50:50 +0200 Subject: x86/hpet: Check for irq==0 when allocating hpet MSI interrupts irq == 0 is not a valid irq for an irqdomain MSI allocation, but the hpet code checks only for negative return values.
Reported-by: Sergey Senozhatsky Cc: Borislav Petkov Link: http://lkml.kernel.org/r/558447AF.30703@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e2449cf38b06..c47aab35a17e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -578,7 +578,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) continue; irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); - if (irq < 0) + if (irq <= 0) continue; sprintf(hdev->name, "hpet%d", i); -- cgit v1.2.3 From cb17b2a674f2059343f997599b4b001e64eec516 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 21 Jun 2015 16:21:50 +0200 Subject: x86/hpet: Use proper hpet device number for MSI allocation hpet_assign_irq() is called with hpet_device->num as "hardware interrupt number", but hpet_device->num is initialized after the interrupt has been assigned, so it's always 0. As a consequence only the first MSI allocation succeeds, the following ones fail because the "hardware interrupt number" already exists. Move the initialization of dev->num and other fields before the call to hpet_assign_irq(), which is the ordering before the offending commit which introduced that regression. 
Fixes: "3cb96f0c9733 x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains" Reported-by: Sergey Senozhatsky Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1506211635010.4107@nanos Cc: Jiang Liu Cc: Borislav Petkov --- arch/x86/kernel/hpet.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c47aab35a17e..10757d0a3fcf 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -577,16 +577,17 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) if (!(cfg & HPET_TN_FSB_CAP)) continue; + hdev->flags = 0; + if (cfg & HPET_TN_PERIODIC_CAP) + hdev->flags |= HPET_DEV_PERI_CAP; + sprintf(hdev->name, "hpet%d", i); + hdev->num = i; + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); if (irq <= 0) continue; - sprintf(hdev->name, "hpet%d", i); - hdev->num = i; hdev->irq = irq; - hdev->flags = 0; - if (cfg & HPET_TN_PERIODIC_CAP) - hdev->flags |= HPET_DEV_PERI_CAP; hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; -- cgit v1.2.3