diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-14 17:13:53 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-14 17:13:53 -0800 |
commit | 1ac0884d5474fea8dc6ceabbd0e870d1bf4b7b42 (patch) | |
tree | 5158d582c0f5040a9c8a28fe9051881c5ec7b5eb /arch | |
parent | ff6135959a9150ad45cb92ca38da270903a74343 (diff) | |
parent | c6156e1da633f241e132eaea3b676d674376d770 (diff) |
Merge tag 'core-entry-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull core entry/exit updates from Thomas Gleixner:
"A set of updates for entry/exit handling:
- More generalization of entry/exit functionality
- The consolidation work to reclaim TIF flags on x86 and also for
non-x86 specific TIF flags which are solely relevant for syscall
related work and have been moved into their own storage space. The
x86 specific part had to be merged in to avoid a major conflict.
- The TIF_NOTIFY_SIGNAL work which replaces the inefficient signal
delivery mode of task work and results in an impressive performance
improvement for io_uring. The non-x86 consolidation of this is
going to come seperate via Jens.
- The selective syscall redirection facility which provides a clean
and efficient way to support the non-Linux syscalls of WINE by
catching them at syscall entry and redirecting them to the user
space emulation. This can be utilized for other purposes as well
and has been designed carefully to avoid overhead for the regular
fastpath. This includes the core changes and the x86 support code.
- Simplification of the context tracking entry/exit handling for the
users of the generic entry code which guarantee the proper ordering
and protection.
- Preparatory changes to make the generic entry code accomodate S390
specific requirements which are mostly related to their syscall
restart mechanism"
* tag 'core-entry-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
entry: Add syscall_exit_to_user_mode_work()
entry: Add exit_to_user_mode() wrapper
entry_Add_enter_from_user_mode_wrapper
entry: Rename exit_to_user_mode()
entry: Rename enter_from_user_mode()
docs: Document Syscall User Dispatch
selftests: Add benchmark for syscall user dispatch
selftests: Add kselftest for syscall user dispatch
entry: Support Syscall User Dispatch on common syscall entry
kernel: Implement selective syscall userspace redirection
signal: Expose SYS_USER_DISPATCH si_code type
x86: vdso: Expose sigreturn address on vdso to the kernel
MAINTAINERS: Add entry for common entry code
entry: Fix boot for !CONFIG_GENERIC_ENTRY
x86: Support HAVE_CONTEXT_TRACKING_OFFSTACK
context_tracking: Only define schedule_user() on !HAVE_CONTEXT_TRACKING_OFFSTACK archs
sched: Detect call to schedule from critical entry code
context_tracking: Don't implement exception_enter/exit() on CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK
context_tracking: Introduce HAVE_CONTEXT_TRACKING_OFFSTACK
x86: Reclaim unused x86 TI flags
...
Diffstat (limited to 'arch')
-rw-r--r-- | arch/Kconfig | 17 | ||||
-rw-r--r-- | arch/x86/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/entry/common.c | 34 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso2c.c | 2 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso32/sigreturn.S | 2 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vma.c | 15 | ||||
-rw-r--r-- | arch/x86/include/asm/elf.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/idtentry.h | 3 | ||||
-rw-r--r-- | arch/x86/include/asm/thread_info.h | 13 | ||||
-rw-r--r-- | arch/x86/include/asm/vdso.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/nmi.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/signal.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/signal_compat.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 13 |
15 files changed, 60 insertions, 62 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index ba4e966484ab..7a3371d28508 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -618,6 +618,23 @@ config HAVE_CONTEXT_TRACKING protected inside rcu_irq_enter/rcu_irq_exit() but preemption or signal handling on irq exit still need to be protected. +config HAVE_CONTEXT_TRACKING_OFFSTACK + bool + help + Architecture neither relies on exception_enter()/exception_exit() + nor on schedule_user(). Also preempt_schedule_notrace() and + preempt_schedule_irq() can't be called in a preemptible section + while context tracking is CONTEXT_USER. This feature reflects a sane + entry implementation where the following requirements are met on + critical entry code, ie: before user_exit() or after user_enter(): + + - Critical entry code isn't preemptible (or better yet: + not interruptible). + - No use of RCU read side critical sections, unless rcu_nmi_enter() + got called. + - No use of instrumentation, unless instrumentation_begin() got + called. + config HAVE_TIF_NOHZ bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52e36adb5112..88a4fa909766 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -163,6 +163,7 @@ config X86 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING if X86_64 + select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..18d8f17f755c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -209,40 +209,6 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -noinstr bool idtentry_enter_nmi(struct pt_regs *regs) -{ - bool irq_state = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - rcu_nmi_enter(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); - instrumentation_end(); - - return irq_state; -} - -noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) -{ - instrumentation_begin(); - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - } - instrumentation_end(); - - rcu_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); -} - #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 7380908045c7..2d0f3d8bcc25 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -101,6 +101,8 @@ struct vdso_sym required_syms[] = { {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, {"int80_landing_pad", true}, + {"vdso32_rt_sigreturn_landing_pad", true}, + {"vdso32_sigreturn_landing_pad", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index c3233ee98a6b..1bd068f72d4c 100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -18,6 +18,7 @@ __kernel_sigreturn: movl $__NR_sigreturn, %eax SYSCALL_ENTER_KERNEL .LEND_sigreturn: +SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL) nop .size __kernel_sigreturn,.-.LSTART_sigreturn @@ -29,6 +30,7 @@ __kernel_rt_sigreturn: movl $__NR_rt_sigreturn, %eax SYSCALL_ENTER_KERNEL .LEND_rt_sigreturn: +SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL) nop .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn .previous diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 50e5d3a2e70a..de60cd37070b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -436,6 +436,21 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } #endif +bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + const struct vdso_image *image = current->mm->context.vdso_image; + unsigned long vdso = (unsigned long) current->mm->context.vdso; + + if (in_ia32_syscall() && image == &vdso_image_32) { + if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad || + regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad) + return true; + } +#endif + return false; +} + #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) { diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 44a9b9940535..66bdfe838d61 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -388,6 +388,8 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, compat_arch_setup_additional_pages(bprm, interpreter, \ (ex->e_machine == EM_X86_64)) +extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs); + /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b2442eb0ac2f..247a60a47331 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,9 +11,6 @@ #include <asm/irq_stack.h> -bool idtentry_enter_nmi(struct pt_regs *regs); -void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); - /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a12b9644193b..0d751d5da702 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -55,6 +55,7 @@ struct task_struct; struct thread_info { unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ }; @@ -74,15 +75,11 @@ struct thread_info { * - these are process state flags that various assembly files * may need to access */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_SSBD 5 /* Speculative store bypass disable */ -#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ -#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ -#define TIF_SECCOMP 8 /* secure computing */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -91,6 +88,7 @@ struct thread_info { #define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ +#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -98,18 +96,13 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) -#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) @@ -118,13 +111,13 @@ struct thread_info { #define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) /* flags to check in __switch_to() */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index b5d23470f56b..98aa103eb4ab 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -29,6 +29,8 @@ struct vdso_image { long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; long sym_int80_landing_pad; + long sym_vdso32_sigreturn_landing_pad; + long sym_vdso32_rt_sigreturn_landing_pad; }; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6af6a3c0698f..13d3f1cbda17 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1974,7 +1974,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { - bool irq_state; + irqentry_state_t irq_state; WARN_ON_ONCE(user_mode(regs)); @@ -1986,7 +1986,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) mce_check_crashing_cpu()) return; - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); /* * The call targets are marked noinstr, but objtool can't figure * that out because it's an indirect call. Annotate it. @@ -1997,7 +1997,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4bc77aaf1303..bf250a339655 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { - bool irq_state; + irqentry_state_t irq_state; /* * Re-enable NMIs right here when running as an SEV-ES guest. This might @@ -502,14 +502,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index be0d7d4152ec..ea794a083c44 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -804,11 +804,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void arch_do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { struct ksignal ksig; - if (get_signal(&ksig)) { + if (has_signal && get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ddfd919be46c..a5330ff498f0 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -31,7 +31,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); - BUILD_BUG_ON(NSIGSYS != 1); + BUILD_BUG_ON(NSIGSYS != 2); /* This is part of the ABI and can never change in size: */ BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7798d862983f..fb55981f2a0d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -409,7 +409,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - idtentry_enter_nmi(regs); + irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -658,12 +658,13 @@ DEFINE_IDTENTRY_RAW(exc_int3) instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } } @@ -858,7 +859,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * includes the entry stack is excluded for everything. */ unsigned long dr7 = local_db_save(); - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* @@ -915,7 +916,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } @@ -933,7 +934,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, /* * NB: We can't easily clear DR7 here because - * idtentry_exit_to_usermode() can invoke ptrace, schedule, access + * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be |