From 164477c2331be75d9bd57fb76704e676b2bcd1cd Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 28 Sep 2018 09:02:20 -0700
Subject: x86/mm: Clarify hardware vs. software "error_code"

We pass around a variable called "error_code" all around the page
fault code.  Sounds simple enough, especially since "error_code" looks
like it exactly matches the values that the hardware gives us on the
stack to report the page fault error code (PFEC in SDM parlance).

But, that's not how it works.  For part of the page fault handler,
"error_code" does exactly match PFEC.  But, during later parts, it
diverges and starts to mean something a bit different.

Give it two names for its two jobs.

The place it diverges is also really screwy.  It's only in a spot where
the hardware tells us we have kernel-mode access that occurred while we
were in usermode accessing user-controlled address space.  Add a
warning in there.

Cc: x86@kernel.org
Cc: Jann Horn
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Dave Hansen
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/20180928160220.4A2272C9@viggo.jf.intel.com
---
 arch/x86/mm/fault.c | 77 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 25 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 47bebfe6efa7..cd08f4fef836 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1208,9 +1208,10 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
  * routines.
  */
 static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 		unsigned long address)
 {
+	unsigned long sw_error_code;
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
@@ -1236,17 +1237,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * nothing more.
 	 *
 	 * This verifies that the fault happens in kernel space
-	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 9) == 0.
+	 * (hw_error_code & 4) == 0, and that the fault was not a
+	 * protection error (hw_error_code & 9) == 0.
 	 */
 	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+		if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 			if (vmalloc_fault(address) >= 0)
 				return;
 		}
 
 		/* Can handle a stale RO->RW TLB: */
-		if (spurious_fault(error_code, address))
+		if (spurious_fault(hw_error_code, address))
 			return;
 
 		/* kprobes don't want to hook the spurious faults: */
@@ -1256,7 +1257,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 		 * Don't take the mm semaphore here. If we fixup a prefetch
 		 * fault we could otherwise deadlock:
 		 */
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
@@ -1265,11 +1266,11 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	if (unlikely(kprobes_fault(regs)))
 		return;
 
-	if (unlikely(error_code & X86_PF_RSVD))
-		pgtable_bad(regs, error_code, address);
+	if (unlikely(hw_error_code & X86_PF_RSVD))
+		pgtable_bad(regs, hw_error_code, address);
 
-	if (unlikely(smap_violation(error_code, regs))) {
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+	if (unlikely(smap_violation(hw_error_code, regs))) {
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
@@ -1278,10 +1279,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * in a region with pagefaults disabled then we must not take the fault
 	 */
 	if (unlikely(faulthandler_disabled() || !mm)) {
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
+	/*
+	 * hw_error_code is literally the "page fault error code" passed to
+	 * the kernel directly from the hardware.  But, we will shortly be
+	 * modifying it in software, so give it a new name.
+	 */
+	sw_error_code = hw_error_code;
+
 	/*
 	 * It's safe to allow irq's after cr2 has been saved and the
 	 * vmalloc fault has been handled.
@@ -1291,7 +1299,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 */
 	if (user_mode(regs)) {
 		local_irq_enable();
-		error_code |= X86_PF_USER;
+		/*
+		 * Up to this point, X86_PF_USER set in hw_error_code
+		 * indicated a user-mode access.  But, after this,
+		 * X86_PF_USER in sw_error_code will indicate either
+		 * that, *or* an implicit kernel(supervisor)-mode access
+		 * which originated from user mode.
+		 */
+		if (!(hw_error_code & X86_PF_USER)) {
+			/*
+			 * The CPU was in user mode, but the CPU says
+			 * the fault was not a user-mode access.
+			 * Must be an implicit kernel-mode access,
+			 * which we do not expect to happen in the
+			 * user address space.
+			 */
+			pr_warn_once("kernel-mode error from user-mode: %lx\n",
+				     hw_error_code);
+
+			sw_error_code |= X86_PF_USER;
+		}
 		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
@@ -1300,9 +1327,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-	if (error_code & X86_PF_WRITE)
+	if (sw_error_code & X86_PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	if (error_code & X86_PF_INSTR)
+	if (sw_error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
 	/*
@@ -1322,9 +1349,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * space check, thus avoiding the deadlock:
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-		if (!(error_code & X86_PF_USER) &&
+		if (!(sw_error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
-			bad_area_nosemaphore(regs, error_code, address, NULL);
+			bad_area_nosemaphore(regs, sw_error_code, address, NULL);
 			return;
 		}
 retry:
@@ -1340,16 +1367,16 @@ retry:
 
 	vma = find_vma(mm, address);
 	if (unlikely(!vma)) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
 	if (likely(vma->vm_start <= address))
 		goto good_area;
 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
-	if (error_code & X86_PF_USER) {
+	if (sw_error_code & X86_PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
@@ -1357,12 +1384,12 @@ retry:
 		 * 32 pointers and then decrements %sp by 65535.)
 		 */
 		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
-			bad_area(regs, error_code, address);
+			bad_area(regs, sw_error_code, address);
 			return;
 		}
 	}
 	if (unlikely(expand_stack(vma, address))) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
 
@@ -1371,8 +1398,8 @@ retry:
 	 * we can handle it..
 	 */
good_area:
-	if (unlikely(access_error(error_code, vma))) {
-		bad_area_access_error(regs, error_code, address, vma);
+	if (unlikely(access_error(sw_error_code, vma))) {
+		bad_area_access_error(regs, sw_error_code, address, vma);
 		return;
 	}
 
@@ -1414,13 +1441,13 @@ good_area:
 			return;
 
 		/* Not returning to user mode? Handle exceptions or die: */
-		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+		no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
 
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, &pkey, fault);
+		mm_fault_error(regs, sw_error_code, address, &pkey, fault);
 		return;
 	}
--
cgit v1.2.3

From 8fed62000039058adfd8b663344e2f448aed1e7a Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 28 Sep 2018 09:02:22 -0700
Subject: x86/mm: Break out kernel address space handling

The page fault handler (__do_page_fault()) basically has two sections:
one for handling faults in the kernel portion of the address space and
another for faults in the user portion of the address space.  But, these
two parts don't stick out that well.  Let's make that more clear from
code separation and naming.

Pull kernel fault handling into its own helper, and reflect that naming
by renaming spurious_fault() -> spurious_kernel_fault().

Also, rewrite the vmalloc() handling comment a bit.  It was a bit stale
and also glossed over the reserved bit handling.

Cc: x86@kernel.org
Cc: Jann Horn
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Dave Hansen
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/20180928160222.401F4E10@viggo.jf.intel.com
---
 arch/x86/mm/fault.c | 101 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 62 insertions(+), 39 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index cd08f4fef836..c7e32f453852 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1032,7 +1032,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	}
 }
 
-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 {
 	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
 		return 0;
@@ -1071,7 +1071,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * (Optional Invalidation).
  */
 static noinline int
-spurious_fault(unsigned long error_code, unsigned long address)
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -1102,27 +1102,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
 		return 0;
 
 	if (p4d_large(*p4d))
-		return spurious_fault_check(error_code, (pte_t *) p4d);
+		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
 
 	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		return 0;
 
 	if (pud_large(*pud))
-		return spurious_fault_check(error_code, (pte_t *) pud);
+		return spurious_kernel_fault_check(error_code, (pte_t *) pud);
 
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
 		return 0;
 
 	if (pmd_large(*pmd))
-		return spurious_fault_check(error_code, (pte_t *) pmd);
+		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 
 	pte = pte_offset_kernel(pmd, address);
 	if (!pte_present(*pte))
 		return 0;
 
-	ret = spurious_fault_check(error_code, pte);
+	ret = spurious_kernel_fault_check(error_code, pte);
 	if (!ret)
 		return 0;
 
@@ -1130,12 +1130,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	 * Make sure we have permissions in PMD.
 	 * If not, then there's a bug in the page tables:
 	 */
-	ret = spurious_fault_check(error_code, (pte_t *) pmd);
+	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
 
 	return ret;
 }
-NOKPROBE_SYMBOL(spurious_fault);
+NOKPROBE_SYMBOL(spurious_kernel_fault);
 
 int show_unhandled_signals = 1;
 
@@ -1202,6 +1202,58 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
 	return true;
 }
 
+/*
+ * Called for all faults where 'address' is part of the kernel address
+ * space.  Might get called for faults that originate from *code* that
+ * ran in userspace or the kernel.
+ */
+static void
+do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+		   unsigned long address)
+{
+	/*
+	 * We can fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * Before doing this on-demand faulting, ensure that the
+	 * fault is not any of the following:
+	 * 1. A fault on a PTE with a reserved bit set.
+	 * 2. A fault caused by a user-mode access.  (Do not demand-
+	 *    fault kernel memory due to user-mode accesses).
+	 * 3. A fault caused by a page-level protection violation.
+	 *    (A demand fault would be on a non-present page which
+	 *     would have X86_PF_PROT==0).
+	 */
+	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+		if (vmalloc_fault(address) >= 0)
+			return;
+	}
+
+	/* Was the fault spurious, caused by lazy TLB invalidation? */
+	if (spurious_kernel_fault(hw_error_code, address))
+		return;
+
+	/* kprobes don't want to hook the spurious faults: */
+	if (kprobes_fault(regs))
+		return;
+
+	/*
+	 * Note, despite being a "bad area", there are quite a few
+	 * acceptable reasons to get here, such as erratum fixups
+	 * and handling kernel code that can fault, like get_user().
+	 *
+	 * Don't take the mm semaphore here. If we fixup a prefetch
+	 * fault we could otherwise deadlock:
+	 */
+	bad_area_nosemaphore(regs, hw_error_code, address, NULL);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);
+
 /*
  * This routine handles page faults.
  * It determines the address,
  * and the problem, and then passes it off to one of the appropriate
  * routines.
@@ -1227,38 +1279,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 	if (unlikely(kmmio_fault(regs, address)))
 		return;
 
-	/*
-	 * We fault-in kernel-space virtual memory on-demand. The
-	 * 'reference' page table is init_mm.pgd.
-	 *
-	 * NOTE! We MUST NOT take any locks for this case. We may
-	 * be in an interrupt or a critical region, and should
-	 * only copy the information from the master page table,
-	 * nothing more.
-	 *
-	 * This verifies that the fault happens in kernel space
-	 * (hw_error_code & 4) == 0, and that the fault was not a
-	 * protection error (hw_error_code & 9) == 0.
-	 */
+	/* Was the fault on kernel-controlled part of the address space? */
 	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-			if (vmalloc_fault(address) >= 0)
-				return;
-		}
-
-		/* Can handle a stale RO->RW TLB: */
-		if (spurious_fault(hw_error_code, address))
-			return;
-
-		/* kprobes don't want to hook the spurious faults: */
-		if (kprobes_fault(regs))
-			return;
-
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock:
-		 */
-		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
-
+		do_kern_addr_fault(regs, hw_error_code, address);
 		return;
 	}
--
cgit v1.2.3

From aa37c51b9421d66f7931c5fdcb9ce80c450974be Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 28 Sep 2018 09:02:23 -0700
Subject: x86/mm: Break out user address space handling

The last patch broke out kernel address space handling into its own
helper.  Now, do the same for user address space handling.

Cc: x86@kernel.org
Cc: Jann Horn
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Dave Hansen
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/20180928160223.9C4F6440@viggo.jf.intel.com
---
 arch/x86/mm/fault.c | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c7e32f453852..0d1f5d39fc63 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -966,6 +966,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
 	__bad_area(regs, error_code, address, vma, SEGV_ACCERR);
 }
 
+/* Handle faults in the kernel portion of the address space */
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	  u32 *pkey, unsigned int fault)
@@ -1254,14 +1255,11 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 }
 NOKPROBE_SYMBOL(do_kern_addr_fault);
 
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
-		unsigned long address)
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+			unsigned long hw_error_code,
+			unsigned long address)
 {
 	unsigned long sw_error_code;
 	struct vm_area_struct *vma;
@@ -1274,17 +1272,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 	tsk = current;
 	mm = tsk->mm;
 
-	prefetchw(&mm->mmap_sem);
-
-	if (unlikely(kmmio_fault(regs, address)))
-		return;
-
-	/* Was the fault on kernel-controlled part of the address space? */
-	if (unlikely(fault_in_kernel_space(address))) {
-		do_kern_addr_fault(regs, hw_error_code, address);
-		return;
-	}
-
 	/* kprobes don't want to hook the spurious faults: */
 	if (unlikely(kprobes_fault(regs)))
 		return;
@@ -1488,6 +1475,28 @@ good_area:
 
 	check_v8086_mode(regs, address, tsk);
 }
+NOKPROBE_SYMBOL(do_user_addr_fault);
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+		unsigned long address)
+{
+	prefetchw(&current->mm->mmap_sem);
+
+	if (unlikely(kmmio_fault(regs, address)))
+		return;
+
+	/* Was the fault on kernel-controlled part of the address space? */
+	if (unlikely(fault_in_kernel_space(address)))
+		do_kern_addr_fault(regs, hw_error_code, address);
+	else
+		do_user_addr_fault(regs, hw_error_code, address);
+}
 NOKPROBE_SYMBOL(__do_page_fault);
 
 static nokprobe_inline void
--
cgit v1.2.3

From 5b0c2cac54d44ea40d5c5aec7cbf14353b2a903e Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 28 Sep 2018 09:02:25 -0700
Subject: x86/mm: Add clarifying comments for user addr space

The SMAP and Reserved checks do not have nice comments.  Add some to
clarify them and make them match everything else.

Cc: x86@kernel.org
Cc: Jann Horn
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Dave Hansen
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/20180928160225.FFD44B8D@viggo.jf.intel.com
---
 arch/x86/mm/fault.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0d1f5d39fc63..1d838701a5f7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1276,9 +1276,17 @@ void do_user_addr_fault(struct pt_regs *regs,
 	if (unlikely(kprobes_fault(regs)))
 		return;
 
+	/*
+	 * Reserved bits are never expected to be set on
+	 * entries in the user portion of the page tables.
+	 */
 	if (unlikely(hw_error_code & X86_PF_RSVD))
 		pgtable_bad(regs, hw_error_code, address);
 
+	/*
+	 * Check for invalid kernel (supervisor) access to user
+	 * pages in the user address space.
+	 */
 	if (unlikely(smap_violation(hw_error_code, regs))) {
 		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
--
cgit v1.2.3

From 88259744e253777e898c186f08670c86dd8199bf Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 28 Sep 2018 09:02:27 -0700
Subject: x86/mm: Fix exception table comments

The comments here are wrong.  They are too absolute about where faults
can occur when running in the kernel.  The comments are also a bit hard
to match up with the code.

Trim down the comments, and make them more precise.

Also add a comment explaining why we are doing the
bad_area_nosemaphore() path here.

Cc: x86@kernel.org
Cc: Jann Horn
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Dave Hansen
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/20180928160227.077DDD7A@viggo.jf.intel.com
---
 arch/x86/mm/fault.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 1d838701a5f7..57b074b02ebb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1351,24 +1351,26 @@ void do_user_addr_fault(struct pt_regs *regs,
 		flags |= FAULT_FLAG_INSTRUCTION;
 
 	/*
-	 * When running in the kernel we expect faults to occur only to
-	 * addresses in user space.  All other faults represent errors in
-	 * the kernel and should generate an OOPS.  Unfortunately, in the
-	 * case of an erroneous fault occurring in a code path which already
-	 * holds mmap_sem we will deadlock attempting to validate the fault
-	 * against the address space.  Luckily the kernel only validly
-	 * references user space from well defined areas of code, which are
-	 * listed in the exceptions table.
+	 * Kernel-mode access to the user address space should only occur
+	 * on well-defined single instructions listed in the exception
+	 * tables.  But, an erroneous kernel fault occurring outside one of
+	 * those areas which also holds mmap_sem might deadlock attempting
+	 * to validate the fault against the address space.
 	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a
-	 * deadlock. Attempt to lock the address space, if we cannot we then
-	 * validate the source. If this is invalid we can skip the address
-	 * space check, thus avoiding the deadlock:
+	 * Only do the expensive exception table search when we might be at
+	 * risk of a deadlock.  This happens if we
+	 * 1. Failed to acquire mmap_sem, and
+	 * 2. The access did not originate in userspace.  Note: either the
+	 *    hardware or earlier page fault code may set X86_PF_USER
+	 *    in sw_error_code.
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
 		if (!(sw_error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
+			/*
+			 * Fault from code in kernel from
+			 * which we do not expect faults.
+			 */
 			bad_area_nosemaphore(regs, sw_error_code, address, NULL);
 			return;
 		}
--
cgit v1.2.3

From 02e983b760c0d4183c28d625a3c99016e2cd8a7f Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 28 Sep 2018 09:02:28 -0700
Subject: x86/mm: Add vsyscall address helper

We will shortly be using this check in two locations.  Put it in a
helper before we do so.

Let's also use PAGE_MASK instead of the open-coded ~0xfff.  It is
easier to read and also more obviously correct considering the
implicit type conversion that has to happen when it is not an
implicit 'unsigned long'.

Cc: x86@kernel.org
Cc: Jann Horn
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Dave Hansen
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/20180928160228.C593509B@viggo.jf.intel.com
---
 arch/x86/mm/fault.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 57b074b02ebb..7a627ac3a0d2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -840,6 +840,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	show_opcodes(regs, loglvl);
 }
 
+/*
+ * The (legacy) vsyscall page is the lone page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+	return (vaddr & PAGE_MASK) == VSYSCALL_ADDR;
+}
+
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		       unsigned long address, u32 *pkey, int si_code)
@@ -869,7 +878,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	 * emulation.
 	 */
 	if (unlikely((error_code & X86_PF_INSTR) &&
-		     ((address & ~0xfff) == VSYSCALL_ADDR))) {
+		     is_vsyscall_vaddr(address))) {
 		if (emulate_vsyscall(regs, address))
 			return;
 	}
--
cgit v1.2.3

From 3ae0ad92f53e0f05cf6ab781230b7902b88f73cd Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 28 Sep 2018 09:02:30 -0700
Subject: x86/mm/vsyscall: Consider vsyscall page part of user address space

The vsyscall page is weird.  It is in what is traditionally part of
the kernel address space.  But, it has user permissions and we handle
faults on it like we would on a user page: interrupts on.

Right now, we handle vsyscall emulation in the "bad_area" code, which
is used for both user-address-space and kernel-address-space faults.
Move the handling to the user-address-space code *only* and ensure we
get there by "excluding" the vsyscall page from the kernel address
space via a check in fault_in_kernel_space().

Since the fault_in_kernel_space() check is used on 32-bit, also add a
64-bit check to make it clear we only use this path on 64-bit.  Also
move the unlikely() to be in is_vsyscall_vaddr() itself.

This helps clean up the kernel fault handling path by removing a case
that can happen in normal[1] operation.  (Yeah, yeah, we can argue
about the vsyscall page being "normal" or not.)  This also makes
sanity checks easier, like the "we never take pkey faults in the
kernel address space" check in the next patch.

Cc: x86@kernel.org
Cc: Jann Horn
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Dave Hansen
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/20180928160230.6E9336EE@viggo.jf.intel.com
---
 arch/x86/mm/fault.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7a627ac3a0d2..7e0fa7e24168 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -846,7 +846,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
  */
 static bool is_vsyscall_vaddr(unsigned long vaddr)
 {
-	return (vaddr & PAGE_MASK) == VSYSCALL_ADDR;
+	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
 }
 
 static void
@@ -872,18 +872,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	if (is_errata100(regs, address))
 		return;
 
-#ifdef CONFIG_X86_64
-	/*
-	 * Instruction fetch faults in the vsyscall page might need
-	 * emulation.
-	 */
-	if (unlikely((error_code & X86_PF_INSTR) &&
-		     is_vsyscall_vaddr(address))) {
-		if (emulate_vsyscall(regs, address))
-			return;
-	}
-#endif
-
 	/*
 	 * To avoid leaking information about the kernel page table
 	 * layout, pretend that user-mode accesses to kernel addresses
@@ -1192,6 +1180,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 
 static int fault_in_kernel_space(unsigned long address)
 {
+	/*
+	 * On 64-bit systems, the vsyscall page is at an address above
+	 * TASK_SIZE_MAX, but is not considered part of the kernel
+	 * address space.
+	 */
+	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
+		return false;
+
 	return address >= TASK_SIZE_MAX;
 }
 
@@ -1359,6 +1355,22 @@ void do_user_addr_fault(struct pt_regs *regs,
 	if (sw_error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef CONFIG_X86_64
+	/*
+	 * Instruction fetch faults in the vsyscall page might need
+	 * emulation.  The vsyscall page is at a high address
+	 * (>PAGE_OFFSET), but is considered to be part of the user
+	 * address space.
+	 *
+	 * The vsyscall page does not have a "real" VMA, so do this
+	 * emulation before we go searching for VMAs.
+	 */
+	if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+		if (emulate_vsyscall(regs, address))
+			return;
+	}
+#endif
+
 	/*
 	 * Kernel-mode access to the user address space should only occur
 	 * on well-defined single instructions listed in the exception
--
cgit v1.2.3

From 367e3f1d3fc9bbf1e626da7aea527f40babf8079 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 28 Sep 2018 09:02:31 -0700
Subject: x86/mm: Remove spurious fault pkey check

Spurious faults only ever occur in the kernel's address space.  They
are also constrained specifically to faults with one of these error
codes:

	X86_PF_WRITE | X86_PF_PROT
	X86_PF_INSTR | X86_PF_PROT

So, it's never even possible to reach spurious_kernel_fault_check()
with X86_PF_PK set.

In addition, the kernel's address space never has pages with
user-mode protections.  Protection Keys are only enforced on pages
with user-mode protection.

This gives us lots of reasons to not check for protection keys in our
spurious kernel fault handling.

But, let's also add some warnings to ensure that these assumptions
about protection keys hold true.

Cc: x86@kernel.org
Cc: Jann Horn
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Andy Lutomirski
Signed-off-by: Dave Hansen
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/20180928160231.243A0D6A@viggo.jf.intel.com
---
 arch/x86/mm/fault.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7e0fa7e24168..a16652982f98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1037,12 +1037,6 @@ static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
 		return 0;
 
-	/*
-	 * Note: We do not do lazy flushing on protection key
-	 * changes, so no spurious fault will ever set X86_PF_PK.
-	 */
-	if ((error_code & X86_PF_PK))
-		return 1;
 
 	return 1;
 }
@@ -1217,6 +1211,13 @@ static void
 do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 		   unsigned long address)
 {
+	/*
+	 * Protection keys exceptions only happen on user pages.  We
+	 * have no user pages in the kernel portion of the address
+	 * space, so do not expect them here.
+	 */
+	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
+
 	/*
 	 * We can fault-in kernel-space virtual memory on-demand. The
 	 * 'reference' page table is init_mm.pgd.
--
cgit v1.2.3

From 162041425193602b15774c61740ad8e7dc157df3 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 19 Oct 2018 07:08:42 -0700
Subject: x86/mm: Kill stray kernel fault handling comment

I originally had matching user and kernel comments, but the kernel
one got improved.  Some errant conflict resolution kicked the comment
somewhere wrong.  Kill it.

Reported-by: Eric W. Biederman
Signed-off-by: Dave Hansen
Cc: Andy Lutomirski
Cc: Jann Horn
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Sean Christopherson
Cc: Thomas Gleixner
Fixes: aa37c51b94 ("x86/mm: Break out user address space handling")
Link: http://lkml.kernel.org/r/20181019140842.12F929FA@viggo.jf.intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a16652982f98..bd047438d23d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -963,7 +963,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
 	__bad_area(regs, error_code, address, vma, SEGV_ACCERR);
 }
 
-/* Handle faults in the kernel portion of the address space */
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	  u32 *pkey, unsigned int fault)
--
cgit v1.2.3
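
A note on two of the checks above, both easy to sanity-check from user space.
The following standalone C program is an illustrative sketch, not kernel code:
VSYSCALL_ADDR, PAGE_SIZE and X86_PF_USER are hard-coded to their x86-64 values,
and derive_sw_error_code() is a hypothetical stand-in for the
hw_error_code/sw_error_code adjustment added in "x86/mm: Clarify hardware vs.
software "error_code"".  It mimics is_vsyscall_vaddr() and shows why the
signedness of the open-coded ~0xfff mask replaced in "x86/mm: Add vsyscall
address helper" is subtle:

	#include <assert.h>
	#include <stdio.h>

	#define PAGE_SIZE	4096UL
	#define PAGE_MASK	(~(PAGE_SIZE - 1))	/* 0xfffffffffffff000 */
	#define VSYSCALL_ADDR	0xffffffffff600000UL
	#define X86_PF_USER	(1UL << 2)	/* the "4" in the old comments */

	/* Mimics the kernel helper added above (illustrative values only). */
	static int is_vsyscall_vaddr(unsigned long vaddr)
	{
		return (vaddr & PAGE_MASK) == VSYSCALL_ADDR;
	}

	/*
	 * Hypothetical stand-in for the hw->sw adjustment in __do_page_fault():
	 * if the CPU was in user mode but the hardware error code lacks
	 * X86_PF_USER, the access was an implicit kernel-mode access, and the
	 * software copy gets the bit set anyway.
	 */
	static unsigned long derive_sw_error_code(unsigned long hw_error_code,
						  int cpu_was_in_user_mode)
	{
		unsigned long sw_error_code = hw_error_code;

		if (cpu_was_in_user_mode && !(hw_error_code & X86_PF_USER))
			sw_error_code |= X86_PF_USER;
		return sw_error_code;
	}

	int main(void)
	{
		unsigned long addr = VSYSCALL_ADDR + 0x123;

		/* Any address inside the single vsyscall page matches: */
		assert(is_vsyscall_vaddr(VSYSCALL_ADDR));
		assert(is_vsyscall_vaddr(VSYSCALL_ADDR + 0xfff));
		assert(!is_vsyscall_vaddr(VSYSCALL_ADDR + PAGE_SIZE));

		/*
		 * ~0xfff happens to work because 0xfff is a signed int:
		 * ~0xfff is the int -4096, which sign-extends to
		 * 0xfffffffffffff000 when converted to unsigned long.  An
		 * unsigned int constant would zero-extend instead and
		 * silently clear the high 32 bits:
		 */
		assert((addr & ~0xfff) == (addr & PAGE_MASK));	/* works, subtly */
		assert((addr & ~0xfffU) != (addr & PAGE_MASK));	/* the trap */

		/* Hardware PFEC without X86_PF_USER, taken in user mode: */
		assert(derive_sw_error_code(0x1, 1) & X86_PF_USER);

		printf("all checks passed\n");
		return 0;
	}

On an LP64 build every assertion holds; the ~0xfffU case is exactly the
implicit-conversion trap that makes the open-coded mask less obviously
correct than PAGE_MASK.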