diff options
author | David S. Miller <davem@davemloft.net> | 2018-01-09 10:37:00 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-01-09 10:37:00 -0500 |
commit | a0ce093180f2bbb832b3f5583adc640ad67ea568 (patch) | |
tree | 62c9aca5be3566ea4810e4584a02870de8b953f8 /arch/x86/kernel | |
parent | f4803f1b73f877a571be4c8e531dfcf190acc691 (diff) | |
parent | ef7f8cec80a0ba7bd00ece46844c8994117dc910 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/apic/apic.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/apic/apic_flat_64.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/apic/apic_noop.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/apic/io_apic.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/apic/msi.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/apic/probe_32.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/apic/vector.c | 20 | ||||
-rw-r--r-- | arch/x86/kernel/apic/x2apic_cluster.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 9 | ||||
-rw-r--r-- | arch/x86/kernel/dumpstack.c | 37 | ||||
-rw-r--r-- | arch/x86/kernel/head_64.S | 30 | ||||
-rw-r--r-- | arch/x86/kernel/ldt.c | 151 | ||||
-rw-r--r-- | arch/x86/kernel/machine_kexec_32.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/process.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/smpboot.c | 9 | ||||
-rw-r--r-- | arch/x86/kernel/stacktrace.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/tls.c | 11 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/vmlinux.lds.S | 8 |
21 files changed, 259 insertions, 61 deletions
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6e272f3ea984..880441f24146 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2626,11 +2626,13 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; +#ifdef CONFIG_X86_64 else { pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } +#endif return 0; } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index aa85690e9b64..25a87028cb3f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 7b659c4480c9..5078b5ce63a7 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 201579dc5242..8a7963421460 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, } int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { unsigned long flags; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be764422..ce503c99f5c4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ((apic->irq_dest_mode == 0) ? MSI_ADDR_DEST_MODE_PHYSICAL : MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fa22017de806..02e8acb134f8 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 750449152b04..f8b03bb8e725 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd) irq_matrix_reserve(vector_matrix); apicd->can_reserve = true; apicd->has_reserved = true; + irqd_set_can_reserve(irqd); trace_vector_reserve(irqd->irq, 0); vector_assign_managed_shutdown(irqd); } @@ -368,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd) int ret; ret = assign_irq_vector_any_locked(irqd); - if (!ret) + if (!ret) { apicd->has_reserved = false; + /* + * Core might have disabled reservation mode after + * allocating the irq descriptor. Ideally this should + * happen before allocation time, but that would require + * completely convoluted ways of transporting that + * information. + */ + if (!irqd_can_reserve(irqd)) + apicd->can_reserve = false; + } return ret; } @@ -398,21 +409,21 @@ static int activate_managed(struct irq_data *irqd) } static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, - bool early) + bool reserve) { struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned long flags; int ret = 0; trace_vector_activate(irqd->irq, apicd->is_managed, - apicd->can_reserve, early); + apicd->can_reserve, reserve); /* Nothing to do for fixed assigned vectors */ if (!apicd->can_reserve && !apicd->is_managed) return 0; raw_spin_lock_irqsave(&vector_lock, flags); - if (early || irqd_is_managed_and_shutdown(irqd)) + if (reserve || irqd_is_managed_and_shutdown(irqd)) vector_assign_managed_shutdown(irqd); else if (apicd->is_managed) ret = activate_managed(irqd); @@ -478,6 +489,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd, } else { /* Release the vector */ apicd->can_reserve = true; + irqd_set_can_reserve(irqd); clear_irq_vector(irqd); realloc = true; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 622f13ca8a94..8b04234e010b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 676b7cf4b62b..76417a9aab73 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -17,6 +17,7 @@ #include <asm/sigframe.h> #include <asm/bootparam.h> #include <asm/suspend.h> +#include <asm/tlbflush.h> #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -94,6 +95,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9757f07d738..39d7ea865207 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -922,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + if (c->x86_vendor != X86_VENDOR_AMD) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + fpu__init_system(c); #ifdef CONFIG_X86_32 @@ -1360,7 +1364,10 @@ void syscall_init(void) (entry_SYSCALL_64_trampoline - _entry_trampoline); wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + if (static_cpu_has(X86_FEATURE_PTI)) + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + else + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 36b17e0febe8..afbecff161d1 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs) regs->sp, regs->flags); } -static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) +static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, + bool partial) { - if (on_stack(info, regs, sizeof(*regs))) + /* + * These on_stack() checks aren't strictly necessary: the unwind code + * has already validated the 'regs' pointer. The checks are done for + * ordering reasons: if the registers are on the next stack, we don't + * want to print them out yet. Otherwise they'll be shown as part of + * the wrong stack. Later, when show_trace_log_lvl() switches to the + * next stack, this function will be called again with the same regs so + * they can be printed in the right context. + */ + if (!partial && on_stack(info, regs, sizeof(*regs))) { __show_regs(regs, 0); - else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, - IRET_FRAME_SIZE)) { + + } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { /* * When an interrupt or exception occurs in entry code, the * full pt_regs might not have been saved yet. In that case @@ -98,11 +109,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, struct stack_info stack_info = {0}; unsigned long visit_mask = 0; int graph_idx = 0; + bool partial; printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); stack = stack ? : get_stack_pointer(task, regs); + regs = unwind_get_entry_regs(&state, &partial); /* * Iterate through the stacks, starting with the current stack pointer. @@ -120,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; if (get_stack_info(stack, task, &stack_info, &visit_mask)) { @@ -140,7 +153,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_safe(&stack_info, regs); + show_regs_if_on_stack(&stack_info, regs, partial); /* * Scan the stack, printing any text addresses we find. At the @@ -164,7 +177,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* * Don't print regs->ip again if it was already printed - * by show_regs_safe() below. + * by show_regs_if_on_stack(). */ if (regs && stack == ®s->ip) goto next; @@ -199,9 +212,9 @@ next: unwind_next_frame(&state); /* if the frame has entry regs, print them */ - regs = unwind_get_entry_regs(&state); + regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_safe(&stack_info, regs); + show_regs_if_on_stack(&stack_info, regs, partial); } if (stack_name) @@ -297,11 +310,13 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7dca675fe78d..04a625f0fcda 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag) .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define PTI_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -350,13 +371,14 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_top_pgt) +NEXT_PGD_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts) .data #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt) */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #else -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt) .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 #endif #ifdef CONFIG_X86_5LEVEL diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a6b5d62f45a7..26d713ecad34 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -24,6 +24,7 @@ #include <linux/uaccess.h> #include <asm/ldt.h> +#include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> #include <asm/syscalls.h> @@ -51,13 +52,11 @@ static void refresh_ldt_segments(void) static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; - mm_context_t *pc; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = &mm->context; - set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + load_mm_ldt(mm); refresh_ldt_segments(); } @@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return NULL; } + /* The new LDT isn't aliased for PTI yet. */ + new_ldt->slot = -1; + new_ldt->nr_entries = num_entries; return new_ldt; } +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + * + * There is no corresponding unmap function. Even if the LDT is freed, we + * leave the PTEs around until the slot is reused or the mm is destroyed. + * This is harmless: the LDT is always in ordinary memory, and no one will + * access the freed slot. + * + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make + * it useful, and the flush would slow down modify_ldt(). + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + bool is_vmalloc, had_top_level_entry; + unsigned long va; + spinlock_t *ptl; + pgd_t *pgd; + int i; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return 0; + + /* + * Any given ldt_struct should have map_ldt_struct() called at most + * once. + */ + WARN_ON(ldt->slot != -1); + + /* + * Did we already have the top level entry allocated? We can't + * use pgd_none() for this because it doens't do anything on + * 4-level page table kernels. + */ + pgd = pgd_offset(mm, LDT_BASE_ADDR); + had_top_level_entry = (pgd->pgd != 0); + + is_vmalloc = is_vmalloc_addr(ldt->entries); + + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { + unsigned long offset = i << PAGE_SHIFT; + const void *src = (char *)ldt->entries + offset; + unsigned long pfn; + pte_t pte, *ptep; + + va = (unsigned long)ldt_slot_va(slot) + offset; + pfn = is_vmalloc ? vmalloc_to_pfn(src) : + page_to_pfn(virt_to_page(src)); + /* + * Treat the PTI LDT range as a *userspace* range. + * get_locked_pte() will allocate all needed pagetables + * and account for them in this mm. + */ + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; + /* + * Map it RO so the easy to find address is not a primary + * target via some kernel interface which misses a + * permission check. + */ + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } + + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON(kernel_to_user_pgdp(pgd)->pgd); + set_pgd(kernel_to_user_pgdp(pgd), *pgd); + } + } + + va = (unsigned long)ldt_slot_va(slot); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + + ldt->slot = slot; +#endif + return 0; +} + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct mmu_gather tlb; + unsigned long start = LDT_BASE_ADDR; + unsigned long end = start + (1UL << PGDIR_SHIFT); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + tlb_gather_mmu(&tlb, mm, start, end); + free_pgd_range(&tlb, start, end, start, end); + tlb_finish_mmu(&tlb, start, end); +#endif +} + /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { @@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); + retval = map_ldt_struct(mm, new_ldt, 0); + if (retval) { + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } mm->context.ldt = new_ldt; out_unlock: @@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm) mm->context.ldt = NULL; } +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ + free_ldt_pgtables(mm); +} + static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; @@ -287,6 +413,25 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); + /* + * If we are using PTI, map the new LDT into the userspace pagetables. + * If there is already an LDT, use the other slot so that other CPUs + * will continue to use the old LDT until install_ldt() switches + * them over to the new LDT. + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { + /* + * This only can fail for the first LDT setup. If an LDT is + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ + if (!WARN_ON_ONCE(old_ldt)) + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + install_ldt(mm, new_ldt); free_ldt_struct(old_ldt); error = 0; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 00bc751c861c..edfede768688 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -48,8 +48,6 @@ static void load_segments(void) "\tmovl $"STR(__KERNEL_DS)",%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" : : : "eax", "memory"); #undef STR @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); idt_invalidate(phys_to_virt(0)); + set_gdt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index aed9d94bd46f..832a6acd730f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { +__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8af2e8d0c0a1..145810b0edf6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -906,9 +906,6 @@ void __init setup_arch(char **cmdline_p) set_bit(EFI_BOOT, &efi.flags); set_bit(EFI_64BIT, &efi.flags); } - - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); #endif x86_init.oem.arch_setup(); @@ -962,6 +959,8 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c5970efa8557..ed556d50d7ed 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -126,14 +126,10 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0xa, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; - pr_debug("2.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; - pr_debug("3.\n"); } static inline void smpboot_restore_warm_reset_vector(void) @@ -141,11 +137,6 @@ static inline void smpboot_restore_warm_reset_vector(void) unsigned long flags; /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* * Paranoid: Set warm reset code and vector here back * to default values. */ diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 77835bc021c7..093f2ea5dd56 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -102,7 +102,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { - regs = unwind_get_entry_regs(&state); + regs = unwind_get_entry_regs(&state, NULL); if (regs) { /* * Kernel mode registers on the stack indicate an @@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk, { int ret; + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ if (!try_get_task_stack(tsk)) - return -EINVAL; + return 0; ret = __save_stack_trace_reliable(trace, tsk); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9a9c9b076955..a5b802a12212 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) { + if (LDT_empty(info) || LDT_zero(info)) memset(desc, 0, sizeof(*desc)); - } else { + else fill_ldt(desc, info); - - /* - * Always set the accessed bit so that the CPU - * doesn't try to write to the (read-only) GDT. - */ - desc->type |= 1; - } ++info; ++desc; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f69dbd47d733..446c9ef8cfc3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * No need for ist_enter here because we don't use RCU. */ - if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d2a8b5a24a44..1e413a9326aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -61,11 +61,17 @@ jiffies_64 = jiffies; . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; +#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); +#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); + #else #define X64_ALIGN_RODATA_BEGIN #define X64_ALIGN_RODATA_END +#define ALIGN_ENTRY_TEXT_BEGIN +#define ALIGN_ENTRY_TEXT_END + #endif PHDRS { @@ -102,8 +108,10 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT + ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT IRQENTRY_TEXT + ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) |