Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 2
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 26
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 3
-rw-r--r--  arch/x86/kernel/amd_nb.c | 5
-rw-r--r--  arch/x86/kernel/apic/apic.c | 41
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 2
-rw-r--r--  arch/x86/kernel/apic/msi.c | 3
-rw-r--r--  arch/x86/kernel/apic/vector.c | 5
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 3
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 3
-rw-r--r--  arch/x86/kernel/cpu/acrn.c | 9
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 200
-rw-r--r--  arch/x86/kernel/cpu/common.c | 77
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 238
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 206
-rw-r--r--  arch/x86/kernel/cpu/mce/dev-mcelog.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mce/inject.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mce/internal.h | 12
-rw-r--r--  arch/x86/kernel/cpu/mce/p5.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mce/severity.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mce/therm_throt.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mce/threshold.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mce/winchip.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 22
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 6
-rw-r--r--  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6
-rw-r--r--  arch/x86/kernel/crash_core_32.c | 2
-rw-r--r--  arch/x86/kernel/crash_core_64.c | 2
-rw-r--r--  arch/x86/kernel/doublefault_32.c | 11
-rw-r--r--  arch/x86/kernel/dumpstack.c | 9
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 7
-rw-r--r--  arch/x86/kernel/e820.c | 10
-rw-r--r--  arch/x86/kernel/early_printk.c | 2
-rw-r--r--  arch/x86/kernel/espfix_64.c | 2
-rw-r--r--  arch/x86/kernel/ftrace_64.S | 2
-rw-r--r--  arch/x86/kernel/head64.c | 2
-rw-r--r--  arch/x86/kernel/head_64.S | 9
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 100
-rw-r--r--  arch/x86/kernel/i8259.c | 2
-rw-r--r--  arch/x86/kernel/idt.c | 226
-rw-r--r--  arch/x86/kernel/irq.c | 66
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irq_64.c | 6
-rw-r--r--  arch/x86/kernel/irq_work.c | 6
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 9
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 6
-rw-r--r--  arch/x86/kernel/kvm.c | 16
-rw-r--r--  arch/x86/kernel/ldt.c | 2
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 1
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 1
-rw-r--r--  arch/x86/kernel/module.c | 1
-rw-r--r--  arch/x86/kernel/nmi.c | 75
-rw-r--r--  arch/x86/kernel/paravirt.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 28
-rw-r--r--  arch/x86/kernel/process_32.c | 1
-rw-r--r--  arch/x86/kernel/process_64.c | 1
-rw-r--r--  arch/x86/kernel/ptrace.c | 1
-rw-r--r--  arch/x86/kernel/reboot.c | 10
-rw-r--r--  arch/x86/kernel/smp.c | 37
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/kernel/sys_ia32.c | 40
-rw-r--r--  arch/x86/kernel/tboot.c | 3
-rw-r--r--  arch/x86/kernel/time.c | 4
-rw-r--r--  arch/x86/kernel/tracepoint.c | 17
-rw-r--r--  arch/x86/kernel/traps.c | 548
-rw-r--r--  arch/x86/kernel/unwind_frame.c | 8
-rw-r--r--  arch/x86/kernel/vm86_32.c | 4
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 5
73 files changed, 1297 insertions, 918 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8ef4369a4f06..e77261db2391 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,6 +28,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
+# With some compiler versions the generated code results in boot hangs, caused
+# by several compilation units. To be safe, disable all instrumentation.
+KCSAN_SANITIZE := n
+
OBJECT_FILES_NON_STANDARD_test_nx.o := y
OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 683ed9e12e6b..7bdc0239a943 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -20,11 +20,11 @@
#include <linux/pci.h>
#include <linux/efi-bgrt.h>
#include <linux/serial_core.h>
+#include <linux/pgtable.h>
#include <asm/e820/api.h>
#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
-#include <asm/pgtable.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/io.h>
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ed3b04483972..cc1fea76aab0 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -10,9 +10,9 @@
#include <linux/memblock.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
+#include <linux/pgtable.h>
#include <asm/segment.h>
#include <asm/desc.h>
-#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/realmode.h>
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index cd617979b7fc..8fd39ff74a49 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -18,7 +18,6 @@
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
-#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
@@ -1012,28 +1011,29 @@ struct bp_patching_desc {
static struct bp_patching_desc *bp_desc;
-static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
+static __always_inline
+struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
- struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */
+ struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */
- if (!desc || !atomic_inc_not_zero(&desc->refs))
+ if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
return NULL;
return desc;
}
-static inline void put_desc(struct bp_patching_desc *desc)
+static __always_inline void put_desc(struct bp_patching_desc *desc)
{
smp_mb__before_atomic();
- atomic_dec(&desc->refs);
+ arch_atomic_dec(&desc->refs);
}
-static inline void *text_poke_addr(struct text_poke_loc *tp)
+static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
return _stext + tp->rel_addr;
}
-static int notrace patch_cmp(const void *key, const void *elt)
+static __always_inline int patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
@@ -1043,9 +1043,8 @@ static int notrace patch_cmp(const void *key, const void *elt)
return 1;
return 0;
}
-NOKPROBE_SYMBOL(patch_cmp);
-int notrace poke_int3_handler(struct pt_regs *regs)
+int noinstr poke_int3_handler(struct pt_regs *regs)
{
struct bp_patching_desc *desc;
struct text_poke_loc *tp;
@@ -1078,9 +1077,9 @@ int notrace poke_int3_handler(struct pt_regs *regs)
* Skip the binary search if there is a single member in the vector.
*/
if (unlikely(desc->nr_entries > 1)) {
- tp = bsearch(ip, desc->vec, desc->nr_entries,
- sizeof(struct text_poke_loc),
- patch_cmp);
+ tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
+ sizeof(struct text_poke_loc),
+ patch_cmp);
if (!tp)
goto out_put;
} else {
@@ -1119,7 +1118,6 @@ out_put:
put_desc(desc);
return ret;
}
-NOKPROBE_SYMBOL(poke_int3_handler);
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 16133819415c..17cb5b933dcf 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -33,7 +33,6 @@
#include <linux/atomic.h>
#include <linux/dma-direct.h>
#include <asm/mtrr.h>
-#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
@@ -159,7 +158,7 @@ static void dump_leak(void)
return;
dump = 1;
- show_stack(NULL, NULL);
+ show_stack(NULL, NULL, KERN_ERR);
debug_dma_dump_mappings(NULL);
}
#endif
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index b6b3297851f3..18f6b7c4bd79 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -18,9 +18,11 @@
#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
+#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
+#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
@@ -33,6 +35,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
{}
};
@@ -50,6 +53,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
@@ -65,6 +69,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4b1d31be50b4..e0e2f020ec02 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1088,23 +1088,14 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- *
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- entering_ack_irq();
+ ack_APIC_irq();
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);
- exiting_irq();
set_irq_regs(old_regs);
}
@@ -2060,7 +2051,7 @@ void __init init_apic_mappings(void)
unsigned int new_apicid;
if (apic_validate_deadline_timer())
- pr_debug("TSC deadline timer available\n");
+ pr_info("TSC deadline timer available\n");
if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
@@ -2120,15 +2111,21 @@ void __init register_lapic_address(unsigned long address)
* Local APIC interrupts
*/
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
+/**
+ * spurious_interrupt - Catch all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @vector: The vector number
+ *
+ * This is invoked from ASM entry code to catch all interrupts which
+ * trigger on an entry which is routed to the common_spurious idtentry
+ * point.
+ *
+ * Also called from sysvec_spurious_apic_interrupt().
*/
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_IRQ(spurious_interrupt)
{
- u8 vector = ~regs->orig_ax;
u32 v;
- entering_irq();
trace_spurious_apic_entry(vector);
inc_irq_stat(irq_spurious_count);
@@ -2158,13 +2155,17 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
}
out:
trace_spurious_apic_exit(vector);
- exiting_irq();
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
+{
+ __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR);
}
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
{
static const char * const error_interrupt_reason[] = {
"Send CS error", /* APIC Error Bit 0 */
@@ -2178,7 +2179,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
};
u32 v, i = 0;
- entering_irq();
trace_error_apic_entry(ERROR_APIC_VECTOR);
/* First tickle the hardware, only then report what went on. -- REW */
@@ -2202,7 +2202,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
apic_printk(APIC_DEBUG, KERN_CONT "\n");
trace_error_apic_exit(ERROR_APIC_VECTOR);
- exiting_irq();
}
/**
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index cdf45b4700f2..35edd57f064a 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -12,11 +12,11 @@
*/
#include <linux/types.h>
#include <linux/init.h>
+#include <linux/pgtable.h>
#include <asm/numachip/numachip.h>
#include <asm/numachip/numachip_csr.h>
-#include <asm/pgtable.h>
#include "local.h"
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 159bd0cb8548..5cbaca58af95 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -115,7 +115,8 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
* denote it as spurious which is no harm as this is a rare event
* and interrupt handlers have to cope with spurious interrupts
* anyway. If the vector is unused, then it is marked so it won't
- * trigger the 'No irq handler for vector' warning in do_IRQ().
+ * trigger the 'No irq handler for vector' warning in
+ * common_interrupt().
*
* This requires to hold vector lock to prevent concurrent updates to
* the affected vector.
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 67768e54438b..c48be6e1f676 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -861,13 +861,13 @@ static void free_moved_vector(struct apic_chip_data *apicd)
apicd->move_in_progress = 0;
}
-asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
+DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup)
{
struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
struct apic_chip_data *apicd;
struct hlist_node *tmp;
- entering_ack_irq();
+ ack_APIC_irq();
/* Prevent vectors vanishing under us */
raw_spin_lock(&vector_lock);
@@ -892,7 +892,6 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
}
raw_spin_unlock(&vector_lock);
- exiting_irq();
}
static void __send_cleanup_vector(struct apic_chip_data *apicd)
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index c2a47016f243..828be792231e 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -57,9 +57,6 @@ int main(void)
BLANK();
#undef ENTRY
- OFFSET(TSS_ist, tss_struct, x86_tss.ist);
- DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) -
- offsetof(struct cea_exception_stacks, DB1_stack));
BLANK();
#ifdef CONFIG_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7dc4ad68eb41..dba6a83bc349 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,6 +13,9 @@ endif
KCOV_INSTRUMENT_common.o := n
KCOV_INSTRUMENT_perf_event.o := n
+# As above, instrumenting secondary CPU boot code causes boot hangs.
+KCSAN_SANITIZE_common.o := n
+
# Make sure load_percpu_segment has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 676022e71791..1da9b1c9a2db 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -10,10 +10,10 @@
*/
#include <linux/interrupt.h>
-#include <asm/acrn.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
+#include <asm/idtentry.h>
#include <asm/irq_regs.h>
static uint32_t __init acrn_detect(void)
@@ -24,7 +24,7 @@ static uint32_t __init acrn_detect(void)
static void __init acrn_init_platform(void)
{
/* Setup the IDT for ACRN hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector);
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
}
static bool acrn_x2apic_available(void)
@@ -39,7 +39,7 @@ static bool acrn_x2apic_available(void)
static void (*acrn_intr_handler)(void);
-__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -50,13 +50,12 @@ __visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs)
* will block the interrupt whose vector is lower than
* HYPERVISOR_CALLBACK_VECTOR.
*/
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(irq_hv_callback_count);
if (acrn_intr_handler)
acrn_intr_handler();
- exiting_irq();
set_irq_regs(old_regs);
}
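[Editor's aside, not part of the patch: the acrn.c hunk above shows the conversion pattern this series applies to every system vector. The handler body is defined with DEFINE_IDTENTRY_SYSVEC(sysvec_<name>) and the IDT gate is pointed at the generated asm_sysvec_<name> stub via alloc_intr_gate(). A compressed, illustrative sketch of that pairing follows; the vector name is hypothetical and the macro's declaration/expansion details are intentionally omitted, so treat it as a sketch rather than the actual kernel code.]

#include <asm/idtentry.h>
#include <asm/desc.h>
#include <asm/apic.h>

/* Hypothetical example vector, for illustration only. */
DEFINE_IDTENTRY_SYSVEC(sysvec_example_callback)
{
	ack_APIC_irq();		/* ACK early, as the converted handlers do */
	inc_irq_stat(irq_hv_callback_count);
	/* ... handle the event ... */
}

static void __init example_init_platform(void)
{
	/* Route the vector to the ASM stub that belongs to the handler above. */
	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_example_callback);
}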
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ed54b3b21c39..0b71970d2d3d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -15,6 +15,7 @@
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched/smt.h>
+#include <linux/pgtable.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -26,7 +27,6 @@
#include <asm/vmx.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
-#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>
#include <asm/e820/api.h>
@@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
static void __init mds_print_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init srbds_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -108,6 +109,7 @@ void __init check_bugs(void)
l1tf_select_mitigation();
mds_select_mitigation();
taa_select_mitigation();
+ srbds_select_mitigation();
/*
* As MDS and TAA mitigations are inter-related, print MDS
@@ -398,6 +400,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str)
early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "SRBDS: " fmt
+
+enum srbds_mitigations {
+ SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_UCODE_NEEDED,
+ SRBDS_MITIGATION_FULL,
+ SRBDS_MITIGATION_TSX_OFF,
+ SRBDS_MITIGATION_HYPERVISOR,
+};
+
+static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+
+static const char * const srbds_strings[] = {
+ [SRBDS_MITIGATION_OFF] = "Vulnerable",
+ [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled",
+ [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+static bool srbds_off;
+
+void update_srbds_msr(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ switch (srbds_mitigation) {
+ case SRBDS_MITIGATION_OFF:
+ case SRBDS_MITIGATION_TSX_OFF:
+ mcu_ctrl |= RNGDS_MITG_DIS;
+ break;
+ case SRBDS_MITIGATION_FULL:
+ mcu_ctrl &= ~RNGDS_MITG_DIS;
+ break;
+ default:
+ break;
+ }
+
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+}
+
+static void __init srbds_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ /*
+ * Check to see if this is one of the MDS_NO systems supporting
+ * TSX that are only exposed to SRBDS when TSX is enabled.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM))
+ srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
+ else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
+ else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
+ else if (cpu_mitigations_off() || srbds_off)
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+
+ update_srbds_msr();
+ pr_info("%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static int __init srbds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return 0;
+
+ srbds_off = !strcmp(str, "off");
+ return 0;
+}
+early_param("srbds", srbds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -495,7 +588,9 @@ early_param("nospectre_v1", nospectre_v1_cmdline);
static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
SPECTRE_V2_NONE;
-static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
+static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
SPECTRE_V2_USER_NONE;
#ifdef CONFIG_RETPOLINE
@@ -641,15 +736,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
break;
}
- /*
- * At this point, an STIBP mode other than "off" has been set.
- * If STIBP support is not being forced, check if STIBP always-on
- * is preferred.
- */
- if (mode != SPECTRE_V2_USER_STRICT &&
- boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
- mode = SPECTRE_V2_USER_STRICT_PREFERRED;
-
/* Initialize Indirect Branch Prediction Barrier */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
@@ -672,23 +758,36 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
static_key_enabled(&switch_mm_always_ibpb) ?
"always-on" : "conditional");
+
+ spectre_v2_user_ibpb = mode;
}
- /* If enhanced IBRS is enabled no STIBP required */
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ /*
+ * If enhanced IBRS is enabled or SMT impossible, STIBP is not
+ * required.
+ */
+ if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return;
/*
- * If SMT is not possible or STIBP is not available clear the STIBP
- * mode.
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if (mode != SPECTRE_V2_USER_STRICT &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+ /*
+ * If STIBP is not available, clear the STIBP mode.
*/
- if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
mode = SPECTRE_V2_USER_NONE;
+
+ spectre_v2_user_stibp = mode;
+
set_mode:
- spectre_v2_user = mode;
- /* Only print the STIBP mode when SMT possible */
- if (smt_possible)
- pr_info("%s\n", spectre_v2_user_strings[mode]);
+ pr_info("%s\n", spectre_v2_user_strings[mode]);
}
static const char * const spectre_v2_strings[] = {
@@ -921,7 +1020,7 @@ void cpu_bugs_smt_update(void)
{
mutex_lock(&spec_ctrl_mutex);
- switch (spectre_v2_user) {
+ switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
break;
case SPECTRE_V2_USER_STRICT:
@@ -1164,14 +1263,19 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
case PR_SPEC_ENABLE:
- if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return 0;
/*
* Indirect branch speculation is always disabled in strict
- * mode.
+ * mode. It can neither be enabled if it was force-disabled
+ * by a previous prctl call.
+
*/
- if (spectre_v2_user == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ task_spec_ib_force_disable(task))
return -EPERM;
task_clear_spec_ib_disable(task);
task_update_spec_tif(task);
@@ -1182,10 +1286,12 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
* Indirect branch speculation is always allowed when
* mitigation is force disabled.
*/
- if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return -EPERM;
- if (spectre_v2_user == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
return 0;
task_set_spec_ib_disable(task);
if (ctrl == PR_SPEC_FORCE_DISABLE)
@@ -1216,7 +1322,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
{
if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
- if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP)
ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
}
#endif
@@ -1247,22 +1354,24 @@ static int ib_prctl_get(struct task_struct *task)
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
return PR_SPEC_NOT_AFFECTED;
- switch (spectre_v2_user) {
- case SPECTRE_V2_USER_NONE:
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return PR_SPEC_ENABLE;
- case SPECTRE_V2_USER_PRCTL:
- case SPECTRE_V2_USER_SECCOMP:
+ else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) {
if (task_spec_ib_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ib_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- case SPECTRE_V2_USER_STRICT:
- case SPECTRE_V2_USER_STRICT_PREFERRED:
- return PR_SPEC_DISABLE;
- default:
+ } else
return PR_SPEC_NOT_AFFECTED;
- }
}
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
@@ -1501,7 +1610,7 @@ static char *stibp_state(void)
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return "";
- switch (spectre_v2_user) {
+ switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
return ", STIBP: disabled";
case SPECTRE_V2_USER_STRICT:
@@ -1528,6 +1637,11 @@ static char *ibpb_state(void)
return "";
}
+static ssize_t srbds_show_state(char *buf)
+{
+ return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -1572,6 +1686,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_ITLB_MULTIHIT:
return itlb_multihit_show_state(buf);
+ case X86_BUG_SRBDS:
+ return srbds_show_state(buf);
+
default:
break;
}
@@ -1618,4 +1735,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr
{
return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
}
+
+ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
+}
#endif
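[Editor's aside, not part of the patch: cpu_show_srbds() added above backs a sysfs attribute; on mainline kernels it is expected to appear as /sys/devices/system/cpu/vulnerabilities/srbds and print one of the srbds_strings[] entries, with "srbds=off" available on the kernel command line. The exact path is an assumption based on the hunks above. A minimal userspace sketch that reads it:]

#include <stdio.h>

int main(void)
{
	char buf[128];
	/* Path assumed from the standard cpu vulnerabilities directory. */
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/srbds", "r");

	if (!f) {
		perror("srbds");	/* absent on unaffected or older kernels */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("SRBDS state: %s", buf);	/* e.g. "Mitigation: Microcode" */
	fclose(f);
	return 0;
}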
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 74682b8d09b0..043d93cdcaad 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,6 +21,7 @@
#include <linux/smp.h>
#include <linux/io.h>
#include <linux/syscore_ops.h>
+#include <linux/pgtable.h>
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
@@ -35,7 +36,6 @@
#include <asm/vsyscall.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
-#include <asm/pgtable.h>
#include <linux/atomic.h>
#include <asm/proto.h>
#include <asm/setup.h>
@@ -1073,9 +1073,30 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
{}
};
-static bool __init cpu_matches(unsigned long which)
+#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
+ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, steppings, \
+ X86_FEATURE_ANY, issues)
+
+#define SRBDS BIT(0)
+
+static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
+ VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS),
+ {}
+};
+
+static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which)
{
- const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
+ const struct x86_cpu_id *m = x86_match_cpu(table);
return m && !!(m->driver_data & which);
}
@@ -1095,31 +1116,34 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
u64 ia32_cap = x86_read_arch_cap_msr();
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
- if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
+ !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
- if (cpu_matches(NO_SPECULATION))
+ if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- if (!cpu_matches(NO_SPECTRE_V2))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2))
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
- if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
+ !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
if (ia32_cap & ARCH_CAP_IBRS_ALL)
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
- if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
+ !(ia32_cap & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
- if (cpu_matches(MSBDS_ONLY))
+ if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
}
- if (!cpu_matches(NO_SWAPGS))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS))
setup_force_cpu_bug(X86_BUG_SWAPGS);
/*
@@ -1137,7 +1161,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
setup_force_cpu_bug(X86_BUG_TAA);
- if (cpu_matches(NO_MELTDOWN))
+ /*
+ * SRBDS affects CPUs which support RDRAND or RDSEED and are listed
+ * in the vulnerability blacklist.
+ */
+ if ((cpu_has(c, X86_FEATURE_RDRAND) ||
+ cpu_has(c, X86_FEATURE_RDSEED)) &&
+ cpu_matches(cpu_vuln_blacklist, SRBDS))
+ setup_force_cpu_bug(X86_BUG_SRBDS);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
@@ -1146,7 +1179,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
- if (cpu_matches(NO_L1TF))
+ if (cpu_matches(cpu_vuln_whitelist, NO_L1TF))
return;
setup_force_cpu_bug(X86_BUG_L1TF);
@@ -1574,6 +1607,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
mtrr_ap_init();
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
+ update_srbds_msr();
}
static __init int setup_noclflush(char *arg)
@@ -1672,25 +1706,6 @@ void syscall_init(void)
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
-DEFINE_PER_CPU(int, debug_stack_usage);
-DEFINE_PER_CPU(u32, debug_idt_ctr);
-
-void debug_stack_set_zero(void)
-{
- this_cpu_inc(debug_idt_ctr);
- load_current_idt();
-}
-NOKPROBE_SYMBOL(debug_stack_set_zero);
-
-void debug_stack_reset(void)
-{
- if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
- return;
- if (this_cpu_dec_return(debug_idt_ctr) == 0)
- load_current_idt();
-}
-NOKPROBE_SYMBOL(debug_stack_reset);
-
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 37fdefd14f28..fb538fccd24c 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -77,6 +77,7 @@ extern void detect_ht(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
+extern void update_srbds_msr(void);
extern u64 x86_read_arch_cap_msr(void);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 166d7c355896..c25a67a34bd3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/pgtable.h>
#include <linux/string.h>
#include <linux/bitops.h>
@@ -11,7 +12,6 @@
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
-#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
@@ -1142,9 +1142,12 @@ void switch_to_sld(unsigned long tifn)
static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1),
{}
};
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 52de616a8065..99be063fcb1b 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -192,7 +192,12 @@ EXPORT_SYMBOL_GPL(smca_banks);
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
+
+/*
+ * A list of the banks enabled on each logical CPU. Controls which respective
+ * descriptors to initialize later in mce_threshold_create_device().
+ */
+static DEFINE_PER_CPU(unsigned int, bank_map);
/* Map of banks that have more than MCA_MISC0 available. */
static DEFINE_PER_CPU(u32, smca_misc_banks_map);
@@ -381,6 +386,10 @@ static void threshold_restart_bank(void *_tr)
struct thresh_restart *tr = _tr;
u32 hi, lo;
+ /* sysfs write might race against an offline operation */
+ if (this_cpu_read(threshold_banks))
+ return;
+
rdmsr(tr->b->address, lo, hi);
if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
@@ -568,14 +577,19 @@ bool amd_filter_mce(struct mce *m)
{
enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;
- u8 xec = (m->status >> 16) & 0x3F;
/* See Family 17h Models 10h-2Fh Erratum #1114. */
if (c->x86 == 0x17 &&
c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
- bank_type == SMCA_IF && xec == 10)
+ bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10)
return true;
+ /* NB GART TLB error reporting is disabled by default. */
+ if (c->x86 < 0x17) {
+ if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5)
+ return true;
+ }
+
return false;
}
@@ -907,14 +921,13 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
mce_log(&m);
}
-asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
{
- entering_irq();
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
inc_irq_stat(irq_deferred_error_count);
deferred_error_int_vector();
trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
/*
@@ -1016,13 +1029,22 @@ static void log_and_reset_block(struct threshold_block *block)
static void amd_threshold_interrupt(void)
{
struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
unsigned int bank, cpu = smp_processor_id();
+ /*
+ * Validate that the threshold bank has been initialized already. The
+ * handler is installed at boot time, but on a hotplug event the
+ * interrupt might fire before the data has been initialized.
+ */
+ if (!bp)
+ return;
+
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
- first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+ first_block = bp[bank]->blocks;
if (!first_block)
continue;
@@ -1071,7 +1093,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
memset(&tr, 0, sizeof(tr));
tr.b = b;
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+ if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ return -ENODEV;
return size;
}
@@ -1095,7 +1118,8 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
b->threshold_limit = new;
tr.b = b;
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+ if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ return -ENODEV;
return size;
}
@@ -1104,7 +1128,9 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
u32 lo, hi;
- rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
+ /* CPU might be offline by now */
+ if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi))
+ return -ENODEV;
return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
(THRESHOLD_MAX - b->threshold_limit)));
@@ -1209,10 +1235,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
u32 low, high;
int err;
- if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
+ if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS))
return 0;
- if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
+ if (rdmsr_safe(address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
@@ -1247,6 +1273,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
INIT_LIST_HEAD(&b->miscj);
+ /* This is safe as @tb is not visible yet */
if (tb->blocks)
list_add(&b->miscj, &tb->blocks->miscj);
else
@@ -1267,13 +1294,12 @@ recurse:
if (b)
kobject_uevent(&b->kobj, KOBJ_ADD);
- return err;
+ return 0;
out_free:
if (b) {
- kobject_put(&b->kobj);
list_del(&b->miscj);
- kfree(b);
+ kobject_put(&b->kobj);
}
return err;
}
@@ -1302,9 +1328,10 @@ static int __threshold_add_blocks(struct threshold_bank *b)
return err;
}
-static int threshold_create_bank(unsigned int cpu, unsigned int bank)
+static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
+ unsigned int bank)
{
- struct device *dev = per_cpu(mce_device, cpu);
+ struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(bank, NULL);
@@ -1324,7 +1351,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (err)
goto out;
- per_cpu(threshold_banks, cpu)[bank] = b;
+ bp[bank] = b;
refcount_inc(&b->cpus);
err = __threshold_add_blocks(b);
@@ -1339,6 +1366,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
+ /* Associate the bank with the per-CPU MCE device */
b->kobj = kobject_create_and_add(name, &dev->kobj);
if (!b->kobj) {
err = -EINVAL;
@@ -1346,6 +1374,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
}
if (is_shared_bank(bank)) {
+ b->shared = 1;
refcount_set(&b->cpus, 1);
/* nb is already initialized, see above */
@@ -1357,16 +1386,16 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
if (err)
- goto out_free;
-
- per_cpu(threshold_banks, cpu)[bank] = b;
+ goto out_kobj;
+ bp[bank] = b;
return 0;
- out_free:
+out_kobj:
+ kobject_put(b->kobj);
+out_free:
kfree(b);
-
- out:
+out:
return err;
}
@@ -1375,21 +1404,16 @@ static void threshold_block_release(struct kobject *kobj)
kfree(to_block(kobj));
}
-static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
+static void deallocate_threshold_blocks(struct threshold_bank *bank)
{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+ struct threshold_block *pos, *tmp;
- if (!head)
- return;
-
- list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+ list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) {
list_del(&pos->miscj);
kobject_put(&pos->kobj);
}
- kobject_put(&head->blocks->kobj);
+ kobject_put(&bank->blocks->kobj);
}
static void __threshold_remove_blocks(struct threshold_bank *b)
@@ -1403,122 +1427,102 @@ static void __threshold_remove_blocks(struct threshold_bank *b)
kobject_del(&pos->kobj);
}
-static void threshold_remove_bank(unsigned int cpu, int bank)
+static void threshold_remove_bank(struct threshold_bank *bank)
{
struct amd_northbridge *nb;
- struct threshold_bank *b;
- b = per_cpu(threshold_banks, cpu)[bank];
- if (!b)
- return;
+ if (!bank->blocks)
+ goto out_free;
- if (!b->blocks)
- goto free_out;
+ if (!bank->shared)
+ goto out_dealloc;
- if (is_shared_bank(bank)) {
- if (!refcount_dec_and_test(&b->cpus)) {
- __threshold_remove_blocks(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
- return;
- } else {
- /*
- * the last CPU on this node using the shared bank is
- * going away, remove that bank now.
- */
- nb = node_to_amd_nb(amd_get_nb_id(cpu));
- nb->bank4 = NULL;
- }
+ if (!refcount_dec_and_test(&bank->cpus)) {
+ __threshold_remove_blocks(bank);
+ return;
+ } else {
+ /*
+ * The last CPU on this node using the shared bank is going
+ * away, remove that bank now.
+ */
+ nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id()));
+ nb->bank4 = NULL;
}
- deallocate_threshold_block(cpu, bank);
+out_dealloc:
+ deallocate_threshold_blocks(bank);
-free_out:
- kobject_del(b->kobj);
- kobject_put(b->kobj);
- kfree(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
+out_free:
+ kobject_put(bank->kobj);
+ kfree(bank);
}
int mce_threshold_remove_device(unsigned int cpu)
{
- unsigned int bank;
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
- for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- threshold_remove_bank(cpu, bank);
+ if (!bp)
+ return 0;
+
+ /*
+ * Clear the pointer before cleaning up, so that the interrupt won't
+ * touch anything of this.
+ */
+ this_cpu_write(threshold_banks, NULL);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (bp[bank]) {
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
}
- kfree(per_cpu(threshold_banks, cpu));
- per_cpu(threshold_banks, cpu) = NULL;
+ kfree(bp);
return 0;
}
-/* create dir/files for all valid threshold banks */
+/**
+ * mce_threshold_create_device - Create the per-CPU MCE threshold device
+ * @cpu: The plugged in CPU
+ *
+ * Create directories and files for all valid threshold banks.
+ *
+ * This is invoked from the CPU hotplug callback which was installed in
+ * mcheck_init_device(). The invocation happens in context of the hotplug
+ * thread running on @cpu. The callback is invoked on all CPUs which are
+ * online when the callback is installed or during a real hotplug event.
+ */
int mce_threshold_create_device(unsigned int cpu)
{
- unsigned int bank;
+ unsigned int numbanks, bank;
struct threshold_bank **bp;
- int err = 0;
+ int err;
- bp = per_cpu(threshold_banks, cpu);
+ if (!mce_flags.amd_threshold)
+ return 0;
+
+ bp = this_cpu_read(threshold_banks);
if (bp)
return 0;
- bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *),
- GFP_KERNEL);
+ numbanks = this_cpu_read(mce_num_banks);
+ bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL);
if (!bp)
return -ENOMEM;
- per_cpu(threshold_banks, cpu) = bp;
-
- for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ for (bank = 0; bank < numbanks; ++bank) {
+ if (!(this_cpu_read(bank_map) & (1 << bank)))
continue;
- err = threshold_create_bank(cpu, bank);
+ err = threshold_create_bank(bp, cpu, bank);
if (err)
- goto err;
- }
- return err;
-err:
- mce_threshold_remove_device(cpu);
- return err;
-}
-
-static __init int threshold_init_device(void)
-{
- unsigned lcpu = 0;
-
- /* to hit CPUs online before the notifier is up */
- for_each_online_cpu(lcpu) {
- int err = mce_threshold_create_device(lcpu);
-
- if (err)
- return err;
+ goto out_err;
}
+ this_cpu_write(threshold_banks, bp);
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
-
return 0;
+out_err:
+ mce_threshold_remove_device(cpu);
+ return err;
}
-/*
- * there are 3 funcs which need to be _initcalled in a logic sequence:
- * 1. xen_late_init_mcelog
- * 2. mcheck_init_device
- * 3. threshold_init_device
- *
- * xen_late_init_mcelog must register xen_mce_chrdev_device before
- * native mce_chrdev_device registration if running under xen platform;
- *
- * mcheck_init_device should be inited before threshold_init_device to
- * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
- *
- * so we use following _initcalls
- * 1. device_initcall(xen_late_init_mcelog);
- * 2. device_initcall_sync(mcheck_init_device);
- * 3. late_initcall(threshold_init_device);
- *
- * when running under xen, the initcall order is 1,2,3;
- * on baremetal, we skip 1 and we do only 2 and 3.
- */
-late_initcall(threshold_init_device);
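[Editor's aside, not part of the patch: with threshold_init_device() and its late_initcall gone, the per-CPU threshold bank objects are created and torn down only through the CPU hotplug state machine. The cpuhp_setup_state() call, the "x86/mce:online" state name and the mce_cpu_online/mce_cpu_pre_down callbacks come from the mce/core.c hunks further below; the callback bodies here are illustrative, not the kernel's actual code.]

#include <linux/cpuhotplug.h>

/* Illustrative online/offline pair driving the threshold device lifetime. */
static int example_mce_cpu_online(unsigned int cpu)
{
	/* Runs on the hotplugged CPU, so this_cpu_read() is safe inside. */
	return mce_threshold_create_device(cpu);
}

static int example_mce_cpu_pre_down(unsigned int cpu)
{
	/* Tears down what example_mce_cpu_online() created. */
	return mce_threshold_remove_device(cpu);
}

static __init int example_init(void)
{
	int err;

	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
				example_mce_cpu_online,
				example_mce_cpu_pre_down);
	return err < 0 ? err : 0;
}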
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e9265e2f28c9..ce9120c4f740 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -130,7 +130,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
+noinstr void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
@@ -140,12 +140,12 @@ void mce_setup(struct mce *m)
m->cpuid = cpuid_eax(1);
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
- rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+ m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
- rdmsrl(MSR_PPIN, m->ppin);
+ m->ppin = __rdmsr(MSR_PPIN);
else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
- rdmsrl(MSR_AMD_PPIN, m->ppin);
+ m->ppin = __rdmsr(MSR_AMD_PPIN);
m->microcode = boot_cpu_data.microcode;
}
@@ -160,29 +160,17 @@ void mce_log(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_log);
-/*
- * We run the default notifier if we have only the UC, the first and the
- * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
- * notifiers registered on the chain.
- */
-#define NUM_DEFAULT_NOTIFIERS 3
-static atomic_t num_notifiers;
-
void mce_register_decode_chain(struct notifier_block *nb)
{
if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
return;
- atomic_inc(&num_notifiers);
-
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
void mce_unregister_decode_chain(struct notifier_block *nb)
{
- atomic_dec(&num_notifiers);
-
blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -265,6 +253,7 @@ static void __print_mce(struct mce *m)
}
pr_cont("\n");
+
/*
* Note this output is parsed by external tools and old fields
* should not be changed.
@@ -531,6 +520,14 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -546,22 +543,7 @@ bool mce_is_correctable(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
-static bool cec_add_mce(struct mce *m)
-{
- if (!m)
- return false;
-
- /* We eat only correctable DRAM errors with usable addresses. */
- if (mce_is_memory_error(m) &&
- mce_is_correctable(m) &&
- mce_usable_address(m))
- if (!cec_add_elem(m->addr >> PAGE_SHIFT))
- return true;
-
- return false;
-}
-
-static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
@@ -569,9 +551,6 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
if (!m)
return NOTIFY_DONE;
- if (cec_add_mce(m))
- return NOTIFY_STOP;
-
/* Emit the trace record: */
trace_mce_record(m);
@@ -582,9 +561,9 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
return NOTIFY_DONE;
}
-static struct notifier_block first_nb = {
- .notifier_call = mce_first_notifier,
- .priority = MCE_PRIO_FIRST,
+static struct notifier_block early_nb = {
+ .notifier_call = mce_early_notifier,
+ .priority = MCE_PRIO_EARLY,
};
static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
@@ -601,8 +580,10 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
return NOTIFY_DONE;
pfn = mce->addr >> PAGE_SHIFT;
- if (!memory_failure(pfn, 0))
- set_mce_nospec(pfn);
+ if (!memory_failure(pfn, 0)) {
+ set_mce_nospec(pfn, whole_page(mce));
+ mce->kflags |= MCE_HANDLED_UC;
+ }
return NOTIFY_OK;
}
@@ -620,10 +601,8 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
if (!m)
return NOTIFY_DONE;
- if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
- return NOTIFY_DONE;
-
- __print_mce(m);
+ if (mca_cfg.print_all || !m->kflags)
+ __print_mce(m);
return NOTIFY_DONE;
}
@@ -1100,13 +1079,15 @@ static void mce_clear_state(unsigned long *toclear)
* kdump kernel establishing a new #MC handler where a broadcasted MCE
* might not get handled properly.
*/
-static bool __mc_check_crashing_cpu(int cpu)
+static noinstr bool mce_check_crashing_cpu(void)
{
+ unsigned int cpu = smp_processor_id();
+
if (cpu_is_offline(cpu) ||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
- mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
if (mcgstatus & MCG_STATUS_LMCES)
@@ -1114,7 +1095,7 @@ static bool __mc_check_crashing_cpu(int cpu)
}
if (mcgstatus & MCG_STATUS_RIPV) {
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
return true;
}
}
@@ -1200,11 +1181,12 @@ static void kill_me_maybe(struct callback_head *cb)
int flags = MF_ACTION_REQUIRED;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
- if (!(p->mce_status & MCG_STATUS_RIPV))
+
+ if (!p->mce_ripv)
flags |= MF_MUST_KILL;
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
return;
}
@@ -1230,12 +1212,11 @@ static void kill_me_maybe(struct callback_head *cb)
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
*/
-void noinstr do_machine_check(struct pt_regs *regs, long error_code)
+void noinstr do_machine_check(struct pt_regs *regs)
{
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
- int cpu = smp_processor_id();
struct mce m, *final;
char *msg = NULL;
int worst = 0;
@@ -1264,11 +1245,6 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code)
*/
int lmce = 1;
- if (__mc_check_crashing_cpu(cpu))
- return;
-
- nmi_enter();
-
this_cpu_inc(mce_exception_count);
mce_gather_info(&m, regs);
@@ -1356,7 +1332,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code)
sync_core();
if (worst != MCE_AR_SEVERITY && !kill_it)
- goto out_ist;
+ return;
/* Fault was in user mode and we need to take some action */
if ((m.cs & 3) == 3) {
@@ -1364,18 +1340,27 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code)
BUG_ON(!on_thread_stack() || !user_mode(regs));
current->mce_addr = m.addr;
- current->mce_status = m.mcgstatus;
+ current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(&m);
current->mce_kill_me.func = kill_me_maybe;
if (kill_it)
current->mce_kill_me.func = kill_me_now;
task_work_add(current, &current->mce_kill_me, true);
} else {
- if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
- mce_panic("Failed kernel mode recovery", &m, msg);
+ /*
+ * Handle an MCE which has happened in kernel space but from
+ * which the kernel can recover: ex_has_fault_handler() has
+ * already verified that the rIP at which the error happened is
+ * a rIP from which the kernel can recover (by jumping to
+ * recovery code specified in _ASM_EXTABLE_FAULT()) and the
+ * corresponding exception handler which would do that is the
+ * proper one.
+ */
+ if (m.kflags & MCE_IN_KERNEL_RECOV) {
+ if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
+ mce_panic("Failed kernel mode recovery", &m, msg);
+ }
}
-
-out_ist:
- nmi_exit();
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1765,6 +1750,7 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
+ mce_flags.amd_threshold = 1;
if (mce_flags.smca) {
msr_ops.ctl = smca_ctl_reg;
@@ -1902,21 +1888,84 @@ bool filter_mce(struct mce *m)
}
/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
{
+ instrumentation_begin();
pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
smp_processor_id());
+ instrumentation_end();
}
/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
- unexpected_machine_check;
+void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
-dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code)
+static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
- machine_check_vector(regs, error_code);
+ /*
+ * Only required when from kernel mode. See
+ * mce_check_crashing_cpu() for details.
+ */
+ if (machine_check_vector == do_machine_check &&
+ mce_check_crashing_cpu())
+ return;
+
+ nmi_enter();
+ /*
+ * The call targets are marked noinstr, but objtool can't figure
+ * that out because it's an indirect call. Annotate it.
+ */
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ machine_check_vector(regs);
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
+ nmi_exit();
+}
+
+static __always_inline void exc_machine_check_user(struct pt_regs *regs)
+{
+ idtentry_enter_user(regs);
+ instrumentation_begin();
+ machine_check_vector(regs);
+ instrumentation_end();
+ idtentry_exit_user(regs);
}
-NOKPROBE_SYMBOL(do_mce);
+
+#ifdef CONFIG_X86_64
+/* MCE hit kernel mode */
+DEFINE_IDTENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+
+/* The user mode variant. */
+DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_user(regs);
+ local_db_restore(dr7);
+}
+#else
+/* 32bit unified entry point */
+DEFINE_IDTENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
/*
* Called for each booted CPU to set up machine checks.
@@ -1999,6 +2048,7 @@ void mce_disable_bank(int bank)
* mce=no_cmci Disables CMCI
* mce=no_lmce Disables LMCE
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=print_all Print all machine check logs to console
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
* monarchtimeout is how long to wait for other CPUs on machine
@@ -2027,6 +2077,8 @@ static int __init mcheck_enable(char *str)
cfg->lmce_disabled = 1;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
+ else if (!strcmp(str, "print_all"))
+ cfg->print_all = true;
else if (!strcmp(str, "ignore_ce"))
cfg->ignore_ce = true;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
@@ -2049,7 +2101,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
- mce_register_decode_chain(&first_nb);
+ mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
@@ -2293,6 +2345,7 @@ static ssize_t store_int_with_restart(struct device *s,
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
static struct dev_ext_attribute dev_attr_check_interval = {
__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
@@ -2317,6 +2370,7 @@ static struct device_attribute *mce_device_attrs[] = {
#endif
&dev_attr_monarch_timeout.attr,
&dev_attr_dont_log_ce.attr,
+ &dev_attr_print_all.attr,
&dev_attr_ignore_ce.attr,
&dev_attr_cmci_disabled.attr,
NULL
@@ -2489,6 +2543,13 @@ static __init void mce_init_banks(void)
}
}
+/*
+ * When running on XEN, this initcall is ordered against the XEN mcelog
+ * initcall:
+ *
+ * device_initcall(xen_late_init_mcelog);
+ * device_initcall_sync(mcheck_init_device);
+ */
static __init int mcheck_init_device(void)
{
int err;
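For context, the initcall ordering the comment above relies on is that the plain device_initcall() entries of a level run before the device_initcall_sync() entries of the same level; a schematic sketch (illustrative only, not part of the patch):
/*
 * device_initcall(xen_late_init_mcelog);      - level 6,  runs first
 * device_initcall_sync(mcheck_init_device);   - level 6s, runs later
 *
 * do_initcalls() walks the levels in ascending order and runs the
 * "_sync" sub-level after the plain sub-level, so the Xen mcelog
 * notifier is registered before mcheck_init_device() executes.
 */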
@@ -2520,6 +2581,10 @@ static __init int mcheck_init_device(void)
if (err)
goto err_out_mem;
+ /*
+ * Invokes mce_cpu_online() on all CPUs which are online when
+ * the state is installed.
+ */
err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
mce_cpu_online, mce_cpu_pre_down);
if (err < 0)
@@ -2609,7 +2674,6 @@ static int __init mcheck_late_init(void)
static_branch_inc(&mcsafe_key);
mcheck_debugfs_init();
- cec_init();
/*
* Flush out everything that has been logged during early boot, now that
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index d089567a9ce8..43c466020ed5 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -39,6 +39,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val,
struct mce *mce = (struct mce *)data;
unsigned int entry;
+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
mutex_lock(&mce_chrdev_read_mutex);
entry = mcelog->next;
@@ -56,6 +59,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val,
memcpy(mcelog->entry + entry, mce, sizeof(struct mce));
mcelog->entry[entry].finished = 1;
+ mcelog->entry[entry].kflags = 0;
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
@@ -63,6 +67,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val,
unlock:
mutex_unlock(&mce_chrdev_read_mutex);
+ mce->kflags |= MCE_HANDLED_MCELOG;
return NOTIFY_OK;
}
@@ -324,6 +329,7 @@ static const struct file_operations mce_chrdev_ops = {
.write = mce_chrdev_write,
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = no_llseek,
};
@@ -343,7 +349,7 @@ static __init int dev_mcelog_init_device(void)
if (!mcelog)
return -ENOMEM;
- strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
+ memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
mcelog->len = mce_log_len;
mcelog->recordlen = sizeof(struct mce);
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 3413b41b8d55..0593b192eb8f 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -146,9 +146,9 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
regs.cs = m->cs;
pregs = &regs;
}
- /* in mcheck exeception handler, irq will be disabled */
+ /* do_machine_check() expects interrupts disabled -- at least */
local_irq_save(flags);
- do_machine_check(pregs, 0);
+ do_machine_check(pregs);
local_irq_restore(flags);
m->finished = 0;
}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 3b008172ad73..6473070b5da4 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -9,7 +9,7 @@
#include <asm/mce.h>
/* Pointer to the installed machine check handler for this CPU setup. */
-extern void (*machine_check_vector)(struct pt_regs *, long error_code);
+extern void (*machine_check_vector)(struct pt_regs *);
enum severity_level {
MCE_NO_SEVERITY,
@@ -119,6 +119,7 @@ struct mca_config {
bool dont_log_ce;
bool cmci_disabled;
bool ignore_ce;
+ bool print_all;
__u64 lmce_disabled : 1,
disabled : 1,
@@ -148,7 +149,7 @@ struct mce_vendor_flags {
* Recovery. It indicates support for data poisoning in HW and deferred
* error interrupts.
*/
- succor : 1,
+ succor : 1,
/*
* (AMD) SMCA: This bit indicates support for Scalable MCA which expands
@@ -156,9 +157,12 @@ struct mce_vendor_flags {
* banks. Also, to accommodate the new banks and registers, the MCA
* register space is moved to a new MSR range.
*/
- smca : 1,
+ smca : 1,
- __reserved_0 : 61;
+ /* AMD-style error thresholding banks present. */
+ amd_threshold : 1,
+
+ __reserved_0 : 60;
};
extern struct mce_vendor_flags mce_flags;
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 5ee94aa1b766..19e90cae8e97 100644
--- a/arch/x86/kernel/cpu/mce/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -21,12 +21,11 @@
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static void pentium_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
- nmi_enter();
-
+ instrumentation_begin();
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,8 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- nmi_exit();
+ instrumentation_end();
}
/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 87bcdc6dc2f0..e1da619add19 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -213,8 +213,12 @@ static int error_context(struct mce *m)
{
if ((m->cs & 3) == 3)
return IN_USER;
- if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+
+ if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) {
+ m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
+ }
+
return IN_KERNEL;
}
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index f36dc0742085..a7cd2d203ced 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -614,14 +614,13 @@ static void unexpected_thermal_interrupt(void)
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
{
- entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 28812cc15300..6a059a035021 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -21,12 +21,11 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
{
- entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index b3938c195365..9c9f0abd2d7f 100644
--- a/arch/x86/kernel/cpu/mce/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -17,14 +17,12 @@
#include "internal.h"
/* Machine check handler for WinChip C6: */
-static void winchip_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void winchip_machine_check(struct pt_regs *regs)
{
- nmi_enter();
-
+ instrumentation_begin();
pr_emerg("CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- nmi_exit();
+ instrumentation_end();
}
/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index ebf34c7bc8bc..af94f05a5c66 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -23,6 +23,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
+#include <asm/idtentry.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
@@ -40,11 +41,10 @@ static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
static void (*hv_crash_handler)(struct pt_regs *regs);
-__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- entering_irq();
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
@@ -52,7 +52,6 @@ __visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)
if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
ack_APIC_irq();
- exiting_irq();
set_irq_regs(old_regs);
}
@@ -73,19 +72,16 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
* Routines to do per-architecture handling of stimer0
* interrupts when in Direct Mode
*/
-
-__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- entering_irq();
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
ack_APIC_irq();
- exiting_irq();
set_irq_regs(old_regs);
}
@@ -331,17 +327,19 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);
/* Setup the IDT for reenlightenment notifications */
- if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
+ if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) {
alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
- hyperv_reenlightenment_vector);
+ asm_sysvec_hyperv_reenlightenment);
+ }
/* Setup the IDT for stimer0 */
- if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
alloc_intr_gate(HYPERV_STIMER0_VECTOR,
- hv_stimer0_callback_vector);
+ asm_sysvec_hyperv_stimer0);
+ }
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 4bd28b388a1a..0daf2f1cf7a8 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1326,9 +1326,9 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
* pseudo-locked region will still be here on return.
*
* The mutex has to be released temporarily to avoid a potential
- * deadlock with the mm->mmap_sem semaphore which is obtained in
- * the device_create() and debugfs_create_dir() callpath below
- * as well as before the mmap() callback is called.
+ * deadlock with the mm->mmap_lock which is obtained in the
+ * device_create() and debugfs_create_dir() callpath below as well as
+ * before the mmap() callback is called.
*/
mutex_unlock(&rdtgroup_mutex);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index d7cb5ab0d1f0..23b4b61319d3 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -3199,10 +3199,10 @@ int __init rdtgroup_init(void)
* during the debugfs directory creation also &sb->s_type->i_mutex_key
* (the lockdep class of inode->i_rwsem). Other filesystem
* interactions (eg. SyS_getdents) have the lock ordering:
- * &sb->s_type->i_mutex_key --> &mm->mmap_sem
- * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
+ * &sb->s_type->i_mutex_key --> &mm->mmap_lock
+ * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
* is taken, thus creating dependency:
- * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
+ * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
* issues considering the other two lock dependencies.
* By creating the debugfs directory here we avoid a dependency
* that may cause deadlock (even though file operations cannot
diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c
index c0159a7bca6d..8a89c109e20a 100644
--- a/arch/x86/kernel/crash_core_32.c
+++ b/arch/x86/kernel/crash_core_32.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/crash_core.h>
+#include <linux/pgtable.h>
-#include <asm/pgtable.h>
#include <asm/setup.h>
void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c
index 845a57eb4eb7..7d255f882afe 100644
--- a/arch/x86/kernel/crash_core_64.c
+++ b/arch/x86/kernel/crash_core_64.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/crash_core.h>
+#include <linux/pgtable.h>
-#include <asm/pgtable.h>
#include <asm/setup.h>
void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 3793646f0fb5..759d392cbe9f 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -6,12 +6,10 @@
#include <linux/fs.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/traps.h>
-extern void double_fault(void);
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
@@ -22,7 +20,7 @@ static void set_df_gdt_entry(unsigned int cpu);
* Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
* we're running the doublefault task. Cannot return.
*/
-asmlinkage notrace void __noreturn doublefault_shim(void)
+asmlinkage noinstr void __noreturn doublefault_shim(void)
{
unsigned long cr2;
struct pt_regs regs;
@@ -41,7 +39,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void)
* Fill in pt_regs. A downside of doing this in C is that the unwinder
* won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
* won't successfully unwind to the source of the double fault.
- * The main dump from do_double_fault() is fine, though, since it
+ * The main dump from exc_double_fault() is fine, though, since it
* uses these regs directly.
*
* If anyone ever cares, this could be moved to asm.
@@ -71,7 +69,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void)
regs.cx = TSS(cx);
regs.bx = TSS(bx);
- do_double_fault(&regs, 0, cr2);
+ exc_double_fault(&regs, 0, cr2);
/*
* x86_32 does not save the original CR3 anywhere on a task switch.
@@ -85,7 +83,6 @@ asmlinkage notrace void __noreturn doublefault_shim(void)
*/
panic("cannot return from double fault\n");
}
-NOKPROBE_SYMBOL(doublefault_shim);
DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.tss = {
@@ -96,7 +93,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.ldt = 0,
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
- .ip = (unsigned long) double_fault,
+ .ip = (unsigned long) asm_exc_double_fault,
.flags = X86_EFLAGS_FIXED,
.es = __USER_DS,
.cs = __KERNEL_CS,
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ae64ec7f752f..456511b2284e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -65,7 +65,7 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info)
}
static void printk_stack_address(unsigned long address, int reliable,
- char *log_lvl)
+ const char *log_lvl)
{
touch_nmi_watchdog();
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
@@ -160,7 +160,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, char *log_lvl)
+ unsigned long *stack, const char *log_lvl)
{
struct unwind_state state;
struct stack_info stack_info = {0};
@@ -279,7 +279,8 @@ next:
}
}
-void show_stack(struct task_struct *task, unsigned long *sp)
+void show_stack(struct task_struct *task, unsigned long *sp,
+ const char *loglvl)
{
task = task ? : current;
@@ -290,7 +291,7 @@ void show_stack(struct task_struct *task, unsigned long *sp)
if (!sp && task == current)
sp = get_stack_pointer(current, NULL);
- show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
+ show_trace_log_lvl(task, NULL, sp, loglvl);
}
void show_stack_regs(struct pt_regs *regs)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 460ae7f66818..4a94d38cd141 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -22,15 +22,13 @@
static const char * const exception_stack_names[] = {
[ ESTACK_DF ] = "#DF",
[ ESTACK_NMI ] = "NMI",
- [ ESTACK_DB2 ] = "#DB2",
- [ ESTACK_DB1 ] = "#DB1",
[ ESTACK_DB ] = "#DB",
[ ESTACK_MCE ] = "#MC",
};
const char *stack_type_name(enum stack_type type)
{
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
if (type == STACK_TYPE_IRQ)
return "IRQ";
@@ -79,7 +77,6 @@ static const
struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
EPAGERANGE(DF),
EPAGERANGE(NMI),
- EPAGERANGE(DB1),
EPAGERANGE(DB),
EPAGERANGE(MCE),
};
@@ -91,7 +88,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
struct pt_regs *regs;
unsigned int k;
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
/*
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 4d13c57f370a..983cd53ed4c9 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -991,7 +991,15 @@ void __init e820__reserve_setup_data(void)
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+
+ /*
+ * SETUP_EFI is supplied by kexec and does not need to be
+ * reserved.
+ */
+ if (data->type != SETUP_EFI)
+ e820__range_update_kexec(pa_data,
+ sizeof(*data) + data->len,
+ E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
if (data->type == SETUP_INDIRECT &&
((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) {
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 93fbdff2974f..d3c531d3b244 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -8,6 +8,7 @@
#include <linux/pci_regs.h>
#include <linux/pci_ids.h>
#include <linux/errno.h>
+#include <linux/pgtable.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
@@ -15,7 +16,6 @@
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
#include <asm/fixmap.h>
-#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
#include <linux/usb/xhci-dbgp.h>
#include <asm/pci_x86.h>
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 12e7d4406c32..4fe7af58cfe1 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -29,7 +29,7 @@
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/random.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index aa5d28aeb31e..083a3da7bb73 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -12,7 +12,7 @@
#include <asm/frame.h>
.code64
- .section .entry.text, "ax"
+ .section .text, "ax"
#ifdef CONFIG_FRAME_POINTER
/* Save parent and function stack frames (rip and rbp) */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 206a4b6144c2..cbb71c1b574f 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -20,13 +20,13 @@
#include <linux/io.h>
#include <linux/memblock.h>
#include <linux/mem_encrypt.h>
+#include <linux/pgtable.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/setup.h>
#include <asm/desc.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 4bbc770af632..16da4ac01597 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -13,8 +13,8 @@
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
+#include <linux/pgtable.h>
#include <asm/segment.h>
-#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
@@ -29,15 +29,16 @@
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
+#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
#else
#define INTERRUPT_RETURN iretq
+#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
#endif
-/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
+/*
+ * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
- *
*/
-
#define l4_index(x) (((x) >> 39) & 511)
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 4d8d53ed02c9..8cdf29ffd95f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,6 +32,8 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/user.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, cpu_dr7);
@@ -97,6 +99,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
unsigned long *dr7;
int i;
+ lockdep_assert_irqs_disabled();
+
for (i = 0; i < HBP_NUM; i++) {
struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
@@ -115,6 +119,12 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
dr7 = this_cpu_ptr(&cpu_dr7);
*dr7 |= encode_dr7(i, info->len, info->type);
+ /*
+ * Ensure we first write cpu_dr7 before we set the DR7 register.
+ * This ensures an NMI never sees a zero cpu_dr7 while DR7 is non-zero.
+ */
+ barrier();
+
set_debugreg(*dr7, 7);
if (info->mask)
set_dr_addr_mask(info->mask, i);
@@ -134,9 +144,11 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
void arch_uninstall_hw_breakpoint(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
- unsigned long *dr7;
+ unsigned long dr7;
int i;
+ lockdep_assert_irqs_disabled();
+
for (i = 0; i < HBP_NUM; i++) {
struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
@@ -149,12 +161,20 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
return;
- dr7 = this_cpu_ptr(&cpu_dr7);
- *dr7 &= ~__encode_dr7(i, info->len, info->type);
+ dr7 = this_cpu_read(cpu_dr7);
+ dr7 &= ~__encode_dr7(i, info->len, info->type);
- set_debugreg(*dr7, 7);
+ set_debugreg(dr7, 7);
if (info->mask)
set_dr_addr_mask(0, i);
+
+ /*
+ * Ensure the write to cpu_dr7 is after we've set the DR7 register.
+ * This ensures an NMI never sees a zero cpu_dr7 while DR7 is non-zero.
+ */
+ barrier();
+
+ this_cpu_write(cpu_dr7, dr7);
}
static int arch_bp_generic_len(int x86_len)
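To make the ordering requirement above concrete, here is a minimal sketch (illustrative only, not part of the patch) of the writer/reader pairing the barrier() calls establish:
/*
 * Install path:                       Uninstall path:
 *   this_cpu_write(cpu_dr7, dr7);       set_debugreg(dr7, 7);
 *   barrier();                          barrier();
 *   set_debugreg(dr7, 7);               this_cpu_write(cpu_dr7, dr7);
 *
 * A reader in NMI context that re-arms breakpoints from cpu_dr7 thus
 * never observes cpu_dr7 == 0 while the hardware DR7 still has
 * breakpoints enabled; the compiler barrier keeps the per-CPU shadow
 * write and the hardware register write from being reordered.
 */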
@@ -227,10 +247,76 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
}
+/*
+ * Checks whether the range [addr, end] overlaps the area [base, base + size).
+ */
+static inline bool within_area(unsigned long addr, unsigned long end,
+ unsigned long base, unsigned long size)
+{
+ return end >= base && addr < (base + size);
+}
+
+/*
+ * Checks whether the range from addr to end, inclusive, overlaps the fixed
+ * mapped CPU entry area range or other ranges used for CPU entry.
+ */
+static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
+{
+ int cpu;
+
+ /* The CPU entry area is always used for CPU entry */
+ if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
+ CPU_ENTRY_AREA_TOTAL_SIZE))
+ return true;
+
+ for_each_possible_cpu(cpu) {
+ /* The original rw GDT is being used after load_direct_gdt() */
+ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
+ GDT_SIZE))
+ return true;
+
+ /*
+ * cpu_tss_rw is not directly referenced by hardware, but it is
+ * used in the CPU entry code.
+ */
+ if (within_area(addr, end,
+ (unsigned long)&per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct)))
+ return true;
+
+ /*
+ * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry.
+ * If a data breakpoint is set on it, it will cause an unwanted #DB.
+ * Protect the full cpu_tlbstate structure to be sure.
+ */
+ if (within_area(addr, end,
+ (unsigned long)&per_cpu(cpu_tlbstate, cpu),
+ sizeof(struct tlb_state)))
+ return true;
+ }
+
+ return false;
+}
+
static int arch_build_bp_info(struct perf_event *bp,
const struct perf_event_attr *attr,
struct arch_hw_breakpoint *hw)
{
+ unsigned long bp_end;
+
+ bp_end = attr->bp_addr + attr->bp_len - 1;
+ if (bp_end < attr->bp_addr)
+ return -EINVAL;
+
+ /*
+ * Prevent any breakpoint of any type that overlaps the CPU
+ * entry area and data. This protects the IST stacks and also
+ * reduces the chance that we ever find out what happens if
+ * there's a data breakpoint on the GDT, IDT, or TSS.
+ */
+ if (within_cpu_entry(attr->bp_addr, bp_end))
+ return -EINVAL;
+
hw->address = attr->bp_addr;
hw->mask = 0;
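As a side note, within_area() above treats [addr, end] as inclusive and [base, base + size) as half-open. A small standalone illustration of the predicate, with assumed example values (not from the patch):

#include <assert.h>
#include <stdbool.h>

/* Same shape as within_area() above, repeated standalone for testing. */
static bool overlaps(unsigned long addr, unsigned long end,
		     unsigned long base, unsigned long size)
{
	return end >= base && addr < (base + size);
}

int main(void)
{
	/* [0x1000, 0x1007] vs [0x1000, 0x2000): overlaps */
	assert(overlaps(0x1000, 0x1007, 0x1000, 0x1000));
	/* [0x0ff8, 0x0fff] vs [0x1000, 0x2000): ends just below the base */
	assert(!overlaps(0x0ff8, 0x0fff, 0x1000, 0x1000));
	/* [0x1fff, 0x2006] vs [0x1000, 0x2000): starts inside the area */
	assert(overlaps(0x1fff, 0x2006, 0x1000, 0x1000));
	return 0;
}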
@@ -439,7 +525,7 @@ static int hw_breakpoint_handler(struct die_args *args)
{
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
- unsigned long dr7, dr6;
+ unsigned long dr6;
unsigned long *dr6_p;
/* The DR6 value is pointed by args->err */
@@ -454,9 +540,6 @@ static int hw_breakpoint_handler(struct die_args *args)
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
- get_debugreg(dr7, 7);
- /* Disable breakpoints during exception handling */
- set_debugreg(0UL, 7);
/*
* Assert that local interrupts are disabled
* Reset the DRn bits in the virtualized register value.
@@ -513,7 +596,6 @@ static int hw_breakpoint_handler(struct die_args *args)
(dr6 & (~DR_TRAP_BITS)))
rc = NOTIFY_DONE;
- set_debugreg(dr7, 7);
put_cpu();
return rc;
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 519649ddf100..f3c76252247d 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -15,11 +15,11 @@
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
+#include <linux/pgtable.h>
#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/i8259.h>
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 87ef69a72c52..0db21206f2f3 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -4,6 +4,8 @@
*/
#include <linux/interrupt.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/proto.h>
#include <asm/desc.h>
@@ -51,15 +53,23 @@ struct idt_data {
#define TSKG(_vector, _gdt) \
G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc))
+
+static bool idt_setup_done __initdata;
+
/*
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initconst struct idt_data early_idts[] = {
- INTG(X86_TRAP_DB, debug),
- SYSG(X86_TRAP_BP, int3),
+ INTG(X86_TRAP_DB, asm_exc_debug),
+ SYSG(X86_TRAP_BP, asm_exc_int3),
+
#ifdef CONFIG_X86_32
- INTG(X86_TRAP_PF, page_fault),
+ /*
+ * Not possible on 64-bit. See idt_setup_early_pf() for details.
+ */
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
#endif
};
@@ -70,33 +80,33 @@ static const __initconst struct idt_data early_idts[] = {
* set up TSS.
*/
static const __initconst struct idt_data def_idts[] = {
- INTG(X86_TRAP_DE, divide_error),
- INTG(X86_TRAP_NMI, nmi),
- INTG(X86_TRAP_BR, bounds),
- INTG(X86_TRAP_UD, invalid_op),
- INTG(X86_TRAP_NM, device_not_available),
- INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
- INTG(X86_TRAP_TS, invalid_TSS),
- INTG(X86_TRAP_NP, segment_not_present),
- INTG(X86_TRAP_SS, stack_segment),
- INTG(X86_TRAP_GP, general_protection),
- INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
- INTG(X86_TRAP_MF, coprocessor_error),
- INTG(X86_TRAP_AC, alignment_check),
- INTG(X86_TRAP_XF, simd_coprocessor_error),
+ INTG(X86_TRAP_DE, asm_exc_divide_error),
+ INTG(X86_TRAP_NMI, asm_exc_nmi),
+ INTG(X86_TRAP_BR, asm_exc_bounds),
+ INTG(X86_TRAP_UD, asm_exc_invalid_op),
+ INTG(X86_TRAP_NM, asm_exc_device_not_available),
+ INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun),
+ INTG(X86_TRAP_TS, asm_exc_invalid_tss),
+ INTG(X86_TRAP_NP, asm_exc_segment_not_present),
+ INTG(X86_TRAP_SS, asm_exc_stack_segment),
+ INTG(X86_TRAP_GP, asm_exc_general_protection),
+ INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, asm_exc_coprocessor_error),
+ INTG(X86_TRAP_AC, asm_exc_alignment_check),
+ INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error),
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
#else
- INTG(X86_TRAP_DF, double_fault),
+ INTG(X86_TRAP_DF, asm_exc_double_fault),
#endif
- INTG(X86_TRAP_DB, debug),
+ INTG(X86_TRAP_DB, asm_exc_debug),
#ifdef CONFIG_X86_MCE
- INTG(X86_TRAP_MC, &machine_check),
+ INTG(X86_TRAP_MC, asm_exc_machine_check),
#endif
- SYSG(X86_TRAP_OF, overflow),
+ SYSG(X86_TRAP_OF, asm_exc_overflow),
#if defined(CONFIG_IA32_EMULATION)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
#elif defined(CONFIG_X86_32)
@@ -109,95 +119,63 @@ static const __initconst struct idt_data def_idts[] = {
*/
static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_SMP
- INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
- INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
- INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt),
- INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt),
- INTG(REBOOT_VECTOR, reboot_interrupt),
+ INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi),
+ INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single),
+ INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup),
+ INTG(REBOOT_VECTOR, asm_sysvec_reboot),
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
- INTG(THERMAL_APIC_VECTOR, thermal_interrupt),
+ INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal),
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
- INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt),
+ INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold),
#endif
#ifdef CONFIG_X86_MCE_AMD
- INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt),
+ INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error),
#endif
#ifdef CONFIG_X86_LOCAL_APIC
- INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
- INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+ INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
# ifdef CONFIG_HAVE_KVM
- INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
- INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
- INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+ INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
# endif
# ifdef CONFIG_IRQ_WORK
- INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
+ INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work),
# endif
-#ifdef CONFIG_X86_UV
- INTG(UV_BAU_MESSAGE, uv_bau_message_intr1),
-#endif
- INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
- INTG(ERROR_APIC_VECTOR, error_interrupt),
+# ifdef CONFIG_X86_UV
+ INTG(UV_BAU_MESSAGE, asm_sysvec_uv_bau_message),
+# endif
+ INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
+ INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
#endif
};
-#ifdef CONFIG_X86_64
-/*
- * Early traps running on the DEFAULT_STACK because the other interrupt
- * stacks work only after cpu_init().
- */
-static const __initconst struct idt_data early_pf_idts[] = {
- INTG(X86_TRAP_PF, page_fault),
-};
-
-/*
- * Override for the debug_idt. Same as the default, but with interrupt
- * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
- */
-static const __initconst struct idt_data dbg_idts[] = {
- INTG(X86_TRAP_DB, debug),
-};
-#endif
-
-/* Must be page-aligned because the real IDT is used in a fixmap. */
-gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+/* Must be page-aligned because the real IDT is used in the cpu entry area */
+static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
struct desc_ptr idt_descr __ro_after_init = {
- .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
+ .size = IDT_TABLE_SIZE - 1,
.address = (unsigned long) idt_table,
};
-#ifdef CONFIG_X86_64
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
-
-/*
- * The exceptions which use Interrupt stacks. They are setup after
- * cpu_init() when the TSS has been initialized.
- */
-static const __initconst struct idt_data ist_idts[] = {
- ISTG(X86_TRAP_DB, debug, IST_INDEX_DB),
- ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI),
- ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF),
-#ifdef CONFIG_X86_MCE
- ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE),
-#endif
-};
+void load_current_idt(void)
+{
+ lockdep_assert_irqs_disabled();
+ load_idt(&idt_descr);
+}
-/*
- * Override for the debug_idt. Same as the default, but with interrupt
- * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
- */
-const struct desc_ptr debug_idt_descr = {
- .size = IDT_ENTRIES * 16 - 1,
- .address = (unsigned long) debug_idt_table,
-};
+#ifdef CONFIG_X86_F00F_BUG
+bool idt_is_f00f_address(unsigned long address)
+{
+ return ((address - idt_descr.address) >> 3) == 6;
+}
#endif
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
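For reference, the idt_is_f00f_address() check above maps a faulting linear address back to an IDT vector; a short sketch of the arithmetic (illustrative only, not part of the patch):
/*
 * 32-bit gate descriptors are 8 bytes each, so
 *
 *     vector = (address - idt_descr.address) >> 3;
 *
 * Vector 6 is X86_TRAP_UD. With the IDT mapped read-only, the Pentium
 * F00F erratum produces a page fault on that gate instead of locking
 * up the CPU, and the #PF handler can report it as an invalid opcode.
 */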
@@ -214,7 +192,7 @@ static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
#endif
}
-static void
+static __init void
idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
{
gate_desc desc;
@@ -227,7 +205,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy
}
}
-static void set_intr_gate(unsigned int n, const void *addr)
+static __init void set_intr_gate(unsigned int n, const void *addr)
{
struct idt_data data;
@@ -266,6 +244,27 @@ void __init idt_setup_traps(void)
}
#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
+};
+
+/*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initconst struct idt_data ist_idts[] = {
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+};
+
/**
* idt_setup_early_pf - Initialize the idt table with early pagefault handler
*
@@ -273,8 +272,10 @@ void __init idt_setup_traps(void)
* cpu_init() is invoked and sets up TSS. The IST variant is installed
* after that.
*
- * FIXME: Why is 32bit and 64bit installing the PF handler at different
- * places in the early setup code?
+ * Note that X86_64 cannot install the real #PF handler in
+ * idt_setup_early_traps() because the memory initialization needs the #PF
+ * handler from the early_idt_handler_array to initialize the early page
+ * tables.
*/
void __init idt_setup_early_pf(void)
{
@@ -289,17 +290,20 @@ void __init idt_setup_ist_traps(void)
{
idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
}
+#endif
-/**
- * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
- */
-void __init idt_setup_debugidt_traps(void)
+static void __init idt_map_in_cea(void)
{
- memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
-
- idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
+ /*
+ * Set the IDT descriptor to a fixed read-only location in the cpu
+ * entry area, so that the "sidt" instruction will not leak the
+ * location of the kernel, and to defend the IDT against arbitrary
+ * memory write vulnerabilities.
+ */
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
}
-#endif
/**
* idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
@@ -318,11 +322,23 @@ void __init idt_setup_apic_and_irq_gates(void)
#ifdef CONFIG_X86_LOCAL_APIC
for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
- set_bit(i, system_vectors);
+ /*
+ * Don't set the unassigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR);
set_intr_gate(i, entry);
}
#endif
+ /* Map IDT into CPU entry area and reload it. */
+ idt_map_in_cea();
+ load_idt(&idt_descr);
+
+ /* Make the IDT table read only */
+ set_memory_ro((unsigned long)&idt_table, 1);
+
+ idt_setup_done = true;
}
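One observable effect of the remapping done in idt_map_in_cea() above, shown as a hypothetical snippet (not part of the patch): the descriptor handed to the CPU now points at the fixed read-only alias rather than at the kernel image.

	struct desc_ptr p;

	asm volatile ("sidt %0" : "=m" (p));
	/*
	 * p.address is CPU_ENTRY_AREA_RO_IDT, the fixed read-only alias,
	 * not the KASLR-randomized address of idt_table.
	 */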
/**
@@ -352,16 +368,14 @@ void idt_invalidate(void *addr)
load_idt(&idt);
}
-void __init update_intr_gate(unsigned int n, const void *addr)
+void __init alloc_intr_gate(unsigned int n, const void *addr)
{
- if (WARN_ON_ONCE(!test_bit(n, system_vectors)))
+ if (WARN_ON(n < FIRST_SYSTEM_VECTOR))
return;
- set_intr_gate(n, addr);
-}
-void alloc_intr_gate(unsigned int n, const void *addr)
-{
- BUG_ON(n < FIRST_SYSTEM_VECTOR);
- if (!test_and_set_bit(n, system_vectors))
+ if (WARN_ON(idt_setup_done))
+ return;
+
+ if (!WARN_ON(test_and_set_bit(n, system_vectors)))
set_intr_gate(n, addr);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c7965ff429c5..181060247e3c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -13,12 +13,14 @@
#include <linux/export.h>
#include <linux/irq.h>
+#include <asm/irq_stack.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
+#include <asm/traps.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -26,9 +28,6 @@
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
atomic_t irq_err_count;
/*
@@ -224,35 +223,35 @@ u64 arch_irq_stat(void)
return sum;
}
+static __always_inline void handle_irq(struct irq_desc *desc,
+ struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_X86_64))
+ run_on_irqstack_cond(desc->handle_irq, desc, regs);
+ else
+ __handle_irq(desc, regs);
+}
/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
+ * common_interrupt() handles all normal device IRQs (the special SMP
+ * cross-CPU interrupts have their own entry points).
*/
-__visible void __irq_entry do_IRQ(struct pt_regs *regs)
+DEFINE_IDTENTRY_IRQ(common_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc * desc;
- /* high bit used in ret_from_ code */
- unsigned vector = ~regs->orig_ax;
-
- entering_irq();
+ struct irq_desc *desc;
- /* entering_irq() tells RCU that we're not quiescent. Check it. */
+ /* entry code tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
- if (IS_ENABLED(CONFIG_X86_32))
- handle_irq(desc, regs);
- else
- generic_handle_irq_desc(desc);
+ handle_irq(desc, regs);
} else {
ack_APIC_irq();
if (desc == VECTOR_UNUSED) {
- pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
+ pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
} else {
@@ -260,8 +259,6 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs)
}
}
- exiting_irq();
-
set_irq_regs(old_regs);
}
@@ -271,17 +268,16 @@ void (*x86_platform_ipi_callback)(void) = NULL;
/*
* Handler for X86_PLATFORM_IPI_VECTOR.
*/
-__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- entering_ack_irq();
+ ack_APIC_irq();
trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
inc_irq_stat(x86_platform_ipis);
if (x86_platform_ipi_callback)
x86_platform_ipi_callback();
trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
- exiting_irq();
set_irq_regs(old_regs);
}
#endif
@@ -302,41 +298,29 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
/*
* Handler for POSTED_INTERRUPT_VECTOR.
*/
-__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_ipis);
- exiting_irq();
- set_irq_regs(old_regs);
}
/*
* Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
*/
-__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_wakeup_ipis);
kvm_posted_intr_wakeup_handler();
- exiting_irq();
- set_irq_regs(old_regs);
}
/*
* Handler for POSTED_INTERRUPT_NESTED_VECTOR.
*/
-__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_nested_ipis);
- exiting_irq();
- set_irq_regs(old_regs);
}
#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a759ca97cd01..0b79efc87be5 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -148,7 +148,7 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
-void handle_irq(struct irq_desc *desc, struct pt_regs *regs)
+void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
int overflow = check_stack_overflow();
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 6b32ab009c19..1b4fe93a86c5 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,7 @@
#include <linux/sched/task_stack.h>
#include <asm/cpu_entry_area.h>
+#include <asm/irq_stack.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
@@ -70,3 +71,8 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
return map_irq_stack(cpu);
}
+
+void do_softirq_own_stack(void)
+{
+ run_on_irqstack_cond(__do_softirq, NULL, NULL);
+}
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 80bee7695a20..890d4778cd35 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -9,18 +9,18 @@
#include <linux/irq_work.h>
#include <linux/hardirq.h>
#include <asm/apic.h>
+#include <asm/idtentry.h>
#include <asm/trace/irq_vectors.h>
#include <linux/interrupt.h>
#ifdef CONFIG_X86_LOCAL_APIC
-__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work)
{
- ipi_entering_ack_irq();
+ ack_APIC_irq();
trace_irq_work_entry(IRQ_WORK_VECTOR);
inc_irq_stat(apic_irq_work_irqs);
irq_work_run();
trace_irq_work_exit(IRQ_WORK_VECTOR);
- exiting_irq();
}
void arch_irq_work_raise(void)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 5aa523c2d573..dd73135d7cee 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -16,11 +16,11 @@
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
+#include <linux/pgtable.h>
#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/setup.h>
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4d7022a740ab..3bafe1bd4dc7 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -41,11 +41,11 @@
#include <linux/kasan.h>
#include <linux/moduleloader.h>
#include <linux/vmalloc.h>
+#include <linux/pgtable.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
-#include <asm/pgtable.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
@@ -1073,13 +1073,6 @@ NOKPROBE_SYMBOL(kprobe_fault_handler);
int __init arch_populate_kprobe_blacklist(void)
{
- int ret;
-
- ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start,
- (unsigned long)__irqentry_text_end);
- if (ret)
- return ret;
-
return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
(unsigned long)__entry_text_end);
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index ea13f6888284..321c19950285 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -16,11 +16,11 @@
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
#include <linux/frame.h>
+#include <linux/pgtable.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
-#include <asm/pgtable.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
@@ -286,9 +286,7 @@ static int can_optimize(unsigned long paddr)
* stack handling and registers setup.
*/
if (((paddr >= (unsigned long)__entry_text_start) &&
- (paddr < (unsigned long)__entry_text_end)) ||
- ((paddr >= (unsigned long)__irqentry_text_start) &&
- (paddr < (unsigned long)__irqentry_text_end)))
+ (paddr < (unsigned long)__entry_text_end)))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d6f22a3a1f7d..df63786e7bfa 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -21,7 +21,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
-#include <linux/debugfs.h>
#include <linux/nmi.h>
#include <linux/swait.h>
#include <asm/timer.h>
@@ -218,7 +217,7 @@ again:
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
-u32 kvm_read_and_reset_apf_flags(void)
+noinstr u32 kvm_read_and_reset_apf_flags(void)
{
u32 flags = 0;
@@ -230,11 +229,11 @@ u32 kvm_read_and_reset_apf_flags(void)
return flags;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
-NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags);
-bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
u32 reason = kvm_read_and_reset_apf_flags();
+ bool rcu_exit;
switch (reason) {
case KVM_PV_REASON_PAGE_NOT_PRESENT:
@@ -244,6 +243,9 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
return false;
}
+ rcu_exit = idtentry_enter_cond_rcu(regs);
+ instrumentation_begin();
+
/*
* If the host managed to inject an async #PF into an interrupt
* disabled region, then die hard as this is not going to end well
@@ -258,13 +260,13 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
/* Page is swapped out by the host. */
kvm_async_pf_task_wait_schedule(token);
} else {
- rcu_irq_enter();
kvm_async_pf_task_wake(token);
- rcu_irq_exit();
}
+
+ instrumentation_end();
+ idtentry_exit_cond_rcu(regs, rcu_exit);
return true;
}
-NOKPROBE_SYMBOL(__kvm_handle_async_pf);
static void __init paravirt_ops_setup(void)
{
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 84c3ba32f211..8748321c4486 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -8,7 +8,7 @@
*
* Lock order:
* contex.ldt_usr_sem
- * mmap_sem
+ * mmap_lock
* context.lock
*/
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 02bddfc122a4..64b00b0d7fe8 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -13,7 +13,6 @@
#include <linux/gfp.h>
#include <linux/io.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index ad5cdd6a5f23..a29a44a98e5b 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -19,7 +19,6 @@
#include <linux/efi.h>
#include <asm/init.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io_apic.h>
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 23c95a53d20e..34b153cbd4ac 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -22,7 +22,6 @@
#include <asm/text-patching.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/unwind.h>
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bdcc5146de96..2de365f15684 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -303,7 +303,7 @@ NOKPROBE_SYMBOL(unknown_nmi_error);
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static void default_do_nmi(struct pt_regs *regs)
+static noinstr void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
@@ -329,6 +329,9 @@ static void default_do_nmi(struct pt_regs *regs)
__this_cpu_write(last_nmi_rip, regs->ip);
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
@@ -342,7 +345,7 @@ static void default_do_nmi(struct pt_regs *regs)
*/
if (handled > 1)
__this_cpu_write(swallow_nmi, true);
- return;
+ goto out;
}
/*
@@ -374,7 +377,7 @@ static void default_do_nmi(struct pt_regs *regs)
#endif
__this_cpu_add(nmi_stats.external, 1);
raw_spin_unlock(&nmi_reason_lock);
- return;
+ goto out;
}
raw_spin_unlock(&nmi_reason_lock);
@@ -412,8 +415,12 @@ static void default_do_nmi(struct pt_regs *regs)
__this_cpu_add(nmi_stats.swallow, 1);
else
unknown_nmi_error(reason, regs);
+
+out:
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
}
-NOKPROBE_SYMBOL(default_do_nmi);
/*
* NMIs can page fault or hit breakpoints which will cause it to lose
@@ -467,44 +474,9 @@ enum nmi_states {
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
+static DEFINE_PER_CPU(unsigned long, nmi_dr7);
-#ifdef CONFIG_X86_64
-/*
- * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
- * some care, the inner breakpoint will clobber the outer breakpoint's
- * stack.
- *
- * If a breakpoint is being processed, and the debug stack is being
- * used, if an NMI comes in and also hits a breakpoint, the stack
- * pointer will be set to the same fixed address as the breakpoint that
- * was interrupted, causing that stack to be corrupted. To handle this
- * case, check if the stack that was interrupted is the debug stack, and
- * if so, change the IDT so that new breakpoints will use the current
- * stack and not switch to the fixed address. On return of the NMI,
- * switch back to the original IDT.
- */
-static DEFINE_PER_CPU(int, update_debug_stack);
-
-static bool notrace is_debug_stack(unsigned long addr)
-{
- struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
- unsigned long top = CEA_ESTACK_TOP(cs, DB);
- unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
-
- if (__this_cpu_read(debug_stack_usage))
- return true;
- /*
- * Note, this covers the guard page between DB and DB1 as well to
- * avoid two checks. But by all means @addr can never point into
- * the guard page.
- */
- return addr >= bot && addr < top;
-}
-NOKPROBE_SYMBOL(is_debug_stack);
-#endif
-
-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY_RAW(exc_nmi)
{
if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
return;
@@ -517,18 +489,7 @@ do_nmi(struct pt_regs *regs, long error_code)
this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:
-#ifdef CONFIG_X86_64
- /*
- * If we interrupted a breakpoint, it is possible that
- * the nmi handler will have breakpoints too. We need to
- * change the IDT such that breakpoints that happen here
- * continue to use the NMI stack.
- */
- if (unlikely(is_debug_stack(regs->sp))) {
- debug_stack_set_zero();
- this_cpu_write(update_debug_stack, 1);
- }
-#endif
+ this_cpu_write(nmi_dr7, local_db_save());
nmi_enter();
@@ -539,12 +500,7 @@ nmi_restart:
nmi_exit();
-#ifdef CONFIG_X86_64
- if (unlikely(this_cpu_read(update_debug_stack))) {
- debug_stack_reset();
- this_cpu_write(update_debug_stack, 0);
- }
-#endif
+ local_db_restore(this_cpu_read(nmi_dr7));
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2));
@@ -554,7 +510,6 @@ nmi_restart:
if (user_mode(regs))
mds_user_clear_cpu_buffers();
}
-NOKPROBE_SYMBOL(do_nmi);
void stop_nmi(void)
{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5638e4ae2ea6..674a7d66d960 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -13,13 +13,13 @@
#include <linux/bcd.h>
#include <linux/highmem.h>
#include <linux/kprobes.h>
+#include <linux/pgtable.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
-#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 8e3d0347b664..f362ce0d5ac0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -545,28 +545,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
lockdep_assert_irqs_disabled();
- /*
- * If TIF_SSBD is different, select the proper mitigation
- * method. Note that if SSBD mitigation is disabled or permanentely
- * enabled this branch can't be taken because nothing can set
- * TIF_SSBD.
- */
- if (tif_diff & _TIF_SSBD) {
- if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ /* Handle change of TIF_SSBD depending on the mitigation method. */
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
amd_set_ssb_virt_state(tifn);
- } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
amd_set_core_ssb_state(tifn);
- } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
- static_cpu_has(X86_FEATURE_AMD_SSBD)) {
- msr |= ssbd_tif_to_spec_ctrl(tifn);
- updmsr = true;
- }
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ updmsr |= !!(tif_diff & _TIF_SSBD);
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
}
- /*
- * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
- * otherwise avoid the MSR write.
- */
+ /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
if (IS_ENABLED(CONFIG_SMP) &&
static_branch_unlikely(&switch_to_cond_stibp)) {
updmsr |= !!(tif_diff & _TIF_SPEC_IB);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 538d4e8d6589..acfd6d2a0cbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -39,7 +39,6 @@
#include <linux/kdebug.h>
#include <linux/syscalls.h>
-#include <asm/pgtable.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0c169a5687e1..9a97415b2139 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -40,7 +40,6 @@
#include <linux/ftrace.h>
#include <linux/syscalls.h>
-#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f0e1ddbc2fd7..44130588987f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -28,7 +28,6 @@
#include <linux/nospec.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 3ca43be4f9cf..0ec7ced727fe 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -11,13 +11,13 @@
#include <linux/tboot.h>
#include <linux/delay.h>
#include <linux/frame.h>
+#include <linux/pgtable.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
-#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
@@ -197,6 +197,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
},
},
+ { /* Handle problems with rebooting on Apple MacBook6,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook6,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"),
+ },
+ },
{ /* Handle problems with rebooting on Apple MacBookPro5 */
.callback = set_pci_reboot,
.ident = "Apple MacBookPro5",
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index b8d4e9c3c070..eff4ce3b10da 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -27,6 +27,7 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
+#include <asm/idtentry.h>
#include <asm/nmi.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
@@ -130,13 +131,11 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
-
-asmlinkage __visible void smp_reboot_interrupt(void)
+DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
- ipi_entering_ack_irq();
+ ack_APIC_irq();
cpu_emergency_vmxoff();
stop_this_cpu(NULL);
- irq_exit();
}
static int register_stop_handler(void)
@@ -221,47 +220,33 @@ static void native_stop_other_cpus(int wait)
/*
* Reschedule call back. KVM uses this interrupt to force a cpu out of
- * guest mode
+ * guest mode.
*/
-__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
{
ack_APIC_irq();
+ trace_reschedule_entry(RESCHEDULE_VECTOR);
inc_irq_stat(irq_resched_count);
- kvm_set_cpu_l1tf_flush_l1d();
-
- if (trace_resched_ipi_enabled()) {
- /*
- * scheduler_ipi() might call irq_enter() as well, but
- * nested calls are fine.
- */
- irq_enter();
- trace_reschedule_entry(RESCHEDULE_VECTOR);
- scheduler_ipi();
- trace_reschedule_exit(RESCHEDULE_VECTOR);
- irq_exit();
- return;
- }
scheduler_ipi();
+ trace_reschedule_exit(RESCHEDULE_VECTOR);
}
-__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
{
- ipi_entering_ack_irq();
+ ack_APIC_irq();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
inc_irq_stat(irq_call_count);
generic_smp_call_function_interrupt();
trace_call_function_exit(CALL_FUNCTION_VECTOR);
- exiting_irq();
}
-__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r)
+DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
{
- ipi_entering_ack_irq();
+ ack_APIC_irq();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
inc_irq_stat(irq_call_count);
generic_smp_call_function_single_interrupt();
trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
- exiting_irq();
}
static int __init nonmi_ipi_setup(char *str)
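
The conversions above can drop ipi_entering_ack_irq()/exiting_irq() and the explicit kvm_set_cpu_l1tf_flush_l1d() call because the DEFINE_IDTENTRY_SYSVEC wrapper now performs the entry/exit bookkeeping before the body runs. As a rough orientation, the macro is expected to expand along these lines (simplified sketch, not the authoritative definition in <asm/idtentry.h>):

#define DEFINE_IDTENTRY_SYSVEC(func)					\
static void __##func(struct pt_regs *regs);				\
									\
__visible noinstr void func(struct pt_regs *regs)			\
{									\
	bool rcu_exit = idtentry_enter_cond_rcu(regs);			\
									\
	instrumentation_begin();					\
	irq_enter_rcu();						\
	kvm_set_cpu_l1tf_flush_l1d();					\
	run_on_irqstack_cond(__##func, regs, regs);			\
	irq_exit_rcu();							\
	instrumentation_end();						\
	idtentry_exit_cond_rcu(regs, rcu_exit);				\
}									\
									\
static noinline void __##func(struct pt_regs *regs)

The body written after the macro (e.g. sysvec_reboot above) becomes the inner __sysvec_reboot() and only needs to ack the APIC and do its work.
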
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2467f3dd35d3..ffbd9a3d78d8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -55,6 +55,7 @@
#include <linux/gfp.h>
#include <linux/cpuidle.h>
#include <linux/numa.h>
+#include <linux/pgtable.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -63,7 +64,6 @@
#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/mwait.h>
diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c
index ab03fede1422..f8d65c99feb8 100644
--- a/arch/x86/kernel/sys_ia32.c
+++ b/arch/x86/kernel/sys_ia32.c
@@ -135,26 +135,30 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
typeof(ubuf->st_gid) gid = 0;
SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid));
- if (!access_ok(ubuf, sizeof(struct stat64)) ||
- __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
- __put_user(stat->ino, &ubuf->__st_ino) ||
- __put_user(stat->ino, &ubuf->st_ino) ||
- __put_user(stat->mode, &ubuf->st_mode) ||
- __put_user(stat->nlink, &ubuf->st_nlink) ||
- __put_user(uid, &ubuf->st_uid) ||
- __put_user(gid, &ubuf->st_gid) ||
- __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
- __put_user(stat->size, &ubuf->st_size) ||
- __put_user(stat->atime.tv_sec, &ubuf->st_atime) ||
- __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
- __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) ||
- __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
- __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) ||
- __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
- __put_user(stat->blksize, &ubuf->st_blksize) ||
- __put_user(stat->blocks, &ubuf->st_blocks))
+ if (!user_write_access_begin(ubuf, sizeof(struct stat64)))
return -EFAULT;
+ unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault);
+ unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault);
+ unsafe_put_user(stat->ino, &ubuf->st_ino, Efault);
+ unsafe_put_user(stat->mode, &ubuf->st_mode, Efault);
+ unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault);
+ unsafe_put_user(uid, &ubuf->st_uid, Efault);
+ unsafe_put_user(gid, &ubuf->st_gid, Efault);
+ unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault);
+ unsafe_put_user(stat->size, &ubuf->st_size, Efault);
+ unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault);
+ unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault);
+ unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault);
+ unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault);
+ unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault);
+ unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault);
+ unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault);
+ unsafe_put_user(stat->blocks, &ubuf->st_blocks, Efault);
+ user_access_end();
return 0;
+Efault:
+ user_write_access_end();
+ return -EFAULT;
}
COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename,
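
The cp_stat64() rewrite replaces one long "||" chain of __put_user() calls with a single user_write_access_begin() check followed by unchecked unsafe_put_user() stores and a shared error label, keeping the SMAP open/close to one pair instead of one per field. A minimal, hedged illustration of the same pattern on a made-up structure (names are placeholders, not from this patch):

struct example_u {
	u32 a;
	u64 b;
};

static int copy_example_to_user(struct example_u __user *ubuf, u32 a, u64 b)
{
	if (!user_write_access_begin(ubuf, sizeof(*ubuf)))
		return -EFAULT;
	unsafe_put_user(a, &ubuf->a, Efault);	/* no per-store access_ok() */
	unsafe_put_user(b, &ubuf->b, Efault);
	user_write_access_end();
	return 0;
Efault:
	user_write_access_end();
	return -EFAULT;
}
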
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index b2942b2dbfcf..992fb1415c0f 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -23,7 +23,6 @@
#include <asm/realmode.h>
#include <asm/processor.h>
#include <asm/bootparam.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/swiotlb.h>
#include <asm/fixmap.h>
@@ -94,7 +93,7 @@ static struct mm_struct tboot_mm = {
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
- .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
};
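
The tboot_mm hunk tracks the mmap_sem -> mmap_lock rename done elsewhere in this series; MMAP_LOCK_INITIALIZER() hides both the field name and its initializer. Per <linux/mmap_lock.h>, it is expected to amount to roughly:

#define MMAP_LOCK_INITIALIZER(name) \
	.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

Note that the trailing comma lives inside the macro, which is why the use site above has none of its own.
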
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 371a6b348e44..e42faa792c07 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -25,10 +25,6 @@
#include <asm/hpet.h>
#include <asm/time.h>
-#ifdef CONFIG_X86_64
-__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES;
-#endif
-
unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 496748ed266a..fcfc077afe2d 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -25,20 +25,3 @@ void trace_pagefault_unreg(void)
{
static_branch_dec(&trace_pagefault_key);
}
-
-#ifdef CONFIG_SMP
-
-DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key);
-
-int trace_resched_ipi_reg(void)
-{
- static_branch_inc(&trace_resched_ipi_key);
- return 0;
-}
-
-void trace_resched_ipi_unreg(void)
-{
- static_branch_dec(&trace_resched_ipi_key);
-}
-
-#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4cc541051994..af75109485c2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -97,24 +97,6 @@ int is_valid_bugaddr(unsigned long addr)
return ud == INSN_UD0 || ud == INSN_UD2;
}
-int fixup_bug(struct pt_regs *regs, int trapnr)
-{
- if (trapnr != X86_TRAP_UD)
- return 0;
-
- switch (report_bug(regs->ip, regs)) {
- case BUG_TRAP_TYPE_NONE:
- case BUG_TRAP_TYPE_BUG:
- break;
-
- case BUG_TRAP_TYPE_WARN:
- regs->ip += LEN_UD2;
- return 1;
- }
-
- return 0;
-}
-
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
struct pt_regs *regs, long error_code)
@@ -145,7 +127,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
* process no chance to handle the signal and notice the
* kernel fault information, so that won't result in polluting
* the information about previously queued, but not yet
- * delivered, faults. See also do_general_protection below.
+ * delivered, faults. See also exc_general_protection below.
*/
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
@@ -190,41 +172,119 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- /*
- * WARN*()s end up here; fix them up before we call the
- * notifier chain.
- */
- if (!user_mode(regs) && fixup_bug(regs, trapnr))
- return;
-
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
cond_local_irq_enable(regs);
do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
+ cond_local_irq_disable(regs);
}
}
-#define IP ((void __user *)uprobe_get_trap_addr(regs))
-#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \
+/*
+ * POSIX requires that the address of the faulting instruction be provided
+ * for SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
+ *
+ * This address is usually regs->ip, but when an uprobe moved the code out
+ * of line then regs->ip points to the XOL code which would confuse
+ * anything which analyzes the fault address vs. the unmodified binary. If
+ * a trap happened in XOL code then uprobe maps regs->ip back to the
+ * original instruction address.
+ */
+static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
+{
+ return (void __user *)uprobe_get_trap_addr(regs);
}
-DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error)
-DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow)
-DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op)
-DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS)
-DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
-DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment)
-#undef IP
+DEFINE_IDTENTRY(exc_divide_error)
+{
+	do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
+ FPE_INTDIV, error_get_trap_addr(regs));
+}
-dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_overflow)
{
- char *str = "alignment check";
+ do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
+}
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+#ifdef CONFIG_X86_F00F_BUG
+void handle_invalid_op(struct pt_regs *regs)
+#else
+static inline void handle_invalid_op(struct pt_regs *regs)
+#endif
+{
+ do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
+ ILL_ILLOPN, error_get_trap_addr(regs));
+}
+
+DEFINE_IDTENTRY_RAW(exc_invalid_op)
+{
+ bool rcu_exit;
+
+ /*
+ * Handle BUG/WARN like NMIs instead of like normal idtentries:
+ * if we bugged/warned in a bad RCU context, for example, the last
+ * thing we want is to BUG/WARN again in the idtentry code, ad
+ * infinitum.
+ */
+ if (!user_mode(regs) && is_valid_bugaddr(regs->ip)) {
+ enum bug_trap_type type;
+
+ nmi_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ type = report_bug(regs->ip, regs);
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
+ nmi_exit();
+
+ if (type == BUG_TRAP_TYPE_WARN) {
+ /* Skip the ud2. */
+ regs->ip += LEN_UD2;
+ return;
+ }
+
+ /*
+ * Else, if this was a BUG and report_bug returns or if this
+ * was just a normal #UD, we want to continue onward and
+ * crash.
+ */
+ }
+
+ rcu_exit = idtentry_enter_cond_rcu(regs);
+ instrumentation_begin();
+ handle_invalid_op(regs);
+ instrumentation_end();
+ idtentry_exit_cond_rcu(regs, rcu_exit);
+}
+
+DEFINE_IDTENTRY(exc_coproc_segment_overrun)
+{
+ do_error_trap(regs, 0, "coprocessor segment overrun",
+ X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
+{
+ do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
+ 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
+{
+ do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
+ SIGBUS, 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
+{
+ do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
+ 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
+{
+ char *str = "alignment check";
if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
return;
@@ -271,12 +331,19 @@ __visible void __noreturn handle_stack_overflow(const char *message,
* from the TSS. Returning is, in principle, okay, but changes to regs will
* be lost. If, for some reason, we need to return to a context with modified
* regs, the shim code could be adjusted to synchronize the registers.
+ *
+ * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs
+ * to be read before doing anything else.
*/
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
+DEFINE_IDTENTRY_DF(exc_double_fault)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+ unsigned long address = read_cr2();
+#endif
+
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -299,6 +366,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
regs->ip == (unsigned long)native_irq_return_iret)
{
struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ unsigned long *p = (unsigned long *)regs->sp;
/*
* regs->sp points to the failing IRET frame on the
@@ -306,7 +374,11 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
* in gpregs->ss through gpregs->ip.
*
*/
- memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+ gpregs->ip = p[0];
+ gpregs->cs = p[1];
+ gpregs->flags = p[2];
+ gpregs->sp = p[3];
+ gpregs->ss = p[4];
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
@@ -320,7 +392,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
* which is what the stub expects, given that the faulting
* RIP will be the IRET instruction.
*/
- regs->ip = (unsigned long)general_protection;
+ regs->ip = (unsigned long)asm_exc_general_protection;
regs->sp = (unsigned long)&gpregs->orig_ax;
return;
@@ -328,6 +400,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
#endif
nmi_enter();
+ instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
@@ -371,27 +444,31 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
* stack even if the actual trigger for the double fault was
* something else.
*/
- if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
- handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+ if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
+ handle_stack_overflow("kernel stack overflow (double-fault)",
+ regs, address);
+ }
#endif
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
die("double fault", regs, error_code);
panic("Machine halted.");
+ instrumentation_end();
}
-dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_bounds)
{
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- if (notify_die(DIE_TRAP, "bounds", regs, error_code,
+ if (notify_die(DIE_TRAP, "bounds", regs, 0,
X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
return;
cond_local_irq_enable(regs);
if (!user_mode(regs))
- die("bounds", regs, error_code);
+ die("bounds", regs, 0);
+
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);
- do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
+ cond_local_irq_disable(regs);
}
enum kernel_gp_hint {
@@ -438,7 +515,7 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
#define GPFSTR "general protection fault"
-dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
@@ -446,17 +523,17 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
unsigned long gp_addr;
int ret;
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
if (static_cpu_has(X86_FEATURE_UMIP)) {
if (user_mode(regs) && fixup_umip_exception(regs))
- return;
+ goto exit;
}
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ local_irq_disable();
return;
}
@@ -468,12 +545,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV);
-
- return;
+ goto exit;
}
if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- return;
+ goto exit;
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -485,11 +561,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
if (!preemptible() &&
kprobe_running() &&
kprobe_fault_handler(regs, X86_TRAP_GP))
- return;
+ goto exit;
ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
if (ret == NOTIFY_STOP)
- return;
+ goto exit;
if (error_code)
snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
@@ -511,47 +587,74 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
die_addr(desc, regs, error_code, gp_addr);
+exit:
+ cond_local_irq_disable(regs);
}
-NOKPROBE_SYMBOL(do_general_protection);
-dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+static bool do_int3(struct pt_regs *regs)
{
- if (poke_int3_handler(regs))
- return;
-
- /*
- * Unlike any other non-IST entry, we can be called from pretty much
- * any location in the kernel through kprobes -- text_poke() will most
- * likely be handled by poke_int3_handler() above. This means this
- * handler is effectively NMI-like.
- */
- if (!user_mode(regs))
- nmi_enter();
+ int res;
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
- if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
- SIGTRAP) == NOTIFY_STOP)
- goto exit;
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ return true;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
#ifdef CONFIG_KPROBES
if (kprobe_int3_handler(regs))
- goto exit;
+ return true;
#endif
+ res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
- if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
- SIGTRAP) == NOTIFY_STOP)
- goto exit;
+ return res == NOTIFY_STOP;
+}
+
+static void do_int3_user(struct pt_regs *regs)
+{
+ if (do_int3(regs))
+ return;
cond_local_irq_enable(regs);
- do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
cond_local_irq_disable(regs);
+}
-exit:
- if (!user_mode(regs))
+DEFINE_IDTENTRY_RAW(exc_int3)
+{
+ /*
+ * poke_int3_handler() is completely self contained code; it does (and
+ * must) *NOT* call out to anything, lest it hits upon yet another
+ * INT3.
+ */
+ if (poke_int3_handler(regs))
+ return;
+
+ /*
+ * idtentry_enter_user() uses static_branch_{,un}likely() and therefore
+ * can trigger INT3, hence poke_int3_handler() must be done
+ * before. If the entry came from kernel mode, then use nmi_enter()
+ * because the INT3 could have been hit in any context including
+ * NMI.
+ */
+ if (user_mode(regs)) {
+ idtentry_enter_user(regs);
+ instrumentation_begin();
+ do_int3_user(regs);
+ instrumentation_end();
+ idtentry_exit_user(regs);
+ } else {
+ nmi_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ if (!do_int3(regs))
+ die("int3", regs, 0);
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
nmi_exit();
+ }
}
-NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
@@ -559,21 +662,20 @@ NOKPROBE_SYMBOL(do_int3);
* to switch to the normal thread stack if the interrupted code was in
* user mode. The actual stack switch is done in entry_64.S
*/
-asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
}
-NOKPROBE_SYMBOL(sync_regs);
struct bad_iret_stack {
void *error_entry_ret;
struct pt_regs regs;
};
-asmlinkage __visible notrace
+asmlinkage __visible noinstr
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
{
/*
@@ -584,19 +686,21 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
- struct bad_iret_stack *new_stack =
- (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ struct bad_iret_stack tmp, *new_stack =
+ (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
- /* Copy the IRET target to the new stack. */
- memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+ /* Copy the IRET target to the temporary storage. */
+ memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
/* Copy the remainder of the stack from the current stack. */
- memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+ memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
+
+ /* Update the entry stack */
+ memcpy(new_stack, &tmp, sizeof(tmp));
BUG_ON(!user_mode(&new_stack->regs));
return new_stack;
}
-NOKPROBE_SYMBOL(fixup_bad_iret);
#endif
static bool is_sysenter_singlestep(struct pt_regs *regs)
@@ -622,6 +726,43 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
#endif
}
+static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
+{
+ /*
+ * Disable breakpoints during exception handling; recursive exceptions
+ * are exceedingly 'fun'.
+ *
+ * Since this function is NOKPROBE, and that also applies to
+ * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
+ * HW_BREAKPOINT_W on our stack)
+ *
+	 * Entry text is excluded for HW_BP_X, and cpu_entry_area, which
+	 * includes the entry stack, is excluded for everything.
+ */
+ *dr7 = local_db_save();
+
+ /*
+ * The Intel SDM says:
+ *
+ * Certain debug exceptions may clear bits 0-3. The remaining
+ * contents of the DR6 register are never cleared by the
+ * processor. To avoid confusion in identifying debug
+ * exceptions, debug handlers should clear the register before
+ * returning to the interrupted task.
+ *
+ * Keep it simple: clear DR6 immediately.
+ */
+ get_debugreg(*dr6, 6);
+ set_debugreg(0, 6);
+ /* Filter out all the reserved bits which are preset to 1 */
+ *dr6 &= ~DR6_RESERVED;
+}
+
+static __always_inline void debug_exit(unsigned long dr7)
+{
+ local_db_restore(dr7);
+}
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -646,86 +787,54 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
*
* May run on IST stack.
*/
-dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
+static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
{
struct task_struct *tsk = current;
- int user_icebp = 0;
- unsigned long dr6;
+ bool user_icebp;
int si_code;
- nmi_enter();
-
- get_debugreg(dr6, 6);
- /*
- * The Intel SDM says:
- *
- * Certain debug exceptions may clear bits 0-3. The remaining
- * contents of the DR6 register are never cleared by the
- * processor. To avoid confusion in identifying debug
- * exceptions, debug handlers should clear the register before
- * returning to the interrupted task.
- *
- * Keep it simple: clear DR6 immediately.
- */
- set_debugreg(0, 6);
-
- /* Filter out all the reserved bits which are preset to 1 */
- dr6 &= ~DR6_RESERVED;
-
/*
* The SDM says "The processor clears the BTF flag when it
* generates a debug exception." Clear TIF_BLOCKSTEP to keep
* TIF_BLOCKSTEP in sync with the hardware BTF flag.
*/
- clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+ clear_thread_flag(TIF_BLOCKSTEP);
- if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
- is_sysenter_singlestep(regs))) {
- dr6 &= ~DR_STEP;
- if (!dr6)
- goto exit;
- /*
- * else we might have gotten a single-step trap and hit a
- * watchpoint at the same time, in which case we should fall
- * through and handle the watchpoint.
- */
- }
+ /*
+ * If DR6 is zero, no point in trying to handle it. The kernel is
+ * not using INT1.
+ */
+ if (!user && !dr6)
+ return;
/*
* If dr6 has no reason to give us about the origin of this trap,
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
*/
- if (!dr6 && user_mode(regs))
- user_icebp = 1;
+ user_icebp = user && !dr6;
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
#ifdef CONFIG_KPROBES
- if (kprobe_debug_handler(regs))
- goto exit;
+ if (kprobe_debug_handler(regs)) {
+ return;
+ }
#endif
- if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
- SIGTRAP) == NOTIFY_STOP)
- goto exit;
-
- /*
- * Let others (NMI) know that the debug stack is in use
- * as we may switch to the interrupt stack.
- */
- debug_stack_usage_inc();
+ if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0,
+ SIGTRAP) == NOTIFY_STOP) {
+ return;
+ }
/* It's safe to allow irq's after DR6 has been saved */
cond_local_irq_enable(regs);
if (v8086_mode(regs)) {
- handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
- X86_TRAP_DB);
- cond_local_irq_disable(regs);
- debug_stack_usage_dec();
- goto exit;
+ handle_vm86_trap((struct kernel_vm86_regs *) regs, 0,
+ X86_TRAP_DB);
+ goto out;
}
if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
@@ -739,23 +848,91 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
}
+
si_code = get_si_code(tsk->thread.debugreg6);
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
- send_sigtrap(regs, error_code, si_code);
+ send_sigtrap(regs, 0, si_code);
+
+out:
cond_local_irq_disable(regs);
- debug_stack_usage_dec();
+}
-exit:
+static __always_inline void exc_debug_kernel(struct pt_regs *regs,
+ unsigned long dr6)
+{
+ nmi_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+
+ /*
+ * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
+ * watchpoint at the same time then that will still be handled.
+ */
+ if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
+ dr6 &= ~DR_STEP;
+
+ handle_debug(regs, dr6, false);
+
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
nmi_exit();
}
-NOKPROBE_SYMBOL(do_debug);
+
+static __always_inline void exc_debug_user(struct pt_regs *regs,
+ unsigned long dr6)
+{
+ idtentry_enter_user(regs);
+ instrumentation_begin();
+
+ handle_debug(regs, dr6, true);
+ instrumentation_end();
+ idtentry_exit_user(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* IST stack entry */
+DEFINE_IDTENTRY_DEBUG(exc_debug)
+{
+ unsigned long dr6, dr7;
+
+ debug_enter(&dr6, &dr7);
+ exc_debug_kernel(regs, dr6);
+ debug_exit(dr7);
+}
+
+/* User entry, runs on regular task stack */
+DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
+{
+ unsigned long dr6, dr7;
+
+ debug_enter(&dr6, &dr7);
+ exc_debug_user(regs, dr6);
+ debug_exit(dr7);
+}
+#else
+/* 32 bit does not have separate entry points. */
+DEFINE_IDTENTRY_DEBUG(exc_debug)
+{
+ unsigned long dr6, dr7;
+
+ debug_enter(&dr6, &dr7);
+
+ if (user_mode(regs))
+ exc_debug_user(regs, dr6);
+ else
+ exc_debug_kernel(regs, dr6);
+
+ debug_exit(dr7);
+}
+#endif
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
-static void math_error(struct pt_regs *regs, int error_code, int trapnr)
+static void math_error(struct pt_regs *regs, int trapnr)
{
struct task_struct *task = current;
struct fpu *fpu = &task->thread.fpu;
@@ -766,16 +943,16 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
cond_local_irq_enable(regs);
if (!user_mode(regs)) {
- if (fixup_exception(regs, trapnr, error_code, 0))
- return;
+ if (fixup_exception(regs, trapnr, 0, 0))
+ goto exit;
- task->thread.error_code = error_code;
+ task->thread.error_code = 0;
task->thread.trap_nr = trapnr;
- if (notify_die(DIE_TRAP, str, regs, error_code,
- trapnr, SIGFPE) != NOTIFY_STOP)
- die(str, regs, error_code);
- return;
+ if (notify_die(DIE_TRAP, str, regs, 0, trapnr,
+ SIGFPE) != NOTIFY_STOP)
+ die(str, regs, 0);
+ goto exit;
}
/*
@@ -784,32 +961,37 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
fpu__save(fpu);
task->thread.trap_nr = trapnr;
- task->thread.error_code = error_code;
+ task->thread.error_code = 0;
si_code = fpu__exception_code(fpu, trapnr);
/* Retry when we get spurious exceptions: */
if (!si_code)
- return;
+ goto exit;
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs));
+exit:
+ cond_local_irq_disable(regs);
}
-dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_coprocessor_error)
{
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- math_error(regs, error_code, X86_TRAP_MF);
+ math_error(regs, X86_TRAP_MF);
}
-dotraplinkage void
-do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_simd_coprocessor_error)
{
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- math_error(regs, error_code, X86_TRAP_XF);
+ if (IS_ENABLED(CONFIG_X86_INVD_BUG)) {
+ /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */
+ if (!static_cpu_has(X86_FEATURE_XMM)) {
+ __exc_general_protection(regs, 0);
+ return;
+ }
+ }
+ math_error(regs, X86_TRAP_XF);
}
-dotraplinkage void
-do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
{
/*
* This addresses a Pentium Pro Erratum:
@@ -832,13 +1014,10 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
*/
}
-dotraplinkage void
-do_device_not_available(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_device_not_available)
{
unsigned long cr0 = read_cr0();
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
-
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
@@ -847,6 +1026,8 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
+
+ cond_local_irq_disable(regs);
return;
}
#endif
@@ -861,22 +1042,20 @@ do_device_not_available(struct pt_regs *regs, long error_code)
* to kill the task than getting stuck in a never-ending
* loop of #NM faults.
*/
- die("unexpected #NM exception", regs, error_code);
+ die("unexpected #NM exception", regs, 0);
}
}
-NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
-dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY_SW(iret_error)
{
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
local_irq_enable();
-
- if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+ if (notify_die(DIE_TRAP, "iret exception", regs, 0,
X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
- do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
ILL_BADSTK, (void __user *)NULL);
}
+ local_irq_disable();
}
#endif
@@ -888,20 +1067,9 @@ void __init trap_init(void)
idt_setup_traps();
/*
- * Set the IDT descriptor to a fixed read-only location, so that the
- * "sidt" instruction will not leak the location of the kernel, and
- * to defend the IDT against arbitrary memory write vulnerabilities.
- * It will be reloaded in cpu_init() */
- cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
- PAGE_KERNEL_RO);
- idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
-
- /*
* Should be a barrier for any external CPU state:
*/
cpu_init();
idt_setup_ist_traps();
-
- idt_setup_debugidt_traps();
}
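
Most of the traps.c conversions above lean on the DEFINE_IDTENTRY*() family to supply the boilerplate that used to be open-coded in every handler (dotraplinkage, the RCU_LOCKDEP_WARN() check, NOKPROBE annotations, error code plumbing). As a rough orientation, the plain variant is assumed to expand along these lines (simplified sketch; the authoritative definitions are in <asm/idtentry.h>):

#define DEFINE_IDTENTRY(func)						\
static __always_inline void __##func(struct pt_regs *regs);		\
									\
__visible noinstr void func(struct pt_regs *regs)			\
{									\
	bool rcu_exit = idtentry_enter_cond_rcu(regs);			\
									\
	instrumentation_begin();					\
	__##func(regs);							\
	instrumentation_end();						\
	idtentry_exit_cond_rcu(regs, rcu_exit);				\
}									\
									\
static __always_inline void __##func(struct pt_regs *regs)

This is also why the handlers now pair every cond_local_irq_enable() with a cond_local_irq_disable() before returning: the common exit path expects to run with interrupts in the state the entry code established.
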
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 54226110bc7f..722a85f3b2dd 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -74,13 +74,7 @@ static bool in_entry_code(unsigned long ip)
{
char *addr = (char *)ip;
- if (addr >= __entry_text_start && addr < __entry_text_end)
- return true;
-
- if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
- return true;
-
- return false;
+ return addr >= __entry_text_start && addr < __entry_text_end;
}
static inline unsigned long *last_frame(struct unwind_state *state)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 47a8676c7395..764573de3996 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -171,7 +171,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_t *pte;
int i;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
@@ -197,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
}
pte_unmap_unlock(pte, ptl);
out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}
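
mmap_write_lock()/mmap_write_unlock() are the named wrappers introduced by the mmap_sem -> mmap_lock conversion, so callers no longer reach into the mm_struct field directly. A minimal sketch of what they are assumed to do (cf. <linux/mmap_lock.h>):

static inline void mmap_write_lock(struct mm_struct *mm)
{
	down_write(&mm->mmap_lock);
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
	up_write(&mm->mmap_lock);
}
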
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1bf7e312361f..3bfc8dd8a43d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -40,13 +40,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
#ifdef CONFIG_X86_32
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
-jiffies = jiffies_64;
#else
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
#endif
+jiffies = jiffies_64;
+
#if defined(CONFIG_X86_64)
/*
* On 64-bit, align RODATA to 2MB so we retain large page mappings for
@@ -134,7 +134,6 @@ SECTIONS
KPROBES_TEXT
ALIGN_ENTRY_TEXT_BEGIN
ENTRY_TEXT
- IRQENTRY_TEXT
ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
*(.fixup)