author     Paolo Bonzini <pbonzini@redhat.com>  2021-05-17 09:55:12 +0200
committer  Paolo Bonzini <pbonzini@redhat.com>  2021-05-17 09:55:12 +0200
commit     a4345a7cecfb91ae78cd43d26b0c6a956420761a (patch)
tree       b0cf6296f791fa250ccc9d419c46f48fb73c331b /arch/x86
parent     ce7ea0cfdc2e9ff31d12da31c3226deddb9644f5 (diff)
parent     cb853ded1d25e5b026ce115dbcde69e3d7e2e831 (diff)
Merge tag 'kvmarm-fixes-5.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 fixes for 5.13, take #1

- Fix regression with irqbypass not restarting the guest on failed connect
- Fix regression with debug register decoding resulting in overlapping access
- Commit exception state on exit to userspace
- Fix the MMU notifier return values
- Add missing 'static' qualifiers in the new host stage-2 code
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                        29
-rw-r--r--  arch/x86/events/amd/iommu.c             47
-rw-r--r--  arch/x86/include/asm/bug.h               9
-rw-r--r--  arch/x86/include/asm/idtentry.h         15
-rw-r--r--  arch/x86/include/asm/msr.h               4
-rw-r--r--  arch/x86/include/asm/page_64.h          33
-rw-r--r--  arch/x86/include/asm/page_64_types.h    23
-rw-r--r--  arch/x86/kernel/cpu/common.c             4
-rw-r--r--  arch/x86/kernel/cpu/resctrl/monitor.c    2
-rw-r--r--  arch/x86/kernel/nmi.c                   10
-rw-r--r--  arch/x86/kernel/process.c               19
-rw-r--r--  arch/x86/kernel/smpboot.c                3
-rw-r--r--  arch/x86/kvm/svm/svm.c                  39
-rw-r--r--  arch/x86/kvm/vmx/vmx.c                  55
-rw-r--r--  arch/x86/kvm/x86.c                       9
-rw-r--r--  arch/x86/kvm/x86.h                      45
-rw-r--r--  arch/x86/mm/pat/set_memory.c             8
-rw-r--r--  arch/x86/pci/amd_bus.c                   2
-rw-r--r--  arch/x86/um/Makefile                     1
-rw-r--r--  arch/x86/um/asm/elf.h                    2
-rw-r--r--  arch/x86/um/shared/sysdep/stub_32.h      2
-rw-r--r--  arch/x86/xen/pci-swiotlb-xen.c           4
22 files changed, 197 insertions, 168 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dac15f646f79..0045e1b44190 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,7 +60,13 @@ config X86
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_INIT
+ select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
+ select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
+ select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
+ select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE
+ select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
select ARCH_HAS_DEVMEM_IS_ALLOWED
@@ -165,6 +171,7 @@ config X86
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
+ select HAVE_ARCH_USERFAULTFD_MINOR if X86_64 && USERFAULTFD
select HAVE_ARCH_VMAP_STACK if X86_64
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_WITHIN_STACK_FRAMES
@@ -315,9 +322,6 @@ config GENERIC_CALIBRATE_DELAY
config ARCH_HAS_CPU_RELAX
def_bool y
-config ARCH_HAS_CACHE_LINE_SIZE
- def_bool y
-
config ARCH_HAS_FILTER_PGPROT
def_bool y
@@ -2428,30 +2432,13 @@ config ARCH_HAS_ADD_PAGES
def_bool y
depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
-config ARCH_ENABLE_MEMORY_HOTPLUG
- def_bool y
- depends on X86_64 || (X86_32 && HIGHMEM)
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
def_bool y
- depends on MEMORY_HOTPLUG
config USE_PERCPU_NUMA_NODE_ID
def_bool y
depends on NUMA
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
- def_bool y
- depends on X86_64 || X86_PAE
-
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
- def_bool y
- depends on X86_64 && HUGETLB_PAGE && MIGRATION
-
-config ARCH_ENABLE_THP_MIGRATION
- def_bool y
- depends on X86_64 && TRANSPARENT_HUGEPAGE
-
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index 1c1a7e45dc64..913745f1419b 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -19,8 +19,6 @@
#include "../perf_event.h"
#include "iommu.h"
-#define COUNTER_SHIFT 16
-
/* iommu pmu conf masks */
#define GET_CSOURCE(x) ((x)->conf & 0xFFULL)
#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL)
@@ -286,22 +284,31 @@ static void perf_iommu_start(struct perf_event *event, int flags)
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
hwc->state = 0;
+ /*
+ * To account for power-gating, which prevents writes to
+ * the counter, we need to enable the counter
+ * before setting up the counter register.
+ */
+ perf_iommu_enable_event(event);
+
if (flags & PERF_EF_RELOAD) {
- u64 prev_raw_count = local64_read(&hwc->prev_count);
+ u64 count = 0;
struct amd_iommu *iommu = perf_event_2_iommu(event);
+ /*
+ * Since the IOMMU PMU only supports counting mode,
+ * the counter always starts with value zero.
+ */
amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
- IOMMU_PC_COUNTER_REG, &prev_raw_count);
+ IOMMU_PC_COUNTER_REG, &count);
}
- perf_iommu_enable_event(event);
perf_event_update_userpage(event);
-
}
static void perf_iommu_read(struct perf_event *event)
{
- u64 count, prev, delta;
+ u64 count;
struct hw_perf_event *hwc = &event->hw;
struct amd_iommu *iommu = perf_event_2_iommu(event);
@@ -312,14 +319,11 @@ static void perf_iommu_read(struct perf_event *event)
/* IOMMU pc counter register is only 48 bits */
count &= GENMASK_ULL(47, 0);
- prev = local64_read(&hwc->prev_count);
- if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev)
- return;
-
- /* Handle 48-bit counter overflow */
- delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
- delta >>= COUNTER_SHIFT;
- local64_add(delta, &event->count);
+ /*
+ * Since the counter always starts with value zero,
+ * simply accumulate the count for the event.
+ */
+ local64_add(count, &event->count);
}
static void perf_iommu_stop(struct perf_event *event, int flags)
@@ -329,15 +333,16 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
if (hwc->state & PERF_HES_UPTODATE)
return;
+ /*
+ * To account for power-gating, in which reading the counter would
+ * return zero, we need to read the register before disabling.
+ */
+ perf_iommu_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+
perf_iommu_disable_event(event);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
-
- if (hwc->state & PERF_HES_UPTODATE)
- return;
-
- perf_iommu_read(event);
- hwc->state |= PERF_HES_UPTODATE;
}
static int perf_iommu_add(struct perf_event *event, int flags)
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 297fa12e7e27..84b87538a15d 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -7,18 +7,9 @@
/*
* Despite that some emulators terminate on UD2, we use it for WARN().
- *
- * Since various instruction decoders/specs disagree on the encoding of
- * UD0/UD1.
*/
-
-#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */
-#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */
#define ASM_UD2 ".byte 0x0f, 0x0b"
-
-#define INSN_UD0 0xff0f
#define INSN_UD2 0x0b0f
-
#define LEN_UD2 2
#ifdef CONFIG_GENERIC_BUG
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index e35e342673c7..73d45b0dfff2 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -588,6 +588,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
#endif
/* NMI */
+
+#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * Special NOIST entry point for VMX which invokes this on the kernel
+ * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
+ * 'executing' marker.
+ *
+ * On 32-bit this just uses the regular NMI entry point because 32-bit does
+ * not have ISTs.
+ */
+DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
+#else
+#define asm_exc_nmi_noist asm_exc_nmi
+#endif
+
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index e16cccdd0420..a3f87f1015d3 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -324,10 +324,6 @@ static inline int wrmsrl_safe(u32 msr, u64 val)
return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
}
-#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
-
-#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
-
struct msr *msrs_alloc(void);
void msrs_free(struct msr *msrs);
int msr_set_bit(u32 msr, u8 bit);
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..ca840fec7776 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -56,6 +56,39 @@ static inline void clear_page(void *page)
void copy_page(void *to, void *from);
+#ifdef CONFIG_X86_5LEVEL
+/*
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
+ */
+static inline unsigned long task_size_max(void)
+{
+ unsigned long ret;
+
+ alternative_io("movq %[small],%0","movq %[large],%0",
+ X86_FEATURE_LA57,
+ "=r" (ret),
+ [small] "i" ((1ul << 47)-PAGE_SIZE),
+ [large] "i" ((1ul << 56)-PAGE_SIZE));
+
+ return ret;
+}
+#endif /* CONFIG_X86_5LEVEL */
+
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 64297eabad63..a8d4ad856568 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -55,30 +55,13 @@
#ifdef CONFIG_X86_5LEVEL
#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
+/* See task_size_max() in <asm/page_64.h> */
#else
#define __VIRTUAL_MASK_SHIFT 47
+#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
#endif
-/*
- * User space process size. This is the first address outside the user range.
- * There are a few constraints that determine this:
- *
- * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
- * address, then that syscall will enter the kernel with a
- * non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything
- * from being mapped at the maximum canonical address.
- *
- * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
- * CPUs malfunction if they execute code from the highest canonical page.
- * They'll speculate right off the end of the canonical space, and
- * bad things happen. This is worked around in the same way as the
- * Intel problem.
- *
- * With page table isolation enabled, we map the LDT in ... [stay tuned]
- */
-#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
-
+#define TASK_SIZE_MAX task_size_max()
#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
/* This decides where the kernel will search for a free chunk of vm
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6bdb69a9a7dc..a1b756c49a93 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1851,8 +1851,8 @@ static inline void setup_getcpu(int cpu)
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
struct desc_struct d = { };
- if (boot_cpu_has(X86_FEATURE_RDTSCP))
- write_rdtscp_aux(cpudata);
+ if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
+ wrmsr(MSR_TSC_AUX, cpudata, 0);
/* Store CPU and node number in limit. */
d.limit0 = cpudata;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index dbeaa8409313..f07c10b87a87 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -84,7 +84,7 @@ unsigned int resctrl_cqm_threshold;
static const struct mbm_correction_factor_table {
u32 rmidthreshold;
u64 cf;
-} mbm_cf_table[] __initdata = {
+} mbm_cf_table[] __initconst = {
{7, CF(1.000000)},
{15, CF(1.000000)},
{15, CF(0.969650)},
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bf250a339655..2ef961cf4cfc 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -524,6 +524,16 @@ nmi_restart:
mds_user_clear_cpu_buffers();
}
+#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_noist)
+{
+ exc_nmi(regs);
+}
+#endif
+#if IS_MODULE(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
+#endif
+
void stop_nmi(void)
{
ignore_nmis++;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 43cbfc84153a..5e1f38179f49 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
#endif
/* Kernel thread ? */
- if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+ if (unlikely(p->flags & PF_KTHREAD)) {
memset(childregs, 0, sizeof(struct pt_regs));
kthread_frame_init(frame, sp, arg);
return 0;
@@ -172,6 +172,23 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
task_user_gs(p) = get_user_gs(current_pt_regs());
#endif
+ if (unlikely(p->flags & PF_IO_WORKER)) {
+ /*
+ * An IO thread is a user space thread, but it doesn't
+ * return to ret_after_fork().
+ *
+ * In order to indicate that to tools like gdb,
+ * we reset the stack and instruction pointers.
+ *
+ * It does the same kernel frame setup to return to a kernel
+ * function that a kernel thread does.
+ */
+ childregs->sp = 0;
+ childregs->ip = 0;
+ kthread_frame_init(frame, sp, arg);
+ return 0;
+ }
+
/* Set a new TLS for the child thread? */
if (clone_flags & CLONE_SETTLS)
ret = set_new_tls(p, tls);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7ffb0cf3f997..0ad5214f598a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1865,9 +1865,6 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
return true;
}
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-
#define X86_MATCH(model) \
X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 4dd9b7856e5b..dfa351e605de 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3705,25 +3705,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long vmcb_pa = svm->current_vmcb->pa;
- /*
- * VMENTER enables interrupts (host state), but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about
- * it. This is the same logic as for exit_to_user_mode().
- *
- * This ensures that e.g. latency analysis on the host observes
- * guest mode as interrupt enabled.
- *
- * guest_enter_irqoff() informs context tracking about the
- * transition to guest mode and if enabled adjusts RCU state
- * accordingly.
- */
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- guest_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
+ kvm_guest_enter_irqoff();
if (sev_es_guest(vcpu->kvm)) {
__svm_sev_es_vcpu_run(vmcb_pa);
@@ -3743,24 +3725,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
vmload(__sme_page_pa(sd->save_area));
}
- /*
- * VMEXIT disables interrupts (host state), but tracing and lockdep
- * have them in state 'on' as recorded before entering guest mode.
- * Same as enter_from_user_mode().
- *
- * guest_exit_irqoff() restores host context and reinstates RCU if
- * enabled and required.
- *
- * This needs to be done before the below as native_read_msr()
- * contains a tracepoint and x86_spec_ctrl_restore_host() calls
- * into world and some more.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- guest_exit_irqoff();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
+ kvm_guest_exit_irqoff();
}
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f2fd447eed45..4bceb5ca3a89 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -36,6 +36,7 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
+#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
@@ -6410,18 +6411,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+ unsigned long entry)
{
- unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
- gate_desc *desc = (gate_desc *)host_idt_base + vector;
-
kvm_before_interrupt(vcpu);
- vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+ vmx_do_interrupt_nmi_irqoff(entry);
kvm_after_interrupt(vcpu);
}
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
+ const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
@@ -6432,18 +6432,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+ handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
u32 intr_info = vmx_get_intr_info(vcpu);
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+ gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (WARN_ONCE(!is_external_intr(intr_info),
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, intr_info);
+ handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
}
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6657,25 +6659,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
- /*
- * VMENTER enables interrupts (host state), but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about
- * it. This is the same logic as for exit_to_user_mode().
- *
- * This ensures that e.g. latency analysis on the host observes
- * guest mode as interrupt enabled.
- *
- * guest_enter_irqoff() informs context tracking about the
- * transition to guest mode and if enabled adjusts RCU state
- * accordingly.
- */
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- guest_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
+ kvm_guest_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6691,24 +6675,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = native_read_cr2();
- /*
- * VMEXIT disables interrupts (host state), but tracing and lockdep
- * have them in state 'on' as recorded before entering guest mode.
- * Same as enter_from_user_mode().
- *
- * guest_exit_irqoff() restores host context and reinstates RCU if
- * enabled and required.
- *
- * This needs to be done before the below as native_read_msr()
- * contains a tracepoint and x86_spec_ctrl_restore_host() calls
- * into world and some more.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- guest_exit_irqoff();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
+ kvm_guest_exit_irqoff();
}
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5bd550eaf683..9b6bca616929 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9386,6 +9386,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
local_irq_disable();
kvm_after_interrupt(vcpu);
+ /*
+ * Wait until after servicing IRQs to account guest time so that any
+ * ticks that occurred while running the guest are properly accounted
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
+ * of accounting via context tracking, but the loss of accuracy is
+ * acceptable for all known use cases.
+ */
+ vtime_account_guest_exit();
+
if (lapic_in_kernel(vcpu)) {
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
if (delta != S64_MIN) {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 8ddd38146525..521f74e5bbf2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,51 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+static __always_inline void kvm_guest_enter_irqoff(void)
+{
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+static __always_inline void kvm_guest_exit_irqoff(void)
+{
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * context_tracking_guest_exit() restores host context and reinstates
+ * RCU if enabled and required.
+ *
+ * This needs to be done immediately after VM-Exit, before any code
+ * that might contain tracepoints or call out to the greater world,
+ * e.g. before x86_spec_ctrl_restore_host().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ context_tracking_guest_exit();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
+
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 427980617557..156cd235659f 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,8 @@
#include <linux/pci.h>
#include <linux/vmalloc.h>
#include <linux/libnvdimm.h>
+#include <linux/vmstat.h>
+#include <linux/kernel.h>
#include <asm/e820/api.h>
#include <asm/processor.h>
@@ -91,6 +93,12 @@ static void split_page_count(int level)
return;
direct_pages_count[level]--;
+ if (system_state == SYSTEM_RUNNING) {
+ if (level == PG_LEVEL_2M)
+ count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
+ else if (level == PG_LEVEL_1G)
+ count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
+ }
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index bfa50e65ef6c..ae744b6a0785 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -126,7 +126,7 @@ static int __init early_root_info_init(void)
node = (reg >> 4) & 0x07;
link = (reg >> 8) & 0x03;
- info = alloc_pci_root_info(min_bus, max_bus, node, link);
+ alloc_pci_root_info(min_bus, max_bus, node, link);
}
/*
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 77f70b969d14..5ccb18290d71 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -21,6 +21,7 @@ obj-y += checksum_32.o syscalls_32.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
+subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o
subarch-y += ../kernel/sys_ia32.o
else
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index c907b20d4993..dcaf3b38a9e0 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -212,6 +212,6 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
extern long elf_aux_hwcap;
#define ELF_HWCAP (elf_aux_hwcap)
-#define SET_PERSONALITY(ex) do ; while(0)
+#define SET_PERSONALITY(ex) do {} while(0)
#endif
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index c3891c1ada26..b95db9daf0e8 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -77,7 +77,7 @@ static inline void trap_myself(void)
__asm("int3");
}
-static void inline remap_stack_and_trap(void)
+static inline void remap_stack_and_trap(void)
{
__asm__ volatile (
"movl %%esp,%%ebx ;"
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index 19ae3e4fe4e9..54f9aa7e8457 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -59,7 +59,7 @@ int __init pci_xen_swiotlb_detect(void)
void __init pci_xen_swiotlb_init(void)
{
if (xen_swiotlb) {
- xen_swiotlb_init(1, true /* early */);
+ xen_swiotlb_init_early();
dma_ops = &xen_swiotlb_dma_ops;
#ifdef CONFIG_PCI
@@ -76,7 +76,7 @@ int pci_xen_swiotlb_init_late(void)
if (xen_swiotlb)
return 0;
- rc = xen_swiotlb_init(1, false /* late */);
+ rc = xen_swiotlb_init();
if (rc)
return rc;