diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-11-01 09:43:32 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-11-01 09:43:32 -0800 |
commit | 2d38c80d5bafecdd3bdb0d22b722afba8101ec1f (patch) | |
tree | fe995777da6edbaa47a170807da500419b4e3584 | |
parent | c2dc4c073fb71b50904493657a7622b481b346e3 (diff) | |
parent | 9478dec3b5e79a1431e2e2b911e32e52a11c6320 (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
"ARM:
- selftest fix
- force PTE mapping on device pages provided via VFIO
- fix detection of cacheable mapping at S2
- fallback to PMD/PTE mappings for composite huge pages
- fix accounting of Stage-2 PGD allocation
- fix AArch32 handling of some of the debug registers
- simplify host HYP entry
- fix stray pointer conversion on nVHE TLB invalidation
- fix initialization of the nVHE code
- simplify handling of capabilities exposed to HYP
- nuke VCPUs caught using a forbidden AArch32 EL0
x86:
- new nested virtualization selftest
- miscellaneous fixes
- make W=1 fixes
- reserve new CPUID bit in the KVM leaves"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: vmx: remove unused variable
KVM: selftests: Don't require THP to run tests
KVM: VMX: eVMCS: make evmcs_sanitize_exec_ctrls() work again
KVM: selftests: test behavior of unmapped L2 APIC-access address
KVM: x86: Fix NULL dereference at kvm_msr_ignored_check()
KVM: x86: replace static const variables with macros
KVM: arm64: Handle Asymmetric AArch32 systems
arm64: cpufeature: upgrade hyp caps to final
arm64: cpufeature: reorder cpus_have_{const, final}_cap()
KVM: arm64: Factor out is_{vhe,nvhe}_hyp_code()
KVM: arm64: Force PTE mapping on fault resulting in a device mapping
KVM: arm64: Use fallback mapping sizes for contiguous huge page sizes
KVM: arm64: Fix masks in stage2_pte_cacheable()
KVM: arm64: Fix AArch32 handling of DBGD{CCINT,SCRext} and DBGVCR
KVM: arm64: Allocate stage-2 pgd pages with GFP_KERNEL_ACCOUNT
KVM: arm64: Drop useless PAN setting on host EL1 to EL2 transition
KVM: arm64: Remove leftover kern_hyp_va() in nVHE TLB invalidation
KVM: arm64: Don't corrupt tpidr_el2 on failed HVC call
x86/kvm: Reserve KVM_FEATURE_MSI_EXT_DEST_ID
26 files changed, 306 insertions, 76 deletions
diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst index 7d81c0aa4a59..cf62162d4be2 100644 --- a/Documentation/virt/kvm/cpuid.rst +++ b/Documentation/virt/kvm/cpuid.rst @@ -92,6 +92,10 @@ KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit async pf acknowledgment msr 0x4b564d07. +KVM_FEATURE_MSI_EXT_DEST_ID 15 guest checks this feature bit + before using extended destination + ID bits in MSI address bits 11-5. + KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 host will warn if no guest-side per-cpu warps are expected in kvmclock diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f7e7144af174..97244d4feca9 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -375,6 +375,23 @@ cpucap_multi_entry_cap_matches(const struct arm64_cpu_capabilities *entry, return false; } +static __always_inline bool is_vhe_hyp_code(void) +{ + /* Only defined for code run in VHE hyp context */ + return __is_defined(__KVM_VHE_HYPERVISOR__); +} + +static __always_inline bool is_nvhe_hyp_code(void) +{ + /* Only defined for code run in NVHE hyp context */ + return __is_defined(__KVM_NVHE_HYPERVISOR__); +} + +static __always_inline bool is_hyp_code(void) +{ + return is_vhe_hyp_code() || is_nvhe_hyp_code(); +} + extern DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); extern struct static_key_false cpu_hwcap_keys[ARM64_NCAPS]; extern struct static_key_false arm64_const_caps_ready; @@ -428,35 +445,40 @@ static __always_inline bool __cpus_have_const_cap(int num) } /* - * Test for a capability, possibly with a runtime check. + * Test for a capability without a runtime check. * - * Before capabilities are finalized, this behaves as cpus_have_cap(). + * Before capabilities are finalized, this will BUG(). * After capabilities are finalized, this is patched to avoid a runtime check. * * @num must be a compile-time constant. */ -static __always_inline bool cpus_have_const_cap(int num) +static __always_inline bool cpus_have_final_cap(int num) { if (system_capabilities_finalized()) return __cpus_have_const_cap(num); else - return cpus_have_cap(num); + BUG(); } /* - * Test for a capability without a runtime check. + * Test for a capability, possibly with a runtime check for non-hyp code. * - * Before capabilities are finalized, this will BUG(). + * For hyp code, this behaves the same as cpus_have_final_cap(). + * + * For non-hyp code: + * Before capabilities are finalized, this behaves as cpus_have_cap(). * After capabilities are finalized, this is patched to avoid a runtime check. * * @num must be a compile-time constant. */ -static __always_inline bool cpus_have_final_cap(int num) +static __always_inline bool cpus_have_const_cap(int num) { - if (system_capabilities_finalized()) + if (is_hyp_code()) + return cpus_have_final_cap(num); + else if (system_capabilities_finalized()) return __cpus_have_const_cap(num); else - BUG(); + return cpus_have_cap(num); } static inline void cpus_set_cap(unsigned int num) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0aecbab6a7fb..781d029b8aa8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -239,6 +239,7 @@ enum vcpu_sysreg { #define cp14_DBGWCR0 (DBGWCR0_EL1 * 2) #define cp14_DBGWVR0 (DBGWVR0_EL1 * 2) #define cp14_DBGDCCINT (MDCCINT_EL1 * 2) +#define cp14_DBGVCR (DBGVCR32_EL2 * 2) #define NR_COPRO_REGS (NR_SYS_REGS * 2) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 09977acc007d..6069be50baf9 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -86,13 +86,12 @@ static inline bool is_kernel_in_hyp_mode(void) static __always_inline bool has_vhe(void) { /* - * The following macros are defined for code specic to VHE/nVHE. - * If has_vhe() is inlined into those compilation units, it can - * be determined statically. Otherwise fall back to caps. + * Code only run in VHE/NVHE hyp context can assume VHE is present or + * absent. Otherwise fall back to caps. */ - if (__is_defined(__KVM_VHE_HYPERVISOR__)) + if (is_vhe_hyp_code()) return true; - else if (__is_defined(__KVM_NVHE_HYPERVISOR__)) + else if (is_nvhe_hyp_code()) return false; else return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 61684a500914..c615b285ff5b 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -87,7 +87,6 @@ KVM_NVHE_ALIAS(__icache_flags); /* Kernel symbols needed for cpus_have_final/const_caps checks. */ KVM_NVHE_ALIAS(arm64_const_caps_ready); KVM_NVHE_ALIAS(cpu_hwcap_keys); -KVM_NVHE_ALIAS(cpu_hwcaps); /* Static keys which are set if a vGIC trap should be handled in hyp. */ KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8f8fca47abfc..5750ec34960e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -808,6 +808,25 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) preempt_enable(); + /* + * The ARMv8 architecture doesn't give the hypervisor + * a mechanism to prevent a guest from dropping to AArch32 EL0 + * if implemented by the CPU. If we spot the guest in such + * state and that we decided it wasn't supposed to do so (like + * with the asymmetric AArch32 case), return to userspace with + * a fatal error. + */ + if (!system_supports_32bit_el0() && vcpu_mode_is_32bit(vcpu)) { + /* + * As we have caught the guest red-handed, decide that + * it isn't fit for purpose anymore by making the vcpu + * invalid. The VMM can try and fix it by issuing a + * KVM_ARM_VCPU_INIT if it really wants to. + */ + vcpu->arch.target = -1; + ret = ARM_EXCEPTION_IL; + } + ret = handle_exit(vcpu, ret); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index ff9a0f547b9f..ed27f06a31ba 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -17,8 +17,6 @@ SYM_FUNC_START(__host_exit) get_host_ctxt x0, x1 - ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) - /* Store the host regs x2 and x3 */ stp x2, x3, [x0, #CPU_XREG_OFFSET(2)] diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 47224dc62c51..b11a9d7db677 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -57,16 +57,25 @@ __do_hyp_init: cmp x0, #HVC_STUB_HCALL_NR b.lo __kvm_handle_stub_hvc - /* Set tpidr_el2 for use by HYP to free a register */ - msr tpidr_el2, x2 - - mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) - cmp x0, x2 - b.eq 1f + // We only actively check bits [24:31], and everything + // else has to be zero, which we check at build time. +#if (KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) & 0xFFFFFFFF00FFFFFF) +#error Unexpected __KVM_HOST_SMCCC_FUNC___kvm_hyp_init value +#endif + + ror x0, x0, #24 + eor x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 24) & 0xF) + ror x0, x0, #4 + eor x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 28) & 0xF) + cbz x0, 1f mov x0, #SMCCC_RET_NOT_SUPPORTED eret -1: phys_to_ttbr x0, x1 +1: + /* Set tpidr_el2 for use by HYP to free a register */ + msr tpidr_el2, x2 + + phys_to_ttbr x0, x1 alternative_if ARM64_HAS_CNP orr x0, x0, #TTBR_CNP_BIT alternative_else_nop_endif diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 39ca71ab8866..fbde89a2c6e8 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -128,7 +128,6 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - mmu = kern_hyp_va(mmu); __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 0cdf6e461cbd..0271b4a3b9fe 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -635,7 +635,7 @@ static void stage2_flush_dcache(void *addr, u64 size) static bool stage2_pte_cacheable(kvm_pte_t pte) { - u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte); + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; return memattr == PAGE_S2_MEMATTR(NORMAL); } @@ -846,7 +846,7 @@ int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; - pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO); + pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!pgt->pgd) return -ENOMEM; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 19aacc7d64de..57972bdb213a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -787,14 +787,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_shift = PAGE_SHIFT; } - if (vma_shift == PUD_SHIFT && - !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) - vma_shift = PMD_SHIFT; - - if (vma_shift == PMD_SHIFT && - !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { - force_pte = true; + switch (vma_shift) { + case PUD_SHIFT: + if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) + break; + fallthrough; + case CONT_PMD_SHIFT: + vma_shift = PMD_SHIFT; + fallthrough; + case PMD_SHIFT: + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) + break; + fallthrough; + case CONT_PTE_SHIFT: vma_shift = PAGE_SHIFT; + force_pte = true; + fallthrough; + case PAGE_SHIFT: + break; + default: + WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); } vma_pagesize = 1UL << vma_shift; @@ -839,6 +851,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (kvm_is_device_pfn(pfn)) { device = true; + force_pte = true; } else if (logging_active && !write_fault) { /* * Only actually map the page as writable if this was a write diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 41348a7781d9..fb12d3ef423a 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1897,9 +1897,9 @@ static const struct sys_reg_desc cp14_regs[] = { { Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(1), /* DBGDCCINT */ - { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32 }, + { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT }, /* DBGDSCRext */ - { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32 }, + { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext }, DBG_BCR_BVR_WCR_WVR(2), /* DBGDTR[RT]Xint */ { Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi }, @@ -1914,7 +1914,7 @@ static const struct sys_reg_desc cp14_regs[] = { { Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(6), /* DBGVCR */ - { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32 }, + { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR }, DBG_BCR_BVR_WCR_WVR(7), DBG_BCR_BVR_WCR_WVR(8), DBG_BCR_BVR_WCR_WVR(9), diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 812e9b4c1114..950afebfba88 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -32,6 +32,7 @@ #define KVM_FEATURE_POLL_CONTROL 12 #define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_FEATURE_ASYNC_PF_INT 14 +#define KVM_FEATURE_MSI_EXT_DEST_ID 15 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17587f496ec7..1f96adff8dc4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -225,7 +225,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; - gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) + gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN) & shadow_nonpresent_or_rsvd_mask; return gpa >> PAGE_SHIFT; @@ -591,15 +591,15 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static u64 restore_acc_track_spte(u64 spte) { u64 new_spte = spte; - u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) - & shadow_acc_track_saved_bits_mask; + u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) + & SHADOW_ACC_TRACK_SAVED_BITS_MASK; WARN_ON_ONCE(spte_ad_enabled(spte)); WARN_ON_ONCE(!is_access_track_spte(spte)); new_spte &= ~shadow_acc_track_mask; - new_spte &= ~(shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift); + new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); new_spte |= saved_bits; return new_spte; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index d9c5665a55e9..fcac2cac78fe 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -55,7 +55,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) mask |= shadow_mmio_value | access; mask |= gpa | shadow_nonpresent_or_rsvd_mask; mask |= (gpa & shadow_nonpresent_or_rsvd_mask) - << shadow_nonpresent_or_rsvd_mask_len; + << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; return mask; } @@ -231,12 +231,12 @@ u64 mark_spte_for_access_track(u64 spte) !spte_can_locklessly_be_made_writable(spte), "kvm: Writable SPTE is not locklessly dirty-trackable\n"); - WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift), + WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), "kvm: Access Tracking saved bit locations are not zero\n"); - spte |= (spte & shadow_acc_track_saved_bits_mask) << - shadow_acc_track_saved_bits_shift; + spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; spte &= ~shadow_acc_track_mask; return spte; @@ -245,7 +245,7 @@ u64 mark_spte_for_access_track(u64 spte) void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_access_mask = access_mask; @@ -306,9 +306,9 @@ void kvm_mmu_reset_all_pte_masks(void) low_phys_bits = boot_cpu_data.x86_phys_bits; if (boot_cpu_has_bug(X86_BUG_L1TF) && !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= - 52 - shadow_nonpresent_or_rsvd_mask_len)) { + 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) { low_phys_bits = boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len; + - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; shadow_nonpresent_or_rsvd_mask = rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 4ecf40e0b8fe..5c75a451c000 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -105,19 +105,19 @@ extern u64 __read_mostly shadow_acc_track_mask; extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; /* + * The number of high-order 1 bits to use in the mask above. + */ +#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 + +/* * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the * PTEs being access tracked also need to be dirty tracked, so the W bit will be * restored only when a write is attempted to the page. */ -static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | - PT64_EPT_EXECUTABLE_MASK; -static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; - -/* - * The number of high-order 1 bits to use in the mask above. - */ -static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ + PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT /* * In some cases, we need to preserve the GFN of a non-present or reserved diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index e5325bd0f304..f3199bb02f22 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -297,14 +297,13 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { }; const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); -void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - } #endif diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index e5f7a7ebf27d..bd41d9462355 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -185,7 +185,7 @@ static inline void evmcs_load(u64 phys_addr) vp_ap->enlighten_vmentry = 1; } -void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); +__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} @@ -194,7 +194,6 @@ static inline u64 evmcs_read64(unsigned long field) { return 0; } static inline u32 evmcs_read32(unsigned long field) { return 0; } static inline u16 evmcs_read16(unsigned long field) { return 0; } static inline void evmcs_load(u64 phys_addr) {} -static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d14c94d0aff1..47b8357b9751 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2560,8 +2560,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; - if (static_branch_unlikely(&enable_evmcs)) +#if IS_ENABLED(CONFIG_HYPERV) + if (enlightened_vmcs) evmcs_sanitize_exec_ctrls(vmcs_conf); +#endif return 0; } @@ -6834,7 +6836,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx; - unsigned long *msr_bitmap; int i, cpu, err; BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); @@ -6894,7 +6895,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); - msr_bitmap = vmx->vmcs01.msr_bitmap; vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 397f599b20e5..f5ede41bf9e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -265,13 +265,13 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, if (ignore_msrs) { if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n", - op, msr, data); + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", + op, msr, data); /* Mask the error */ return 0; } else { - vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", - op, msr, data); + kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", + op, msr, data); return -ENOENT; } } diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 307ceaadbbb9..d2c2d6205008 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -15,6 +15,7 @@ /x86_64/vmx_preemption_timer_test /x86_64/svm_vmcall_test /x86_64/sync_regs_test +/x86_64/vmx_apic_access_test /x86_64/vmx_close_while_nested_test /x86_64/vmx_dirty_log_test /x86_64/vmx_set_nested_state_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 7ebe71fbca53..30afbad36cd5 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -49,6 +49,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 54d624dd6c10..e78d7e26ba61 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -573,6 +573,10 @@ struct vmx_pages { void *eptp_hva; uint64_t eptp_gpa; void *eptp; + + void *apic_access_hva; + uint64_t apic_access_gpa; + void *apic_access; }; union vmx_basic { @@ -615,5 +619,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t memslot, uint32_t eptp_memslot); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot); +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 74776ee228f2..3327cebc1095 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -14,6 +14,7 @@ #include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> +#include <unistd.h> #include <linux/kernel.h> #define KVM_UTIL_PGS_PER_HUGEPG 512 @@ -664,13 +665,21 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, /* As needed perform madvise */ if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) { - ret = madvise(region->host_mem, npages * vm->page_size, - src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); - TEST_ASSERT(ret == 0, "madvise failed,\n" - " addr: %p\n" - " length: 0x%lx\n" - " src_type: %x", - region->host_mem, npages * vm->page_size, src_type); + struct stat statbuf; + + ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf); + TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT), + "stat /sys/kernel/mm/transparent_hugepage"); + + TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP, + "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel"); + + if (ret == 0) { + ret = madvise(region->host_mem, npages * vm->page_size, + src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); + TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x", + region->host_mem, npages * vm->page_size, src_type); + } } region->unused_phy_pages = sparsebit_alloc(); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index f1e00d43eea2..2448b30e8efa 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -542,3 +542,12 @@ void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); } + +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot) +{ + vmx->apic_access = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access); + vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access); +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c new file mode 100644 index 000000000000..1f65342d6cb7 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_apic_access_test + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * The first subtest simply checks to see that an L2 guest can be + * launched with a valid APIC-access address that is backed by a + * page of L1 physical memory. + * + * The second subtest sets the APIC-access address to a (valid) L1 + * physical address that is not backed by memory. KVM can't handle + * this situation, so resuming L2 should result in a KVM exit for + * internal error (emulation). This is not an architectural + * requirement. It is just a shortcoming of KVM. The internal error + * is unfortunate, but it's better than what used to happen! + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include <string.h> +#include <sys/ioctl.h> + +#include "kselftest.h" + +#define VCPU_ID 0 + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + /* Exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa); + + /* Try to launch L2 with the memory-backed APIC-access address. */ + GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + vmwrite(APIC_ACCESS_ADDR, high_gpa); + + /* Try to resume L2 with the unbacked APIC-access address. */ + GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + unsigned long apic_access_addr = ~0ul; + unsigned int paddr_width; + unsigned int vaddr_width; + vm_vaddr_t vmx_pages_gva; + unsigned long high_gpa; + struct vmx_pages *vmx; + bool done = false; + + nested_vmx_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + kvm_get_cpu_address_width(&paddr_width, &vaddr_width); + high_gpa = (1ul << paddr_width) - getpagesize(); + if ((unsigned long)DEFAULT_GUEST_PHY_PAGES * getpagesize() > high_gpa) { + print_skip("No unbacked physical page available"); + exit(KSFT_SKIP); + } + + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + prepare_virtualize_apic_accesses(vmx, vm, 0); + vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa); + + while (!done) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + if (apic_access_addr == high_gpa) { + TEST_ASSERT(run->exit_reason == + KVM_EXIT_INTERNAL_ERROR, + "Got exit reason other than KVM_EXIT_INTERNAL_ERROR: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->internal.suberror == + KVM_INTERNAL_ERROR_EMULATION, + "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u\n", + run->internal.suberror); + break; + } + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + apic_access_addr = uc.args[1]; + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd); + } + } + kvm_vm_free(vm); + return 0; +} |