From e47057151422a67ce08747176fa21cb3b526a2c9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 21 Jul 2017 13:57:14 +1000 Subject: KVM: PPC: Book3S HV: Enable TM before accessing TM registers Commit 46a704f8409f ("KVM: PPC: Book3S HV: Preserve userspace HTM state properly", 2017-06-15) added code to read transactional memory (TM) registers but forgot to enable TM before doing so. The result is that if userspace does have live values in the TM registers, a KVM_RUN ioctl will cause a host kernel crash like this: [ 181.328511] Unrecoverable TM Unavailable Exception f60 at d00000001e7d9980 [ 181.328605] Oops: Unrecoverable TM Unavailable Exception, sig: 6 [#1] [ 181.328613] SMP NR_CPUS=2048 [ 181.328613] NUMA [ 181.328618] PowerNV [ 181.328646] Modules linked in: vhost_net vhost tap nfs_layout_nfsv41_files rpcsec_gss_krb5 nfsv4 dns_resolver nfs +fscache xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat +nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun ebtable_filter ebtables +ip6table_filter ip6_tables iptable_filter bridge stp llc kvm_hv kvm nfsd ses enclosure scsi_transport_sas ghash_generic +auth_rpcgss gf128mul xts sg ctr nfs_acl lockd vmx_crypto shpchp ipmi_powernv i2c_opal grace ipmi_devintf i2c_core +powernv_rng sunrpc ipmi_msghandler ibmpowernv uio_pdrv_genirq uio leds_powernv powernv_op_panel ip_tables xfs sd_mod +lpfc ipr bnx2x libata mdio ptp pps_core scsi_transport_fc libcrc32c dm_mirror dm_region_hash dm_log dm_mod [ 181.329278] CPU: 40 PID: 9926 Comm: CPU 0/KVM Not tainted 4.12.0+ #1 [ 181.329337] task: c000003fc6980000 task.stack: c000003fe4d80000 [ 181.329396] NIP: d00000001e7d9980 LR: d00000001e77381c CTR: d00000001e7d98f0 [ 181.329465] REGS: c000003fe4d837e0 TRAP: 0f60 Not tainted (4.12.0+) [ 181.329523] MSR: 9000000000009033 [ 181.329527] CR: 24022448 XER: 00000000 [ 181.329608] CFAR: d00000001e773818 SOFTE: 1 [ 181.329608] GPR00: d00000001e77381c c000003fe4d83a60 d00000001e7ef410 c000003fdcfe0000 [ 181.329608] GPR04: c000003fe4f00000 0000000000000000 0000000000000000 c000003fd7954800 [ 181.329608] GPR08: 0000000000000001 c000003fc6980000 0000000000000000 d00000001e7e2880 [ 181.329608] GPR12: d00000001e7d98f0 c000000007b19000 00000001295220e0 00007fffc0ce2090 [ 181.329608] GPR16: 0000010011886608 00007fff8c89f260 0000000000000001 00007fff8c080028 [ 181.329608] GPR20: 0000000000000000 00000100118500a6 0000010011850000 0000010011850000 [ 181.329608] GPR24: 00007fffc0ce1b48 0000010011850000 00000000d673b901 0000000000000000 [ 181.329608] GPR28: 0000000000000000 c000003fdcfe0000 c000003fdcfe0000 c000003fe4f00000 [ 181.330199] NIP [d00000001e7d9980] kvmppc_vcpu_run_hv+0x90/0x6b0 [kvm_hv] [ 181.330264] LR [d00000001e77381c] kvmppc_vcpu_run+0x2c/0x40 [kvm] [ 181.330322] Call Trace: [ 181.330351] [c000003fe4d83a60] [d00000001e773478] kvmppc_set_one_reg+0x48/0x340 [kvm] (unreliable) [ 181.330437] [c000003fe4d83b30] [d00000001e77381c] kvmppc_vcpu_run+0x2c/0x40 [kvm] [ 181.330513] [c000003fe4d83b50] [d00000001e7700b4] kvm_arch_vcpu_ioctl_run+0x114/0x2a0 [kvm] [ 181.330586] [c000003fe4d83bd0] [d00000001e7642f8] kvm_vcpu_ioctl+0x598/0x7a0 [kvm] [ 181.330658] [c000003fe4d83d40] [c0000000003451b8] do_vfs_ioctl+0xc8/0x8b0 [ 181.330717] [c000003fe4d83de0] [c000000000345a64] SyS_ioctl+0xc4/0x120 [ 181.330776] [c000003fe4d83e30] [c00000000000b004] system_call+0x58/0x6c [ 181.330833] Instruction dump: [ 181.330869] e92d0260 e9290b50 e9290108 792807e3 41820058 e92d0260 e9290b50 e9290108 [ 181.330941] 792ae8a4 794a1f87 408204f4 e92d0260 <7d4022a6> f9490ff0 e92d0260 7d4122a6 [ 181.331013] ---[ end trace 6f6ddeb4bfe92a92 ]--- The fix is just to turn on the TM bit in the MSR before accessing the registers. Cc: stable@vger.kernel.org # v3.14+ Fixes: 46a704f8409f ("KVM: PPC: Book3S HV: Preserve userspace HTM state properly") Reported-by: Jan Stancek Tested-by: Jan Stancek Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0b436df746fc..359c79cdf0cc 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3211,6 +3211,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) run->fail_entry.hardware_entry_failure_reason = 0; return -EINVAL; } + /* Enable TM so we can read the TM SPRs */ + mtmsr(mfmsr() | MSR_TM); current->thread.tm_tfhar = mfspr(SPRN_TFHAR); current->thread.tm_tfiar = mfspr(SPRN_TFIAR); current->thread.tm_texasr = mfspr(SPRN_TEXASR); -- cgit v1.2.3 From ef42719814db06fdfa26cd7566de0b64de173320 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 21 Jul 2017 15:41:49 +1000 Subject: KVM: PPC: Book3S HV: Fix host crash on changing HPT size Commit f98a8bf9ee20 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size", 2016-12-20) changed the behaviour of the KVM_PPC_ALLOCATE_HTAB ioctl so that it now allocates a new HPT and new revmap array if there was a previously-allocated HPT of a different size from the size being requested. In this case, we need to reset the rmap arrays of the memslots, because the rmap arrays will contain references to HPTEs which are no longer valid. Worse, these references are also references to slots in the new revmap array (which parallels the HPT), and the new revmap array contains random contents, since it doesn't get zeroed on allocation. The effect of having these stale references to slots in the revmap array that contain random contents is that subsequent calls to functions such as kvmppc_add_revmap_chain will crash because they will interpret the non-zero contents of the revmap array as HPTE indexes and thus index outside of the revmap array. This leads to host crashes such as the following. [ 7072.862122] Unable to handle kernel paging request for data at address 0xd000000c250c00f8 [ 7072.862218] Faulting instruction address: 0xc0000000000e1c78 [ 7072.862233] Oops: Kernel access of bad area, sig: 11 [#1] [ 7072.862286] SMP NR_CPUS=1024 [ 7072.862286] NUMA [ 7072.862325] PowerNV [ 7072.862378] Modules linked in: kvm_hv vhost_net vhost tap xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm iw_cxgb3 mlx5_ib ib_core ses enclosure scsi_transport_sas ipmi_powernv ipmi_devintf ipmi_msghandler powernv_op_panel i2c_opal nfsd auth_rpcgss oid_registry [ 7072.863085] nfs_acl lockd grace sunrpc kvm_pr kvm xfs libcrc32c scsi_dh_alua dm_service_time radeon lpfc nvme_fc nvme_fabrics nvme_core scsi_transport_fc i2c_algo_bit tg3 drm_kms_helper ptp pps_core syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm dm_multipath i2c_core cxgb3 mlx5_core mdio [last unloaded: kvm_hv] [ 7072.863381] CPU: 72 PID: 56929 Comm: qemu-system-ppc Not tainted 4.12.0-kvm+ #59 [ 7072.863457] task: c000000fe29e7600 task.stack: c000001e3ffec000 [ 7072.863520] NIP: c0000000000e1c78 LR: c0000000000e2e3c CTR: c0000000000e25f0 [ 7072.863596] REGS: c000001e3ffef560 TRAP: 0300 Not tainted (4.12.0-kvm+) [ 7072.863658] MSR: 9000000100009033 [ 7072.863667] CR: 44082882 XER: 20000000 [ 7072.863767] CFAR: c0000000000e2e38 DAR: d000000c250c00f8 DSISR: 42000000 SOFTE: 1 GPR00: c0000000000e2e3c c000001e3ffef7e0 c000000001407d00 d000000c250c00f0 GPR04: d00000006509fb70 d00000000b3d2048 0000000003ffdfb7 0000000000000000 GPR08: 00000001007fdfb7 00000000c000000f d0000000250c0000 000000000070f7bf GPR12: 0000000000000008 c00000000fdad000 0000000010879478 00000000105a0d78 GPR16: 00007ffaf4080000 0000000000001190 0000000000000000 0000000000010000 GPR20: 4001ffffff000415 d00000006509fb70 0000000004091190 0000000ee1881190 GPR24: 0000000003ffdfb7 0000000003ffdfb7 00000000007fdfb7 c000000f5c958000 GPR28: d00000002d09fb70 0000000003ffdfb7 d00000006509fb70 d00000000b3d2048 [ 7072.864439] NIP [c0000000000e1c78] kvmppc_add_revmap_chain+0x88/0x130 [ 7072.864503] LR [c0000000000e2e3c] kvmppc_do_h_enter+0x84c/0x9e0 [ 7072.864566] Call Trace: [ 7072.864594] [c000001e3ffef7e0] [c000001e3ffef830] 0xc000001e3ffef830 (unreliable) [ 7072.864671] [c000001e3ffef830] [c0000000000e2e3c] kvmppc_do_h_enter+0x84c/0x9e0 [ 7072.864751] [c000001e3ffef920] [d00000000b38d878] kvmppc_map_vrma+0x168/0x200 [kvm_hv] [ 7072.864831] [c000001e3ffef9e0] [d00000000b38a684] kvmppc_vcpu_run_hv+0x1284/0x1300 [kvm_hv] [ 7072.864914] [c000001e3ffefb30] [d00000000f465664] kvmppc_vcpu_run+0x44/0x60 [kvm] [ 7072.865008] [c000001e3ffefb60] [d00000000f461864] kvm_arch_vcpu_ioctl_run+0x114/0x290 [kvm] [ 7072.865152] [c000001e3ffefbe0] [d00000000f453c98] kvm_vcpu_ioctl+0x598/0x7a0 [kvm] [ 7072.865292] [c000001e3ffefd40] [c000000000389328] do_vfs_ioctl+0xd8/0x8c0 [ 7072.865410] [c000001e3ffefde0] [c000000000389be4] SyS_ioctl+0xd4/0x130 [ 7072.865526] [c000001e3ffefe30] [c00000000000b760] system_call+0x58/0x6c [ 7072.865644] Instruction dump: [ 7072.865715] e95b2110 793a0020 7b4926e4 7f8a4a14 409e0098 807c000c 786326e4 7c6a1a14 [ 7072.865857] 935e0008 7bbd0020 813c000c 913e000c <93a30008> 93bc000c 48000038 60000000 [ 7072.866001] ---[ end trace 627b6e4bf8080edc ]--- Note that to trigger this, it is necessary to use a recent upstream QEMU (or other userspace that resizes the HPT at CAS time), specify a maximum memory size substantially larger than the current memory size, and boot a guest kernel that does not support HPT resizing. This fixes the problem by resetting the rmap arrays when the old HPT is freed. Fixes: f98a8bf9ee20 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size") Cc: stable@vger.kernel.org # v4.11+ Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8cb0190e2a73..b42812e014c0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -164,8 +164,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) goto out; } - if (kvm->arch.hpt.virt) + if (kvm->arch.hpt.virt) { kvmppc_free_hpt(&kvm->arch.hpt); + kvmppc_rmap_reset(kvm); + } err = kvmppc_allocate_hpt(&info, order); if (err < 0) -- cgit v1.2.3 From fa19871a166f6a940eb97dd511d3ddb1841ed95a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 14 Jul 2017 13:38:28 +0200 Subject: KVM: VMX: remove unused field Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29fd8af5c347..d04092f821b6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -563,7 +563,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; -- cgit v1.2.3 From 4f899147424a189b0ad1fdd6f35784ed5a642e83 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 10 Jul 2017 13:35:48 +0200 Subject: KVM: s390: take srcu lock when getting/setting storage keys The following warning was triggered by missing srcu locks around the storage key handling functions. ============================= WARNING: suspicious RCU usage 4.12.0+ #56 Not tainted ----------------------------- ./include/linux/kvm_host.h:572 suspicious rcu_dereference_check() usage! rcu_scheduler_active = 2, debug_locks = 1 1 lock held by live_migration/4936: #0: (&mm->mmap_sem){++++++}, at: [<0000000000141be0>] kvm_arch_vm_ioctl+0x6b8/0x22d0 CPU: 8 PID: 4936 Comm: live_migration Not tainted 4.12.0+ #56 Hardware name: IBM 2964 NC9 704 (LPAR) Call Trace: ([<000000000011378a>] show_stack+0xea/0xf0) [<000000000055cc4c>] dump_stack+0x94/0xd8 [<000000000012ee70>] gfn_to_memslot+0x1a0/0x1b8 [<0000000000130b76>] gfn_to_hva+0x2e/0x48 [<0000000000141c3c>] kvm_arch_vm_ioctl+0x714/0x22d0 [<000000000013306c>] kvm_vm_ioctl+0x11c/0x7b8 [<000000000037e2c0>] do_vfs_ioctl+0xa8/0x6c8 [<000000000037e984>] SyS_ioctl+0xa4/0xb8 [<00000000008b20a4>] system_call+0xc4/0x27c 1 lock held by live_migration/4936: #0: (&mm->mmap_sem){++++++}, at: [<0000000000141be0>] kvm_arch_vm_ioctl+0x6b8/0x22d0 Signed-off-by: Christian Borntraeger Reviewed-by: Pierre Morel --- arch/s390/kvm/kvm-s390.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3f2884e99ed4..af09d3437631 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1324,7 +1324,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; - int i, r = 0; + int srcu_idx, i, r = 0; if (args->flags != 0) return -EINVAL; @@ -1342,6 +1342,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) return -ENOMEM; down_read(¤t->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { @@ -1353,6 +1354,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (r) break; } + srcu_read_unlock(&kvm->srcu, srcu_idx); up_read(¤t->mm->mmap_sem); if (!r) { @@ -1370,7 +1372,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; - int i, r = 0; + int srcu_idx, i, r = 0; if (args->flags != 0) return -EINVAL; @@ -1396,6 +1398,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) goto out; down_read(¤t->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { @@ -1413,6 +1416,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (r) break; } + srcu_read_unlock(&kvm->srcu, srcu_idx); up_read(¤t->mm->mmap_sem); out: kvfree(keys); -- cgit v1.2.3 From a512177ef3bb92dbec8a96fe337b11c126bf9c91 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Jul 2017 18:54:38 +0200 Subject: KVM: x86: do mask out upper bits of PAE CR3 This reverts the change of commit f85c758dbee54cc3612a6e873ef7cecdb66ebee5, as the behavior it modified was intended. The VM is running in 32-bit PAE mode, and Table 4-7 of the Intel manual says: Table 4-7. Use of CR3 with PAE Paging Bit Position(s) Contents 4:0 Ignored 31:5 Physical address of the 32-Byte aligned page-directory-pointer table used for linear-address translation 63:32 Ignored (these bits exist only on processors supporting the Intel-64 architecture) To placate the static checker, write the mask explicitly as an unsigned long constant instead of using a 32-bit unsigned constant. Cc: Dan Carpenter Fixes: f85c758dbee54cc3612a6e873ef7cecdb66ebee5 Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 82a63c59f77b..6c97c82814c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -597,8 +597,8 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - gfn = (kvm_read_cr3(vcpu) & ~31ul) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & ~31ul) & (PAGE_SIZE - 1); + gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; + offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) -- cgit v1.2.3 From 210f84b0ca7743f3b2a9acfae81df668dbbb6a12 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Fri, 28 Apr 2017 13:13:58 +0800 Subject: x86: irq: Define a global vector for nested posted interrupts We are using the same vector for nested/non-nested posted interrupts delivery, this may cause interrupts latency in L1 since we can't kick the L2 vcpu out of vmx-nonroot mode. This patch introduces a new vector which is only for nested posted interrupts to solve the problems above. Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/entry/entry_64.S | 1 + arch/x86/include/asm/entry_arch.h | 2 ++ arch/x86/include/asm/hardirq.h | 1 + arch/x86/include/asm/hw_irq.h | 2 ++ arch/x86/include/asm/irq_vectors.h | 3 ++- arch/x86/kernel/irq.c | 19 +++++++++++++++++++ arch/x86/kernel/irqinit.c | 2 ++ 7 files changed, 29 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a9a8027a6c0e..d271fb79248f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -705,6 +705,7 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi +apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index df002992d8fd..07b06955a05d 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -25,6 +25,8 @@ BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, smp_kvm_posted_intr_ipi) BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, smp_kvm_posted_intr_wakeup_ipi) +BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, + smp_kvm_posted_intr_nested_ipi) #endif /* diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 9b76cd331990..ad1ed531febc 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -15,6 +15,7 @@ typedef struct { #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; + unsigned int kvm_posted_intr_nested_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b90e1053049b..d6dbafbd4207 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -30,6 +30,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); +extern asmlinkage void kvm_posted_intr_nested_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -62,6 +63,7 @@ extern void trace_call_function_single_interrupt(void); #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi #define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi +#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi #endif /* CONFIG_TRACING */ #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6ca9fd6234e1..aaf8d28b5d00 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -83,7 +83,6 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -#define POSTED_INTR_WAKEUP_VECTOR 0xf1 /* * IRQ work vector: */ @@ -98,6 +97,8 @@ /* Vector for KVM to deliver posted interrupt IPI */ #ifdef CONFIG_HAVE_KVM #define POSTED_INTR_VECTOR 0xf2 +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 +#define POSTED_INTR_NESTED_VECTOR 0xf0 #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4aa03c5a14c9..4ed0aba8dbc8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -155,6 +155,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); seq_puts(p, " Posted-interrupt notification event\n"); + seq_printf(p, "%*s: ", prec, "NPI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_nested_ipis); + seq_puts(p, " Nested posted-interrupt event\n"); + seq_printf(p, "%*s: ", prec, "PIW"); for_each_online_cpu(j) seq_printf(p, "%10u ", @@ -313,6 +319,19 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); } + +/* + * Handler for POSTED_INTERRUPT_NESTED_VECTOR. + */ +__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); + inc_irq_stat(kvm_posted_intr_nested_ipis); + exiting_irq(); + set_irq_regs(old_regs); +} #endif __visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 7468c6987547..c7fd18526c3e 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -150,6 +150,8 @@ static void __init apic_intr_init(void) alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); /* IPI for KVM to deliver interrupt to wake up tasks */ alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); + /* IPI for KVM to deliver nested posted interrupt */ + alloc_intr_gate(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi); #endif /* IPI vectors for APIC spurious and error interrupts */ -- cgit v1.2.3 From 06a5524f091b6b41b0007810c74cc9315923a145 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Fri, 28 Apr 2017 13:13:59 +0800 Subject: KVM: nVMX: Fix posted intr delivery when vcpu is in guest mode The PI vector for L0 and L1 must be different. If dest vcpu0 is in guest mode while vcpu1 is delivering a non-nested PI to vcpu0, there wont't be any vmexit so that the non-nested interrupt will be delayed. Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d04092f821b6..497cf64794fd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4987,9 +4987,12 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) } } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + bool nested) { #ifdef CONFIG_SMP + int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; + if (vcpu->mode == IN_GUEST_MODE) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5007,8 +5010,7 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) */ WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return true; } #endif @@ -5023,7 +5025,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, true); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. @@ -5057,7 +5059,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return; - if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); } @@ -10064,13 +10066,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { - /* - * Note that we use L0's vector here and in - * vmx_deliver_nested_posted_interrupt. - */ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); } else { exec_control &= ~PIN_BASED_POSTED_INTR; } @@ -10941,7 +10939,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, */ vmx_flush_tlb(vcpu); } - + /* Restore posted intr vector. */ + if (nested_cpu_has_posted_intr(vmcs12)) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); -- cgit v1.2.3 From 2d6144e366fb39609aecf7a658e2e10af37627e9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 25 Jul 2017 03:40:46 -0700 Subject: KVM: nVMX: Fix loss of L2's NMI blocking state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run kvm-unit-tests/eventinj.flat in L1 w/ ept=0 on both L0 and L1: Before NMI IRET test Sending NMI to self NMI isr running stack 0x461000 Sending nested NMI to self After nested NMI to self Nested NMI isr running rip=40038e After iret After NMI to self FAIL: NMI Commit 4c4a6f790ee862 (KVM: nVMX: track NMI blocking state separately for each VMCS) tracks NMI blocking state separately for vmcs01 and vmcs02. However it is not enough: - The L2 (kvm-unit-tests/eventinj.flat) generates NMI that will fault on IRET, so the L2 can generate #PF which can be intercepted by L0. - L0 walks L1's guest page table and sees the mapping is invalid, it resumes the L1 guest and injects the #PF into L1. At this point the vmcs02 has nmi_known_unmasked=true. - L1 sets set bit 3 (blocking by NMI) in the interruptibility-state field of vmcs12 (and fixes the shadow page table) before resuming L2 guest. - L1 executes VMRESUME to resume L2, causing a vmexit to L0 - during VMRESUME emulation, prepare_vmcs02 sets bit 3 in the interruptibility-state field of vmcs02, but nmi_known_unmasked is still true. - L2 immediately exits to L0 with another page fault, because L0 still has not updated the NGVA->HPA page tables. However, nmi_known_unmasked is true so vmx_recover_nmi_blocking does not do anything. The fix is to update nmi_known_unmasked when preparing vmcs02 from vmcs12. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 497cf64794fd..39a6222bf968 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10042,6 +10042,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vm_entry_instruction_len); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } -- cgit v1.2.3 From 1d518c6820daf4e00d29adfba980aee05f605f0f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 25 Jul 2017 00:43:15 -0700 Subject: KVM: LAPIC: Fix reentrancy issues with preempt notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preempt can occur in the preemption timer expiration handler: CPU0 CPU1 preemption timer vmexit handle_preemption_timer(vCPU0) kvm_lapic_expired_hv_timer hv_timer_is_use == true sched_out sched_in kvm_arch_vcpu_load kvm_lapic_restart_hv_timer restart_apic_timer start_hv_timer already-expired timer or sw timer triggerd in the window start_sw_timer cancel_hv_timer /* back in kvm_lapic_expired_hv_timer */ cancel_hv_timer WARN_ON(!apic->lapic_timer.hv_timer_in_use); ==> Oops This can be reproduced if CONFIG_PREEMPT is enabled. ------------[ cut here ]------------ WARNING: CPU: 4 PID: 2972 at /home/kernel/linux/arch/x86/kvm//lapic.c:1563 kvm_lapic_expired_hv_timer+0x9e/0xb0 [kvm] CPU: 4 PID: 2972 Comm: qemu-system-x86 Tainted: G OE 4.13.0-rc2+ #16 RIP: 0010:kvm_lapic_expired_hv_timer+0x9e/0xb0 [kvm] Call Trace: handle_preemption_timer+0xe/0x20 [kvm_intel] vmx_handle_exit+0xb8/0xd70 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xdd1/0x1be0 [kvm] ? kvm_arch_vcpu_load+0x47/0x230 [kvm] ? kvm_arch_vcpu_load+0x62/0x230 [kvm] kvm_vcpu_ioctl+0x340/0x700 [kvm] ? kvm_vcpu_ioctl+0x340/0x700 [kvm] ? __fget+0xfc/0x210 do_vfs_ioctl+0xa4/0x6a0 ? __fget+0x11d/0x210 SyS_ioctl+0x79/0x90 do_syscall_64+0x81/0x220 entry_SYSCALL64_slow_path+0x25/0x25 ------------[ cut here ]------------ WARNING: CPU: 4 PID: 2972 at /home/kernel/linux/arch/x86/kvm//lapic.c:1498 cancel_hv_timer.isra.40+0x4f/0x60 [kvm] CPU: 4 PID: 2972 Comm: qemu-system-x86 Tainted: G W OE 4.13.0-rc2+ #16 RIP: 0010:cancel_hv_timer.isra.40+0x4f/0x60 [kvm] Call Trace: kvm_lapic_expired_hv_timer+0x3e/0xb0 [kvm] handle_preemption_timer+0xe/0x20 [kvm_intel] vmx_handle_exit+0xb8/0xd70 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xdd1/0x1be0 [kvm] ? kvm_arch_vcpu_load+0x47/0x230 [kvm] ? kvm_arch_vcpu_load+0x62/0x230 [kvm] kvm_vcpu_ioctl+0x340/0x700 [kvm] ? kvm_vcpu_ioctl+0x340/0x700 [kvm] ? __fget+0xfc/0x210 do_vfs_ioctl+0xa4/0x6a0 ? __fget+0x11d/0x210 SyS_ioctl+0x79/0x90 do_syscall_64+0x81/0x220 entry_SYSCALL64_slow_path+0x25/0x25 This patch fixes it by making the caller of cancel_hv_timer, start_hv_timer and start_sw_timer be in preemption-disabled regions, which trivially avoid any reentrancy issue with preempt notifier. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li [Add more WARNs. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2819d4c123eb..589dcc117086 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1495,11 +1495,10 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { + WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); - preempt_disable(); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; - preempt_enable(); } static bool start_hv_timer(struct kvm_lapic *apic) @@ -1507,6 +1506,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) struct kvm_timer *ktimer = &apic->lapic_timer; int r; + WARN_ON(preemptible()); if (!kvm_x86_ops->set_hv_timer) return false; @@ -1538,6 +1538,8 @@ static bool start_hv_timer(struct kvm_lapic *apic) static void start_sw_timer(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; + + WARN_ON(preemptible()); if (apic->lapic_timer.hv_timer_in_use) cancel_hv_timer(apic); if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) @@ -1552,15 +1554,20 @@ static void start_sw_timer(struct kvm_lapic *apic) static void restart_apic_timer(struct kvm_lapic *apic) { + preempt_disable(); if (!start_hv_timer(apic)) start_sw_timer(apic); + preempt_enable(); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - WARN_ON(!apic->lapic_timer.hv_timer_in_use); + preempt_disable(); + /* If the preempt notifier has already run, it also called apic_timer_expired */ + if (!apic->lapic_timer.hv_timer_in_use) + goto out; WARN_ON(swait_active(&vcpu->wq)); cancel_hv_timer(apic); apic_timer_expired(apic); @@ -1569,6 +1576,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) advance_periodic_target_expiration(apic); restart_apic_timer(apic); } +out: + preempt_enable(); } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); @@ -1582,9 +1591,11 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + preempt_disable(); /* Possibly the TSC deadline timer is not enabled yet */ if (apic->lapic_timer.hv_timer_in_use) start_sw_timer(apic); + preempt_enable(); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); -- cgit v1.2.3