diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-20 10:20:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-20 10:20:27 -0700 |
commit | 07ab9d5bc53d7fe84047be1d403566123ab9cfaa (patch) | |
tree | f6bec8b0dc2480521b21ad3c567439a9535460c4 /arch/x86/kvm | |
parent | f65420df914a85e33b2c8b1cab310858b2abb7c0 (diff) | |
parent | 30cd8604323dbaf20a80e797fe7057f5b02e394d (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more KVM updates from Paolo Bonzini:
"Mostly bugfixes, but also:
- s390 support for KVM selftests
- LAPIC timer offloading to housekeeping CPUs
- Extend an s390 optimization for overcommitted hosts to all
architectures
- Debugging cleanups and improvements"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (25 commits)
KVM: x86: Add fixed counters to PMU filter
KVM: nVMX: do not use dangling shadow VMCS after guest reset
KVM: VMX: dump VMCS on failed entry
KVM: x86/vPMU: refine kvm_pmu err msg when event creation failed
KVM: s390: Use kvm_vcpu_wake_up in kvm_s390_vcpu_wakeup
KVM: Boost vCPUs that are delivering interrupts
KVM: selftests: Remove superfluous define from vmx.c
KVM: SVM: Fix detection of AMD Errata 1096
KVM: LAPIC: Inject timer interrupt via posted interrupt
KVM: LAPIC: Make lapic timer unpinned
KVM: x86/vPMU: reset pmc->counter to 0 for pmu fixed_counters
KVM: nVMX: Ignore segment base for VMX memory operand when segment not FS or GS
kvm: x86: ioapic and apic debug macros cleanup
kvm: x86: some tsc debug cleanup
kvm: vmx: fix coccinelle warnings
x86: kvm: avoid constant-conversion warning
x86: kvm: avoid -Wsometimes-uninitialized warning
KVM: x86: expose AVX512_BF16 feature to guest
KVM: selftests: enable pgste option for the linker on s390
KVM: selftests: Move kvm_create_max_vcpus test to generic code
...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 15 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 202 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.c | 27 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 42 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 13 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 11 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 2 |
13 files changed, 189 insertions, 188 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ead681210306..22c2720cd948 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -368,9 +368,13 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR); + /* cpuid 7.1.eax */ + const u32 kvm_cpuid_7_1_eax_x86_features = + F(AVX512_BF16); + switch (index) { case 0: - entry->eax = 0; + entry->eax = min(entry->eax, 1u); entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_7_0_EBX); /* TSC_ADJUST is emulated */ @@ -394,6 +398,12 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) */ entry->edx |= F(ARCH_CAPABILITIES); break; + case 1: + entry->eax &= kvm_cpuid_7_1_eax_x86_features; + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; default: WARN_ON_ONCE(1); entry->eax = 0; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a39e38f13029..c10a8b10b203 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1594,7 +1594,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS; uint16_t code, rep_idx, rep_cnt; - bool fast, longmode, rep; + bool fast, rep; /* * hypercall generates UD from non zero cpl and real mode @@ -1605,9 +1605,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 1; } - longmode = is_64_bit_mode(vcpu); - - if (!longmode) { +#ifdef CONFIG_X86_64 + if (is_64_bit_mode(vcpu)) { + param = kvm_rcx_read(vcpu); + ingpa = kvm_rdx_read(vcpu); + outgpa = kvm_r8_read(vcpu); + } else +#endif + { param = ((u64)kvm_rdx_read(vcpu) << 32) | (kvm_rax_read(vcpu) & 0xffffffff); ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | @@ -1615,13 +1620,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | (kvm_rsi_read(vcpu) & 0xffffffff); } -#ifdef CONFIG_X86_64 - else { - param = kvm_rcx_read(vcpu); - ingpa = kvm_rdx_read(vcpu); - outgpa = 
kvm_r8_read(vcpu); - } -#endif code = param & 0xffff; fast = !!(param & HV_HYPERCALL_FAST_BIT); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 1add1bc881e2..d859ae8890d0 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -45,11 +45,6 @@ #include "lapic.h" #include "irq.h" -#if 0 -#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) -#else -#define ioapic_debug(fmt, arg...) -#endif static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); @@ -294,7 +289,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) default: index = (ioapic->ioregsel - 0x10) >> 1; - ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; e = &ioapic->redirtbl[index]; @@ -343,12 +337,6 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) entry->fields.remote_irr)) return -1; - ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " - "vector=%x trig_mode=%x\n", - entry->fields.dest_id, entry->fields.dest_mode, - entry->fields.delivery_mode, entry->fields.vector, - entry->fields.trig_mode); - irqe.dest_id = entry->fields.dest_id; irqe.vector = entry->fields.vector; irqe.dest_mode = entry->fields.dest_mode; @@ -515,7 +503,6 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!ioapic_in_range(ioapic, addr)) return -EOPNOTSUPP; - ioapic_debug("addr %lx\n", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ addr &= 0xff; @@ -558,8 +545,6 @@ static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!ioapic_in_range(ioapic, addr)) return -EOPNOTSUPP; - ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", - (void*)addr, len, val); ASSERT(!(addr & 0xf)); /* check alignment */ switch (len) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a232e76d8f23..0aa158657f20 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -52,9 +52,6 @@ #define 
PRIu64 "u" #define PRIo64 "o" -/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ -#define apic_debug(fmt, arg...) do {} while (0) - /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) @@ -121,6 +118,17 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) return apic->vcpu->vcpu_id; } +bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) +{ + return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt); + +static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) +{ + return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE; +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -627,7 +635,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) { u8 val; if (pv_eoi_get_user(vcpu, &val) < 0) - apic_debug("Can't read EOI MSR value: 0x%llx\n", + printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n", (unsigned long long)vcpu->arch.pv_eoi.msr_val); return val & 0x1; } @@ -635,7 +643,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { - apic_debug("Can't set EOI MSR value: 0x%llx\n", + printk(KERN_WARNING "Can't set EOI MSR value: 0x%llx\n", (unsigned long long)vcpu->arch.pv_eoi.msr_val); return; } @@ -645,7 +653,7 @@ static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { - apic_debug("Can't clear EOI MSR value: 0x%llx\n", + printk(KERN_WARNING "Can't clear EOI MSR value: 0x%llx\n", (unsigned long long)vcpu->arch.pv_eoi.msr_val); return; } @@ -679,9 +687,6 @@ static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) else ppr = isrv & 0xf0; - 
apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", - apic, ppr, isr, isrv); - *new_ppr = ppr; if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); @@ -758,8 +763,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) return ((logical_id >> 4) == (mda >> 4)) && (logical_id & mda & 0xf) != 0; default: - apic_debug("Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, kvm_lapic_get_reg(apic, APIC_DFR)); return false; } } @@ -798,10 +801,6 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, struct kvm_lapic *target = vcpu->arch.apic; u32 mda = kvm_apic_mda(vcpu, dest, source, target); - apic_debug("target %p, source %p, dest 0x%x, " - "dest_mode 0x%x, short_hand 0x%x\n", - target, source, dest, dest_mode, short_hand); - ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: @@ -816,8 +815,6 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, case APIC_DEST_ALLBUT: return target != source; default: - apic_debug("kvm: apic: Bad dest shorthand value %x\n", - short_hand); return false; } } @@ -1095,15 +1092,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, smp_wmb(); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); - } else { - apic_debug("Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); } break; case APIC_DM_STARTUP: - apic_debug("SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); result = 1; apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ @@ -1221,14 +1213,6 @@ static void apic_send_ipi(struct kvm_lapic *apic) trace_kvm_apic_ipi(icr_low, irq.dest_id); - apic_debug("icr_high 0x%x, icr_low 0x%x, " - "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " - "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, " - "msi_redir_hint 0x%x\n", - icr_high, icr_low, irq.shorthand, irq.dest_id, - irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, - irq.vector, irq.msi_redir_hint); - 
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } @@ -1282,7 +1266,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) switch (offset) { case APIC_ARBPRI: - apic_debug("Access APIC ARBPRI register which is for P6\n"); break; case APIC_TMCCT: /* Timer CCR */ @@ -1349,11 +1332,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); - if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) { - apic_debug("KVM_APIC_READ: read reserved register %x\n", - offset); + if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) return 1; - } result = __apic_read(apic, offset & ~0xf); @@ -1411,9 +1391,6 @@ static void update_divide_count(struct kvm_lapic *apic) tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); - - apic_debug("timer divide count is 0x%x\n", - apic->divide_count); } static void limit_periodic_timer_frequency(struct kvm_lapic *apic) @@ -1455,29 +1432,6 @@ static void apic_update_lvtt(struct kvm_lapic *apic) } } -static void apic_timer_expired(struct kvm_lapic *apic) -{ - struct kvm_vcpu *vcpu = apic->vcpu; - struct swait_queue_head *q = &vcpu->wq; - struct kvm_timer *ktimer = &apic->lapic_timer; - - if (atomic_read(&apic->lapic_timer.pending)) - return; - - atomic_inc(&apic->lapic_timer.pending); - kvm_set_pending_timer(vcpu); - - /* - * For x86, the atomic_inc() is serialized, thus - * using swait_active() is safe. - */ - if (swait_active(q)) - swake_up_one(q); - - if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) - ktimer->expired_tscdeadline = ktimer->tscdeadline; -} - /* * On APICv, this test will cause a busy wait * during a higher-priority task. 
@@ -1551,7 +1505,7 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, apic->lapic_timer.timer_advance_ns = timer_advance_ns; } -void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) +static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; @@ -1559,9 +1513,6 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) if (apic->lapic_timer.expired_tscdeadline == 0) return; - if (!lapic_timer_int_injected(vcpu)) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1573,8 +1524,57 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } + +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) +{ + if (lapic_timer_int_injected(vcpu)) + __kvm_wait_lapic_expire(vcpu); +} EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); +static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) +{ + struct kvm_timer *ktimer = &apic->lapic_timer; + + kvm_apic_local_deliver(apic, APIC_LVTT); + if (apic_lvtt_tscdeadline(apic)) + ktimer->tscdeadline = 0; + if (apic_lvtt_oneshot(apic)) { + ktimer->tscdeadline = 0; + ktimer->target_expiration = 0; + } +} + +static void apic_timer_expired(struct kvm_lapic *apic) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + struct swait_queue_head *q = &vcpu->wq; + struct kvm_timer *ktimer = &apic->lapic_timer; + + if (atomic_read(&apic->lapic_timer.pending)) + return; + + if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) + ktimer->expired_tscdeadline = ktimer->tscdeadline; + + if (kvm_use_posted_timer_interrupt(apic->vcpu)) { + if (apic->lapic_timer.timer_advance_ns) + __kvm_wait_lapic_expire(vcpu); + kvm_apic_inject_pending_timer_irqs(apic); + return; + } + + atomic_inc(&apic->lapic_timer.pending); + kvm_set_pending_timer(vcpu); + + /* 
+ * For x86, the atomic_inc() is serialized, thus + * using swait_active() is safe. + */ + if (swait_active(q)) + swake_up_one(q); +} + static void start_sw_tscdeadline(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; @@ -1601,7 +1601,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); - hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS); } else apic_timer_expired(apic); @@ -1648,16 +1648,6 @@ static bool set_target_expiration(struct kvm_lapic *apic) limit_periodic_timer_frequency(apic); - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __func__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_lapic_get_reg(apic, APIC_TMICT), - apic->lapic_timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->lapic_timer.period))); - apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); @@ -1703,7 +1693,7 @@ static void start_sw_period(struct kvm_lapic *apic) hrtimer_start(&apic->lapic_timer.timer, apic->lapic_timer.target_expiration, - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS); } bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) @@ -1860,8 +1850,6 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) { apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode; if (lvt0_in_nmi_mode) { - apic_debug("Receive NMI setting on APIC_LVT0 " - "for cpu %d\n", apic->vcpu->vcpu_id); atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); } else atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); @@ -1975,8 +1963,6 @@ int 
kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: { uint32_t old_divisor = apic->divide_count; - if (val & 4) - apic_debug("KVM_WRITE:TDCR %x\n", val); kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); if (apic->divide_count != old_divisor && @@ -1988,10 +1974,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } case APIC_ESR: - if (apic_x2apic_mode(apic) && val != 0) { - apic_debug("KVM_WRITE:ESR not zero %x\n", val); + if (apic_x2apic_mode(apic) && val != 0) ret = 1; - } break; case APIC_SELF_IPI: @@ -2004,8 +1988,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) ret = 1; break; } - if (ret) - apic_debug("Local APIC Write to read-only register %x\n", reg); + return ret; } EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); @@ -2033,20 +2016,12 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, * 32/64/128 bits registers must be accessed thru 32 bits. * Refer SDM 8.4.1 */ - if (len != 4 || (offset & 0xf)) { - /* Don't shout loud, $infamous_os would cause only noise. 
*/ - apic_debug("apic write: bad size=%d %lx\n", len, (long)address); + if (len != 4 || (offset & 0xf)) return 0; - } val = *(u32*)data; - /* too common printing */ - if (offset != APIC_EOI) - apic_debug("%s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __func__, offset, len, val); - - kvm_lapic_reg_write(apic, offset, val); + kvm_lapic_reg_write(apic, offset & 0xff0, val); return 0; } @@ -2178,11 +2153,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((value & MSR_IA32_APICBASE_ENABLE) && apic->base_address != APIC_DEFAULT_PHYS_BASE) pr_warn_once("APIC base relocation is unsupported by KVM"); - - /* with FSB delivery interrupt, we can restart APIC functionality */ - apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " - "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); - } void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -2193,8 +2163,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic) return; - apic_debug("%s\n", __func__); - /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); @@ -2247,11 +2215,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - - apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" - "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_lapic_get_reg(apic, APIC_ID), - vcpu->arch.apic_base, apic->base_address); } /* @@ -2323,7 +2286,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) struct kvm_lapic *apic; ASSERT(vcpu != NULL); - apic_debug("apic_init %d\n", vcpu->vcpu_id); apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) @@ -2340,7 +2302,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; if 
(timer_advance_ns == -1) { apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; @@ -2397,13 +2359,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; if (atomic_read(&apic->lapic_timer.pending) > 0) { - kvm_apic_local_deliver(apic, APIC_LVTT); - if (apic_lvtt_tscdeadline(apic)) - apic->lapic_timer.tscdeadline = 0; - if (apic_lvtt_oneshot(apic)) { - apic->lapic_timer.tscdeadline = 0; - apic->lapic_timer.target_expiration = 0; - } + kvm_apic_inject_pending_timer_irqs(apic); atomic_set(&apic->lapic_timer.pending, 0); } } @@ -2525,12 +2481,13 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct hrtimer *timer; - if (!lapic_in_kernel(vcpu)) + if (!lapic_in_kernel(vcpu) || + kvm_can_post_timer_interrupt(vcpu)) return; timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* @@ -2678,11 +2635,8 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_DFR || reg == APIC_ICR2) { - apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n", - reg); + if (reg == APIC_DFR || reg == APIC_ICR2) return 1; - } if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; @@ -2780,8 +2734,6 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - apic_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, sipi_vector); kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 36747174e4a8..50053d2b8b7b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -236,6 +236,7 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct 
kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); +bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu); static inline enum lapic_mode kvm_apic_mode(u64 apic_base) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9a5814d8d194..8f72526e2f68 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4597,11 +4597,11 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, */ /* Faults from writes to non-writable pages */ - u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0; + u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; /* Faults from user mode accesses to supervisor pages */ - u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0; + u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; /* Faults from fetches of non-executable pages*/ - u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0; + u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; /* Faults from kernel mode fetches of user pages */ u8 smepf = 0; /* Faults from kernel mode accesses of user pages */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index aa5a2597305a..46875bbd0419 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -19,8 +19,8 @@ #include "lapic.h" #include "pmu.h" -/* This keeps the total size of the filter under 4k. */ -#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 63 +/* This is enough to filter the vast majority of currently defined events. */ +#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; @@ -131,8 +131,8 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, intr ? 
kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { - printk_once("kvm_pmu: event creation failed %ld\n", - PTR_ERR(event)); + pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", + PTR_ERR(event), pmc->idx); return; } @@ -206,12 +206,24 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { unsigned en_field = ctrl & 0x3; bool pmi = ctrl & 0x8; + struct kvm_pmu_event_filter *filter; + struct kvm *kvm = pmc->vcpu->kvm; pmc_stop_counter(pmc); if (!en_field || !pmc_is_enabled(pmc)) return; + filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); + if (filter) { + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + return; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + return; + } + pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, kvm_x86_ops->pmu_ops->find_fixed_event(idx), !(en_field & 0x2), /* exclude user */ @@ -385,6 +397,9 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) tmp.action != KVM_PMU_EVENT_DENY) return -EINVAL; + if (tmp.flags != 0) + return -EINVAL; + if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) return -E2BIG; @@ -406,8 +421,8 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) mutex_unlock(&kvm->lock); synchronize_srcu_expedited(&kvm->srcu); - r = 0; + r = 0; cleanup: kfree(filter); - return r; + return r; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 583b9fa656f3..19f69df96758 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7128,13 +7128,41 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) { - bool is_user, smap; - - is_user = svm_get_cpl(vcpu) == 3; - smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); + unsigned long cr4 = kvm_read_cr4(vcpu); + bool smep = cr4 & X86_CR4_SMEP; + bool smap = cr4 & X86_CR4_SMAP; + 
bool is_user = svm_get_cpl(vcpu) == 3; /* - * Detect and workaround Errata 1096 Fam_17h_00_0Fh + * Detect and workaround Errata 1096 Fam_17h_00_0Fh. + * + * Errata: + * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is + * possible that CPU microcode implementing DecodeAssist will fail + * to read bytes of instruction which caused #NPF. In this case, + * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly + * return 0 instead of the correct guest instruction bytes. + * + * This happens because CPU microcode reading instruction bytes + * uses a special opcode which attempts to read data using CPL=0 + * priviledges. The microcode reads CS:RIP and if it hits a SMAP + * fault, it gives up and returns no instruction bytes. + * + * Detection: + * We reach here in case CPU supports DecodeAssist, raised #NPF and + * returned 0 in GuestIntrBytes field of the VMCB. + * First, errata can only be triggered in case vCPU CR4.SMAP=1. + * Second, if vCPU CR4.SMEP=1, errata could only be triggered + * in case vCPU CPL==3 (Because otherwise guest would have triggered + * a SMEP fault instead of #NPF). + * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL. + * As most guests enable SMAP if they have also enabled SMEP, use above + * logic in order to attempt minimize false-positive of detecting errata + * while still preserving all cases semantic correctness. + * + * Workaround: + * To determine what instruction the guest was executing, the hypervisor + * will have to decode the instruction at the instruction pointer. * * In non SEV guest, hypervisor will be able to read the guest * memory to decode the instruction pointer when insn_len is zero @@ -7145,11 +7173,11 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) * instruction pointer so we will not able to workaround it. Lets * print the error and request to kill the guest. 
*/ - if (is_user && smap) { + if (smap && (!smep || is_user)) { if (!sev_guest(vcpu->kvm)) return true; - pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n"); + pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bb509c254939..0f1378789bd0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -194,6 +194,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmx->nested.need_vmcs12_to_shadow_sync = false; } static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) @@ -1341,6 +1342,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long val; int i; + if (WARN_ON(!shadow_vmcs)) + return; + preempt_disable(); vmcs_load(shadow_vmcs); @@ -1373,6 +1377,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) unsigned long val; int i, q; + if (WARN_ON(!shadow_vmcs)) + return; + vmcs_load(shadow_vmcs); for (q = 0; q < ARRAY_SIZE(fields); q++) { @@ -4194,7 +4201,10 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * mode, e.g. a 32-bit address size can yield a 64-bit virtual * address when using FS/GS with a non-zero base. */ - *ret = s.base + off; + if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) + *ret = s.base + off; + else + *ret = off; /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. 
This is the only check on the memory @@ -4433,7 +4443,6 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); - vmx->nested.need_vmcs12_to_shadow_sync = false; vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 68d231d49c7a..4dea0e0e7e39 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -337,17 +337,22 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) static void intel_pmu_reset(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; int i; for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; + pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); pmc->counter = pmc->eventsel = 0; } - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) - pmc_stop_counter(&pmu->fixed_counters[i]); + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + pmc = &pmu->fixed_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = 0; + } pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = pmu->global_ovf_ctrl = 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 69536553446d..a279447eb75b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5829,6 +5829,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (unlikely(vmx->fail)) { + dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); @@ -7064,7 +7065,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; - if (kvm_mwait_in_guest(vcpu->kvm)) + if (kvm_mwait_in_guest(vcpu->kvm) || + kvm_can_post_timer_interrupt(vcpu)) return -EOPNOTSUPP; vmx = to_vmx(vcpu); @@ -7453,7 
+7455,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) { - return 0; + return false; } static __init int hardware_setup(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4a0b74ecd1de..58305cf81182 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -51,6 +51,7 @@ #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> +#include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> #include <trace/events/kvm.h> @@ -153,6 +154,9 @@ EXPORT_SYMBOL_GPL(enable_vmware_backdoor); static bool __read_mostly force_emulation_prefix = false; module_param(force_emulation_prefix, bool, S_IRUGO); +int __read_mostly pi_inject_timer = -1; +module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1456,12 +1460,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) void kvm_set_pending_timer(struct kvm_vcpu *vcpu) { - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. This function is only called from - * the physical CPU that is running vcpu. 
- */ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); } static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) @@ -1540,9 +1540,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); - - pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", - __func__, base_hz, scaled_hz, shift, *pmultiplier); } #ifdef CONFIG_X86_64 @@ -1785,12 +1782,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!kvm_check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; - pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; offset = kvm_compute_tsc_offset(vcpu, data); - pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); @@ -1809,8 +1804,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; - pr_debug("kvm: new tsc generation %llu, clock %llu\n", - kvm->arch.cur_tsc_generation, data); } /* @@ -6911,7 +6904,6 @@ static void kvm_timer_init(void) cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", kvmclock_cpu_online, kvmclock_cpu_down_prep); @@ -7070,6 +7062,8 @@ int kvm_arch_init(void *opaque) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); + if (pi_inject_timer == -1) + pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e08a12892e8b..6594020c0691 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -301,6 +301,8 
@@ extern unsigned int min_timer_period_us; extern bool enable_vmware_backdoor; +extern int pi_inject_timer; + extern struct static_key kvm_no_apic_vcpu; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) |