summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-12 14:34:06 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-12 14:34:06 -0800
commit3370b69eb0c1f6a05f9051e8fc3e8768461a80f7 (patch)
tree722a75d9d3cc605bfd513752054dea1df806385d /arch/x86/kvm
parentbe23c9d20b341a58ad7107f9e9aa5735cea3da13 (diff)
parent370289756beccdc2a6ef9c7998fe67a7ca68bfc2 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull second batch of kvm updates from Paolo Bonzini: "Four changes: - x86: work around two nasty cases where a benign exception occurs while another is being delivered. The endless stream of exceptions causes an infinite loop in the processor, which not even NMIs or SMIs can interrupt; in the virt case, there is no possibility to exit to the host either. - x86: support for Skylake per-guest TSC rate. Long supported by AMD, the patches mostly move things from there to common arch/x86/kvm/ code. - generic: remove local_irq_save/restore from the guest entry and exit paths when context tracking is enabled. The patches are a few months old, but we discussed them again at kernel summit. Andy will pick up from here and, in 4.5, try to remove it from the user entry/exit paths. - PPC: Two bug fixes, see merge commit 370289756becc for details" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (21 commits) KVM: x86: rename update_db_bp_intercept to update_bp_intercept KVM: svm: unconditionally intercept #DB KVM: x86: work around infinite loop in microcode when #AC is delivered context_tracking: avoid irq_save/irq_restore on guest entry and exit context_tracking: remove duplicate enabled check KVM: VMX: Dump TSC multiplier in dump_vmcs() KVM: VMX: Use a scaled host TSC for guest readings of MSR_IA32_TSC KVM: VMX: Setup TSC scaling ratio when a vcpu is loaded KVM: VMX: Enable and initialize VMX TSC scaling KVM: x86: Use the correct vcpu's TSC rate to compute time scale KVM: x86: Move TSC scaling logic out of call-back read_l1_tsc() KVM: x86: Move TSC scaling logic out of call-back adjust_tsc_offset() KVM: x86: Replace call-back compute_tsc_offset() with a common function KVM: x86: Replace call-back set_tsc_khz() with a common function KVM: x86: Add a common TSC scaling function KVM: x86: Add a common TSC scaling ratio field in kvm_vcpu_arch KVM: x86: Collect information for setting TSC scaling ratio KVM: x86: declare a few variables as __read_mostly KVM: x86: merge handle_mmio_page_fault and handle_mmio_page_fault_common KVM: PPC: Book3S HV: Don't dynamically split core when already split ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/lapic.c4
-rw-r--r--arch/x86/kvm/mmu.c20
-rw-r--r--arch/x86/kvm/mmu.h6
-rw-r--r--arch/x86/kvm/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/svm.c140
-rw-r--r--arch/x86/kvm/vmx.c71
-rw-r--r--arch/x86/kvm/x86.c159
7 files changed, 209 insertions, 194 deletions
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ecd4ea1d28a8..4d30b865be30 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1250,7 +1250,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
/* __delay is delay_tsc whenever the hardware has TSC, thus always. */
@@ -1318,7 +1318,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
local_irq_save(flags);
now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7d85bcae3332..e7c2c1428a69 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3359,7 +3359,7 @@ exit:
return reserved;
}
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
@@ -3368,7 +3368,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return RET_MMIO_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
- if (unlikely(reserved))
+ if (WARN_ON(reserved))
return RET_MMIO_PF_BUG;
if (is_mmio_spte(spte)) {
@@ -3392,17 +3392,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
*/
return RET_MMIO_PF_RETRY;
}
-EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
-
-static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
- u32 error_code, bool direct)
-{
- int ret;
-
- ret = handle_mmio_page_fault_common(vcpu, addr, direct);
- WARN_ON(ret == RET_MMIO_PF_BUG);
- return ret;
-}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
@@ -3413,7 +3403,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+ r = handle_mmio_page_fault(vcpu, gva, true);
if (likely(r != RET_MMIO_PF_INVALID))
return r;
@@ -3503,7 +3493,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+ r = handle_mmio_page_fault(vcpu, gpa, true);
if (likely(r != RET_MMIO_PF_INVALID))
return r;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e4202e41d535..55ffb7b0f95e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -56,13 +56,13 @@ void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
/*
- * Return values of handle_mmio_page_fault_common:
+ * Return values of handle_mmio_page_fault:
* RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
* directly.
* RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
* fault path update the mmio spte.
* RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: bug is detected.
+ * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
*/
enum {
RET_MMIO_PF_EMULATE = 1,
@@ -71,7 +71,7 @@ enum {
RET_MMIO_PF_BUG = -1
};
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
+int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b41faa91a6f9..3058a22a658d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -705,8 +705,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, addr, error_code,
- mmu_is_nested(vcpu));
+ r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
if (likely(r != RET_MMIO_PF_INVALID))
return r;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f2c8e4917688..83a1c643f9a5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -158,8 +158,6 @@ struct vcpu_svm {
unsigned long int3_rip;
u32 apf_reason;
- u64 tsc_ratio;
-
/* cached guest cpuid flags for faster access */
bool nrips_enabled : 1;
};
@@ -214,7 +212,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
-static u64 __scale_tsc(u64 ratio, u64 tsc);
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -894,20 +891,9 @@ static __init int svm_hardware_setup(void)
kvm_enable_efer_bits(EFER_FFXSR);
if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- u64 max;
-
kvm_has_tsc_control = true;
-
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated needed because it will always
- * be 1 on all machines and a value of 0 is used to disable
- * tsc-scaling for the vcpu.
- */
- max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
-
- kvm_max_guest_tsc_khz = max;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
}
if (nested) {
@@ -971,68 +957,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static u64 __scale_tsc(u64 ratio, u64 tsc)
-{
- u64 mult, frac, _tsc;
-
- mult = ratio >> 32;
- frac = ratio & ((1ULL << 32) - 1);
-
- _tsc = tsc;
- _tsc *= mult;
- _tsc += (tsc >> 32) * frac;
- _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
-
- return _tsc;
-}
-
-static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 _tsc = tsc;
-
- if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
- _tsc = __scale_tsc(svm->tsc_ratio, tsc);
-
- return _tsc;
-}
-
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 ratio;
- u64 khz;
-
- /* Guest TSC same frequency as host TSC? */
- if (!scale) {
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
- return;
- }
-
- /* TSC scaling supported? */
- if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
- return;
- }
-
- khz = user_tsc_khz;
-
- /* TSC scaling required - calculate ratio */
- ratio = khz << 32;
- do_div(ratio, tsc_khz);
-
- if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
- WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
- user_tsc_khz);
- return;
- }
- svm->tsc_ratio = ratio;
-}
-
static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1059,16 +983,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
+static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (host) {
- if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
- WARN_ON(adjustment < 0);
- adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
- }
-
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1080,15 +998,6 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- u64 tsc;
-
- tsc = svm_scale_tsc(vcpu, rdtsc());
-
- return target_tsc - tsc;
-}
-
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -1110,6 +1019,8 @@ static void init_vmcb(struct vcpu_svm *svm)
set_exception_intercept(svm, PF_VECTOR);
set_exception_intercept(svm, UD_VECTOR);
set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
set_intercept(svm, INTERCEPT_INTR);
set_intercept(svm, INTERCEPT_NMI);
@@ -1235,8 +1146,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto out;
}
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
-
err = kvm_vcpu_init(&svm->vcpu, kvm, id);
if (err)
goto free_svm;
@@ -1322,10 +1231,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
- svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
- __this_cpu_write(current_tsc_ratio, svm->tsc_ratio);
- wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
+ if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
+ __this_cpu_write(current_tsc_ratio, tsc_ratio);
+ wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
+ }
}
}
@@ -1644,20 +1555,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
mark_dirty(svm->vmcb, VMCB_SEG);
}
-static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
+static void update_bp_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- clr_exception_intercept(svm, DB_VECTOR);
clr_exception_intercept(svm, BP_VECTOR);
- if (svm->nmi_singlestep)
- set_exception_intercept(svm, DB_VECTOR);
-
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
} else
@@ -1763,7 +1667,6 @@ static int db_interception(struct vcpu_svm *svm)
if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
svm->vmcb->save.rflags &=
~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_bp_intercept(&svm->vcpu);
}
if (svm->vcpu.guest_debug &
@@ -1798,6 +1701,12 @@ static int ud_interception(struct vcpu_svm *svm)
return 1;
}
+static int ac_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3075,8 +2984,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
- return vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, host_tsc);
+ return vmcb->control.tsc_offset + host_tsc;
}
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3086,7 +2994,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_IA32_TSC: {
msr_info->data = svm->vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, rdtsc());
+ kvm_scale_tsc(vcpu, rdtsc());
break;
}
@@ -3362,6 +3270,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
@@ -3745,7 +3654,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
*/
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_bp_intercept(vcpu);
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4371,7 +4279,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
- .update_db_bp_intercept = update_db_bp_intercept,
+ .update_bp_intercept = update_bp_intercept,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,
@@ -4443,11 +4351,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
- .set_tsc_khz = svm_set_tsc_khz,
.read_tsc_offset = svm_read_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
- .adjust_tsc_offset = svm_adjust_tsc_offset,
- .compute_tsc_offset = svm_compute_tsc_offset,
+ .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
.read_l1_tsc = svm_read_l1_tsc,
.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5eb56ed77c1f..87acc5221740 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -107,6 +107,8 @@ static u64 __read_mostly host_xss;
static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
+#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -1172,6 +1174,12 @@ static inline bool cpu_has_vmx_pml(void)
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_TSC_SCALING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -1631,7 +1639,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
- (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -2053,6 +2061,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+ /* Setup TSC multiplier */
+ if (cpu_has_vmx_tsc_scaling())
+ vmcs_write64(TSC_MULTIPLIER,
+ vcpu->arch.tsc_scaling_ratio);
+
vmx->loaded_vmcs->cpu = cpu;
}
@@ -2357,15 +2371,16 @@ static void setup_msrs(struct vcpu_vmx *vmx)
/*
* reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset -- 21.3
+ * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
+ * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
*/
-static u64 guest_read_tsc(void)
+static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
{
u64 host_tsc, tsc_offset;
host_tsc = rdtsc();
tsc_offset = vmcs_read64(TSC_OFFSET);
- return host_tsc + tsc_offset;
+ return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
}
/*
@@ -2382,22 +2397,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
return host_tsc + tsc_offset;
}
-/*
- * Engage any workarounds for mis-matched TSC rates. Currently limited to
- * software catchup for faster rates on slower CPUs.
- */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- if (!scale)
- return;
-
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
-}
-
static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
{
return vmcs_read64(TSC_OFFSET);
@@ -2429,7 +2428,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
}
}
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
+static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
{
u64 offset = vmcs_read64(TSC_OFFSET);
@@ -2442,11 +2441,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
offset + adjustment);
}
-static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- return target_tsc - rdtsc();
-}
-
static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
@@ -2778,7 +2772,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSC:
- msr_info->data = guest_read_tsc();
+ msr_info->data = guest_read_tsc(vcpu);
break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
@@ -3154,7 +3148,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_PML |
- SECONDARY_EXEC_PCOMMIT;
+ SECONDARY_EXEC_PCOMMIT |
+ SECONDARY_EXEC_TSC_SCALING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -5266,6 +5261,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
+ case AC_VECTOR:
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
if (!(vcpu->guest_debug &
@@ -5908,7 +5906,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
return 1;
}
- ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+ ret = handle_mmio_page_fault(vcpu, gpa, true);
if (likely(ret == RET_MMIO_PF_EMULATE))
return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
EMULATE_DONE;
@@ -6199,6 +6197,12 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (cpu_has_vmx_tsc_scaling()) {
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 48;
+ }
+
if (enable_apicv)
kvm_x86_ops->update_cr8_intercept = NULL;
else {
@@ -8008,6 +8012,9 @@ static void dump_vmcs(void)
vmcs_read32(IDT_VECTORING_INFO_FIELD),
vmcs_read32(IDT_VECTORING_ERROR_CODE));
pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+ pr_err("TSC Multiplier = 0x%016lx\n",
+ vmcs_readl(TSC_MULTIPLIER));
if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
@@ -10752,7 +10759,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_db_bp_intercept = update_exception_bitmap,
+ .update_bp_intercept = update_exception_bitmap,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
@@ -10826,11 +10833,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .set_tsc_khz = vmx_set_tsc_khz,
.read_tsc_offset = vmx_read_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
- .adjust_tsc_offset = vmx_adjust_tsc_offset,
- .compute_tsc_offset = vmx_compute_tsc_offset,
+ .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
.read_l1_tsc = vmx_read_l1_tsc,
.set_tdp_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4a6eff166fc6..00462bd63129 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -93,10 +93,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-struct kvm_x86_ops *kvm_x86_ops;
+struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
-static bool ignore_msrs = 0;
+static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
unsigned int min_timer_period_us = 500;
@@ -105,20 +105,25 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);
-bool kvm_has_tsc_control;
+bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32 kvm_max_guest_tsc_khz;
+u32 __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
+EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
+u64 __read_mostly kvm_max_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
+static u64 __read_mostly kvm_default_tsc_scaling_ratio;
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
-static u32 tsc_tolerance_ppm = 250;
+static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int lapic_timer_advance_ns = 0;
+unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-static bool backwards_tsc_observed = false;
+static bool __read_mostly backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -1249,14 +1254,53 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
return v;
}
-static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
+{
+ u64 ratio;
+
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return 0;
+ }
+
+ /* TSC scaling supported? */
+ if (!kvm_has_tsc_control) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ return 0;
+ } else {
+ WARN(1, "user requested TSC rate below hardware speed\n");
+ return -1;
+ }
+ }
+
+ /* TSC scaling required - calculate ratio */
+ ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+ user_tsc_khz, tsc_khz);
+
+ if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+ WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return -1;
+ }
+
+ vcpu->arch.tsc_scaling_ratio = ratio;
+ return 0;
+}
+
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
/* tsc_khz can be zero if TSC calibration fails */
- if (this_tsc_khz == 0)
- return;
+ if (this_tsc_khz == 0) {
+ /* set tsc_scaling_ratio to a safe value */
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return -1;
+ }
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
@@ -1276,7 +1320,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
- kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+ return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1322,6 +1366,48 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
+/*
+ * Multiply tsc by a fixed point number represented by ratio.
+ *
+ * The most significant 64-N bits (mult) of ratio represent the
+ * integral part of the fixed point number; the remaining N bits
+ * (frac) represent the fractional part, ie. ratio represents a fixed
+ * point number (mult + frac * 2^(-N)).
+ *
+ * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ */
+static inline u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+}
+
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ u64 _tsc = tsc;
+ u64 ratio = vcpu->arch.tsc_scaling_ratio;
+
+ if (ratio != kvm_default_tsc_scaling_ratio)
+ _tsc = __scale_tsc(ratio, tsc);
+
+ return _tsc;
+}
+EXPORT_SYMBOL_GPL(kvm_scale_tsc);
+
+static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc());
+
+ return target_tsc - tsc;
+}
+
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+{
+ return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc));
+}
+EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1333,7 +1419,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1390,7 +1476,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
@@ -1447,6 +1533,20 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
EXPORT_SYMBOL_GPL(kvm_write_tsc);
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
#ifdef CONFIG_X86_64
static cycle_t read_tsc(void)
@@ -1608,7 +1708,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- unsigned long flags, this_tsc_khz;
+ unsigned long flags, this_tsc_khz, tgt_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -1645,7 +1745,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kernel_ns = get_kernel_ns();
}
- tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+ tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
/*
* We may have to catch up the TSC to match elapsed wall clock
@@ -1671,7 +1771,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
return 0;
if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+ tgt_tsc_khz = kvm_has_tsc_control ?
+ vcpu->virtual_tsc_khz : this_tsc_khz;
+ kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = this_tsc_khz;
@@ -2617,7 +2719,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
- u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+ u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
@@ -3319,9 +3421,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- kvm_set_tsc_khz(vcpu, user_tsc_khz);
+ if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
+ r = 0;
- r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
@@ -6452,8 +6554,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
- rdtsc());
+ vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -7015,7 +7116,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops->update_db_bp_intercept(vcpu);
+ kvm_x86_ops->update_bp_intercept(vcpu);
r = 0;
@@ -7364,6 +7465,20 @@ int kvm_arch_hardware_setup(void)
if (r != 0)
return r;
+ if (kvm_has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated needed because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
+ kvm_max_guest_tsc_khz = max;
+
+ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+ }
+
kvm_init_msr_list();
return 0;
}