From 489f1d6526ab68ca1842398fa3ae95c597fe3d32 Mon Sep 17 00:00:00 2001
From: "Dong, Eddie" <eddie.dong@intel.com>
Date: Mon, 7 Jan 2008 11:14:20 +0200
Subject: KVM: MMU: Update shadow ptes on partial guest pte writes

A guest partial guest pte write will leave shadow_trap_nonpresent_pte
in spte, which generates a vmexit at the next guest access through that pte.

This patch improves this by reading the full guest pte in advance and thus
being able to update the spte and eliminate the vmexit.

This helps pae guests which use two 32-bit writes to set a single 64-bit pte.

[truncation fix by Eric]

Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Feng (Eric) Liu <eric.e.liu@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 23 ++++++++++++++++-------
 arch/x86/kvm/paging_tmpl.h |  7 ++-----
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e55af12e11b7..28f9a44060cc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1329,8 +1329,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp,
 				  u64 *spte,
-				  const void *new, int bytes,
-				  int offset_in_pte)
+				  const void *new)
 {
 	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
 		++vcpu->kvm->stat.mmu_pde_zapped;
@@ -1339,9 +1338,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 
 	++vcpu->kvm->stat.mmu_pte_updated;
 	if (sp->role.glevels == PT32_ROOT_LEVEL)
-		paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+		paging32_update_pte(vcpu, sp, spte, new);
 	else
-		paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+		paging64_update_pte(vcpu, sp, spte, new);
 }
 
 static bool need_remote_flush(u64 old, u64 new)
@@ -1423,7 +1422,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	struct hlist_node *node, *n;
 	struct hlist_head *bucket;
 	unsigned index;
-	u64 entry;
+	u64 entry, gentry;
 	u64 *spte;
 	unsigned offset = offset_in_page(gpa);
 	unsigned pte_size;
@@ -1433,6 +1432,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int level;
 	int flooded = 0;
 	int npte;
+	int r;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
 	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
@@ -1496,11 +1496,20 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 				continue;
 		}
 		spte = &sp->spt[page_offset / sizeof(*spte)];
+		if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
+			gentry = 0;
+			r = kvm_read_guest_atomic(vcpu->kvm,
+						  gpa & ~(u64)(pte_size - 1),
+						  &gentry, pte_size);
+			new = (const void *)&gentry;
+			if (r < 0)
+				new = NULL;
+		}
 		while (npte--) {
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
-			mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
-					      page_offset & (pte_size - 1));
+			if (new)
+				mmu_pte_write_new_pte(vcpu, sp, spte, new);
 			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
 			++spte;
 		}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ecc0856268c4..c2fd2b96144f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -243,8 +243,7 @@ err:
 }
 
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
-			      u64 *spte, const void *pte, int bytes,
-			      int offset_in_pte)
+			      u64 *spte, const void *pte)
 {
 	pt_element_t gpte;
 	unsigned pte_access;
@@ -252,12 +251,10 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-		if (!offset_in_pte && !is_present_pte(gpte))
+		if (!is_present_pte(gpte))
 			set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
 		return;
 	}
-	if (bytes < sizeof(pt_element_t))
-		return;
 	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
 	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
 	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
-- 
cgit v1.2.3


From 1ae0a13def678876b9acfb5ac1e2cf7d5d45a60d Mon Sep 17 00:00:00 2001
From: "Dong, Eddie" <eddie.dong@intel.com>
Date: Mon, 7 Jan 2008 13:20:25 +0200
Subject: KVM: MMU: Simplify hash table indexing

Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 28f9a44060cc..6f8392d4034e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -559,7 +559,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-	return gfn;
+	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 }
 
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -663,7 +663,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	struct hlist_node *node;
 
 	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
-	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry(sp, node, bucket, hash_link)
 		if (sp->gfn == gfn && !sp->role.metaphysical) {
@@ -701,7 +701,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	}
 	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
 		 gfn, role.word);
-	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry(sp, node, bucket, hash_link)
 		if (sp->gfn == gfn && sp->role.word == role.word) {
@@ -840,7 +840,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
 	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
 	r = 0;
-	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
 		if (sp->gfn == gfn && !sp->role.metaphysical) {
@@ -1450,7 +1450,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		vcpu->arch.last_pt_write_count = 1;
 		vcpu->arch.last_pte_updated = NULL;
 	}
-	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
 		if (sp->gfn != gfn || sp->role.metaphysical)
-- 
cgit v1.2.3


From e09d082c03e137015bc0a17ca77e4b9dca08a5d7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 18 Jan 2008 12:38:59 +0200
Subject: KVM: x86 emulator: add support for group decoding

Certain x86 instructions use bits 3:5 of the byte following the opcode as an
opcode extension, with the decode sometimes depending on bits 6:7 as well.
Add support for this in the main decoding table rather than an ad-hock
adaptation per opcode.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 79586003397a..46ecf349a116 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -65,6 +65,9 @@
 #define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
 #define String      (1<<10)     /* String instruction (rep capable) */
 #define Stack       (1<<11)     /* Stack instruction (push/pop) */
+#define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
+#define GroupMask   0xff        /* Group number stored in bits 0:7 */
 
 static u16 opcode_table[256] = {
 	/* 0x00 - 0x07 */
@@ -229,6 +232,12 @@ static u16 twobyte_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
+static u16 group_table[] = {
+};
+
+static u16 group2_table[] = {
+};
+
 /* EFLAGS bit definitions. */
 #define EFLG_OF (1<<11)
 #define EFLG_DF (1<<10)
@@ -763,7 +772,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	struct decode_cache *c = &ctxt->decode;
 	int rc = 0;
 	int mode = ctxt->mode;
-	int def_op_bytes, def_ad_bytes;
+	int def_op_bytes, def_ad_bytes, group;
 
 	/* Shadow copy of register state. Committed on successful emulation. */
 
@@ -864,12 +873,24 @@ done_prefixes:
 			c->b = insn_fetch(u8, 1, c->eip);
 			c->d = twobyte_table[c->b];
 		}
+	}
 
-		/* Unrecognised? */
-		if (c->d == 0) {
-			DPRINTF("Cannot emulate %02x\n", c->b);
-			return -1;
-		}
+	if (c->d & Group) {
+		group = c->d & GroupMask;
+		c->modrm = insn_fetch(u8, 1, c->eip);
+		--c->eip;
+
+		group = (group << 3) + ((c->modrm >> 3) & 7);
+		if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
+			c->d = group2_table[group];
+		else
+			c->d = group_table[group];
+	}
+
+	/* Unrecognised? */
+	if (c->d == 0) {
+		DPRINTF("Cannot emulate %02x\n", c->b);
+		return -1;
 	}
 
 	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
-- 
cgit v1.2.3


From 43bb19cd3398d3f544d8e2d6ed6c5c5d7b4e5819 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 18 Jan 2008 12:46:50 +0200
Subject: KVM: x86 emulator: group decoding for group 1A

This adds group decode support for opcode 0x8f.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 46ecf349a116..cf1ce7c316a9 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -69,6 +69,10 @@
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
 
+enum {
+	Group1A,
+};
+
 static u16 opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
@@ -133,7 +137,7 @@ static u16 opcode_table[256] = {
 	/* 0x88 - 0x8F */
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+	0, ModRM | DstReg, 0, Group | Group1A,
 	/* 0x90 - 0x9F */
 	0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
@@ -233,6 +237,8 @@ static u16 twobyte_table[256] = {
 };
 
 static u16 group_table[] = {
+	[Group1A*8] =
+	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
 };
 
 static u16 group2_table[] = {
-- 
cgit v1.2.3


From 7d858a19efe5844a98e060931570359b70dea6d1 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 18 Jan 2008 12:58:04 +0200
Subject: KVM: x86 emulator: Group decoding for group 3

This adds group decoding support for opcodes 0xf6, 0xf7 (group 3).

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index cf1ce7c316a9..52e65ae37f06 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -70,7 +70,7 @@
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
 
 enum {
-	Group1A,
+	Group1A, Group3_Byte, Group3,
 };
 
 static u16 opcode_table[256] = {
@@ -171,8 +171,7 @@ static u16 opcode_table[256] = {
 	0, 0, 0, 0,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
-	ImplicitOps, ImplicitOps,
-	ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+	ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
 	/* 0xF8 - 0xFF */
 	ImplicitOps, 0, ImplicitOps, ImplicitOps,
 	0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
@@ -239,6 +238,14 @@ static u16 twobyte_table[256] = {
 static u16 group_table[] = {
 	[Group1A*8] =
 	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
+	[Group3_Byte*8] =
+	ByteOp | SrcImm | DstMem | ModRM, 0,
+	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+	0, 0, 0, 0,
+	[Group3*8] =
+	DstMem | SrcImm | ModRM | SrcImm, 0,
+	DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+	0, 0, 0, 0,
 };
 
 static u16 group2_table[] = {
@@ -1070,26 +1077,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
-		/*
-		 * Special case in Grp3: test has an immediate
-		 * source operand.
-		 */
-		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		if (c->src.bytes == 8)
-			c->src.bytes = 4;
-		switch (c->src.bytes) {
-		case 1:
-			c->src.val = insn_fetch(s8, 1, c->eip);
-			break;
-		case 2:
-			c->src.val = insn_fetch(s16, 2, c->eip);
-			break;
-		case 4:
-			c->src.val = insn_fetch(s32, 4, c->eip);
-			break;
-		}
 		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
 		break;
 	case 2:	/* not */
@@ -1103,7 +1090,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 		rc = X86EMUL_UNHANDLEABLE;
 		break;
 	}
-done:
 	return rc;
 }
 
-- 
cgit v1.2.3


From fd60754e4ffa992586346dd56451723b4c096626 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 18 Jan 2008 13:12:26 +0200
Subject: KVM: x86 emulator: Group decoding for groups 4 and 5

Add group decoding support for opcode 0xfe (group 4) and 0xff (group 5).

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 40 ++++++++++------------------------------
 1 file changed, 10 insertions(+), 30 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 52e65ae37f06..7310368c4857 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -70,7 +70,7 @@
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
 
 enum {
-	Group1A, Group3_Byte, Group3,
+	Group1A, Group3_Byte, Group3, Group4, Group5,
 };
 
 static u16 opcode_table[256] = {
@@ -174,7 +174,7 @@ static u16 opcode_table[256] = {
 	ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
 	/* 0xF8 - 0xFF */
 	ImplicitOps, 0, ImplicitOps, ImplicitOps,
-	0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
+	0, 0, Group | Group4, Group | Group5,
 };
 
 static u16 twobyte_table[256] = {
@@ -246,6 +246,12 @@ static u16 group_table[] = {
 	DstMem | SrcImm | ModRM | SrcImm, 0,
 	DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
+	[Group4*8] =
+	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+	0, 0, 0, 0, 0, 0,
+	[Group5*8] =
+	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
+	SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
 };
 
 static u16 group2_table[] = {
@@ -1097,7 +1103,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc;
 
 	switch (c->modrm_reg) {
 	case 0:	/* inc */
@@ -1107,36 +1112,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
 	case 4: /* jmp abs */
-		if (c->b == 0xff)
-			c->eip = c->dst.val;
-		else {
-			DPRINTF("Cannot emulate %02x\n", c->b);
-			return X86EMUL_UNHANDLEABLE;
-		}
+		c->eip = c->src.val;
 		break;
 	case 6:	/* push */
-
-		/* 64-bit mode: PUSH always pushes a 64-bit operand. */
-
-		if (ctxt->mode == X86EMUL_MODE_PROT64) {
-			c->dst.bytes = 8;
-			rc = ops->read_std((unsigned long)c->dst.ptr,
-					   &c->dst.val, 8, ctxt->vcpu);
-			if (rc != 0)
-				return rc;
-		}
-		register_address_increment(c->regs[VCPU_REGS_RSP],
-					   -c->dst.bytes);
-		rc = ops->write_emulated(register_address(ctxt->ss_base,
-				    c->regs[VCPU_REGS_RSP]), &c->dst.val,
-				    c->dst.bytes, ctxt->vcpu);
-		if (rc != 0)
-			return rc;
-		c->dst.type = OP_NONE;
+		emulate_push(ctxt);
 		break;
-	default:
-		DPRINTF("Cannot emulate %02x\n", c->b);
-		return X86EMUL_UNHANDLEABLE;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From d95058a1a7170ae2af2939cbdab0ff5d5e005238 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 18 Jan 2008 13:36:50 +0200
Subject: KVM: x86 emulator: add group 7 decoding

This adds group decoding for opcode 0x0f 0x01 (group 7).

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 7310368c4857..ef6de16bb597 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -70,7 +70,7 @@
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
 
 enum {
-	Group1A, Group3_Byte, Group3, Group4, Group5,
+	Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
 };
 
 static u16 opcode_table[256] = {
@@ -179,7 +179,7 @@ static u16 opcode_table[256] = {
 
 static u16 twobyte_table[256] = {
 	/* 0x00 - 0x0F */
-	0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+	0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
 	ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
 	/* 0x10 - 0x1F */
 	0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -252,9 +252,14 @@ static u16 group_table[] = {
 	[Group5*8] =
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
 	SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+	[Group7*8] =
+	0, 0, ModRM | SrcMem, ModRM | SrcMem,
+	SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, SrcMem | ModRM | ByteOp,
 };
 
 static u16 group2_table[] = {
+	[Group7*8] =
+	SrcNone | ModRM, 0, 0, 0, SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, 0,
 };
 
 /* EFLAGS bit definitions. */
-- 
cgit v1.2.3


From 1d6ad2073e5354912291277c606a57fd37330f04 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 23 Jan 2008 22:26:09 +0200
Subject: KVM: x86 emulator: group decoding for group 1 instructions

Opcodes 0x80-0x83

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ef6de16bb597..22900f70172b 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -70,6 +70,7 @@
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
 
 enum {
+	Group1_80, Group1_81, Group1_82, Group1_83,
 	Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
 };
 
@@ -130,8 +131,8 @@ static u16 opcode_table[256] = {
 	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
 	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
 	/* 0x80 - 0x87 */
-	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
-	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+	Group | Group1_80, Group | Group1_81,
+	Group | Group1_82, Group | Group1_83,
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	/* 0x88 - 0x8F */
@@ -236,6 +237,26 @@ static u16 twobyte_table[256] = {
 };
 
 static u16 group_table[] = {
+	[Group1_80*8] =
+	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+	[Group1_81*8] =
+	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+	[Group1_82*8] =
+	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+	[Group1_83*8] =
+	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
 	[Group1A*8] =
 	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
 	[Group3_Byte*8] =
-- 
cgit v1.2.3


From d196e343361c229496adeda42335856da9d057de Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 24 Jan 2008 11:44:11 +0200
Subject: KVM: MMU: Decouple mmio from shadow page tables

Currently an mmio guest pte is encoded in the shadow pagetable as a
not-present trapping pte, with the SHADOW_IO_MARK bit set.  However
nothing is ever done with this information, so maintaining it is a
useless complication.

This patch moves the check for mmio to before shadow ptes are instantiated,
so the shadow code is never invoked for ptes that reference mmio.  The code
is simpler, and with future work, can be made to handle mmio concurrently.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 34 +++++++++++++++-------------------
 arch/x86/kvm/paging_tmpl.h | 17 ++++++++---------
 2 files changed, 23 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f8392d4034e..6651dfadae50 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -101,8 +101,6 @@ static int dbg = 1;
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
 
 #define PT64_LEVEL_BITS 9
@@ -200,7 +198,6 @@ static int is_present_pte(unsigned long pte)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	pte &= ~PT_SHADOW_IO_MARK;
 	return pte != shadow_trap_nonpresent_pte
 		&& pte != shadow_notrap_nonpresent_pte;
 }
@@ -215,11 +212,6 @@ static int is_dirty_pte(unsigned long pte)
 	return pte & PT_DIRTY_MASK;
 }
 
-static int is_io_pte(unsigned long pte)
-{
-	return pte & PT_SHADOW_IO_MARK;
-}
-
 static int is_rmap_pte(u64 pte)
 {
 	return is_shadow_present_pte(pte);
@@ -538,7 +530,7 @@ static int is_empty_shadow_page(u64 *spt)
 	u64 *end;
 
 	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
-		if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
+		if (*pos != shadow_trap_nonpresent_pte) {
 			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
 			       pos, *pos);
 			return 0;
@@ -926,13 +918,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	if (pte_access & ACC_USER_MASK)
 		spte |= PT_USER_MASK;
 
-	if (is_error_page(page)) {
-		set_shadow_pte(shadow_pte,
-			       shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
-		kvm_release_page_clean(page);
-		return;
-	}
-
 	spte |= page_to_phys(page);
 
 	if ((pte_access & ACC_WRITE_MASK)
@@ -1002,7 +987,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write, gfn, page);
-			return pt_write || is_io_pte(table[index]);
+			return pt_write;
 		}
 
 		if (table[index] == shadow_trap_nonpresent_pte) {
@@ -1039,6 +1024,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	page = gfn_to_page(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
+	/* mmio */
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		up_read(&vcpu->kvm->slots_lock);
+		return 1;
+	}
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __nonpaging_map(vcpu, v, write, gfn, page);
@@ -1406,10 +1398,14 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-	down_read(&current->mm->mmap_sem);
+	down_read(&vcpu->kvm->slots_lock);
 	page = gfn_to_page(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
 
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		return;
+	}
 	vcpu->arch.update_pte.gfn = gfn;
 	vcpu->arch.update_pte.page = page;
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c2fd2b96144f..4b55f462e2b3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -399,6 +399,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
+	/* mmio */
+	if (is_error_page(page)) {
+		pgprintk("gfn %x is mmio\n", walker.gfn);
+		kvm_release_page_clean(page);
+		up_read(&vcpu->kvm->slots_lock);
+		return 1;
+	}
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
@@ -409,15 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (!write_pt)
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
-	/*
-	 * mmio: emulate if accessible, otherwise its a guest fault.
-	 */
-	if (shadow_pte && is_io_pte(*shadow_pte)) {
-		spin_unlock(&vcpu->kvm->mmu_lock);
-		up_read(&vcpu->kvm->slots_lock);
-		return 1;
-	}
-
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
 	spin_unlock(&vcpu->kvm->mmu_lock);
-- 
cgit v1.2.3


From 2384d2b32640839a4d4d260ca7c5aa4edbf68d91 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Thu, 17 Jan 2008 15:14:33 +0800
Subject: KVM: VMX: Enable Virtual Processor Identification (VPID)

To allow TLB entries to be retained across VM entry and VM exit, the VMM
can now identify distinct address spaces through a new virtual-processor ID
(VPID) field of the VMCS.

[avi: drop vpid_sync_all()]
[avi: add "cc" to asm constraints]

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/vmx.h |  6 ++++
 2 files changed, 81 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8e1462880d1f..1157e8a4059b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -37,6 +37,9 @@ MODULE_LICENSE("GPL");
 static int bypass_guest_pf = 1;
 module_param(bypass_guest_pf, bool, 0);
 
+static int enable_vpid = 1;
+module_param(enable_vpid, bool, 0);
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -71,6 +74,7 @@ struct vcpu_vmx {
 			unsigned rip;
 		} irq;
 	} rmode;
+	int vpid;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -86,6 +90,9 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static struct page *vmx_io_bitmap_a;
 static struct page *vmx_io_bitmap_b;
 
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
+
 static struct vmcs_config {
 	int size;
 	int order;
@@ -204,6 +211,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 		(irqchip_in_kernel(kvm)));
 }
 
+static inline int cpu_has_vmx_vpid(void)
+{
+	return (vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_VPID);
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -214,6 +227,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 	return -1;
 }
 
+static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+{
+    struct {
+	u64 vpid : 16;
+	u64 rsvd : 48;
+	u64 gva;
+    } operand = { vpid, 0, gva };
+
+    asm volatile (ASM_VMX_INVVPID
+		  /* CF==1 or ZF==1 --> rc = -1 */
+		  "; ja 1f ; ud2 ; 1:"
+		  : : "a"(&operand), "c"(ext) : "cc", "memory");
+}
+
 static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -257,6 +284,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
 	vmx->launched = 0;
 }
 
+static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+{
+	if (vmx->vpid == 0)
+		return;
+
+	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
+
 static unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
@@ -490,6 +525,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (vcpu->cpu != cpu) {
 		vcpu_clear(vmx);
 		kvm_migrate_apic_timer(vcpu);
+		vpid_sync_vcpu_all(vmx);
 	}
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
@@ -971,7 +1007,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
 		min = 0;
 		opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-			SECONDARY_EXEC_WBINVD_EXITING;
+			SECONDARY_EXEC_WBINVD_EXITING |
+			SECONDARY_EXEC_ENABLE_VPID;
 		if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
 			return -EIO;
@@ -1239,6 +1276,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	vpid_sync_vcpu_all(to_vmx(vcpu));
+}
+
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
@@ -1275,6 +1317,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	vmx_flush_tlb(vcpu);
 	vmcs_writel(GUEST_CR3, cr3);
 	if (vcpu->arch.cr0 & X86_CR0_PE)
 		vmx_fpu_deactivate(vcpu);
@@ -1494,6 +1537,22 @@ out:
 	return r;
 }
 
+static void allocate_vpid(struct vcpu_vmx *vmx)
+{
+	int vpid;
+
+	vmx->vpid = 0;
+	if (!enable_vpid || !cpu_has_vmx_vpid())
+		return;
+	spin_lock(&vmx_vpid_lock);
+	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+	if (vpid < VMX_NR_VPIDS) {
+		vmx->vpid = vpid;
+		__set_bit(vpid, vmx_vpid_bitmap);
+	}
+	spin_unlock(&vmx_vpid_lock);
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -1532,6 +1591,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
 			exec_control &=
 				~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		if (vmx->vpid == 0)
+			exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}
 
@@ -1704,6 +1765,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write64(APIC_ACCESS_ADDR,
 			     page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
 
+	if (vmx->vpid != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
 	vmx->vcpu.arch.cr0 = 0x60000010;
 	vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
 	vmx_set_cr4(&vmx->vcpu, 0);
@@ -1713,6 +1777,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx_fpu_activate(&vmx->vcpu);
 	update_exception_bitmap(&vmx->vcpu);
 
+	vpid_sync_vcpu_all(vmx);
+
 	return 0;
 
 out:
@@ -2221,10 +2287,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-}
-
 static void update_tpr_threshold(struct kvm_vcpu *vcpu)
 {
 	int max_irr, tpr;
@@ -2489,6 +2551,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	spin_lock(&vmx_vpid_lock);
+	if (vmx->vpid != 0)
+		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
+	spin_unlock(&vmx_vpid_lock);
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->host_msrs);
 	kfree(vmx->guest_msrs);
@@ -2505,6 +2571,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!vmx)
 		return ERR_PTR(-ENOMEM);
 
+	allocate_vpid(vmx);
+
 	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
 	if (err)
 		goto free_vcpu;
@@ -2652,6 +2720,8 @@ static int __init vmx_init(void)
 	memset(iova, 0xff, PAGE_SIZE);
 	kunmap(vmx_io_bitmap_b);
 
+	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
 	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
 	if (r)
 		goto out1;
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index d52ae8d7303d..436ce0f29092 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -49,6 +49,7 @@
  * Definitions of Secondary Processor-Based VM-Execution Controls.
  */
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
 
 
@@ -65,6 +66,7 @@
 
 /* VMCS Encodings */
 enum vmcs_field {
+	VIRTUAL_PROCESSOR_ID            = 0x00000000,
 	GUEST_ES_SELECTOR               = 0x00000800,
 	GUEST_CS_SELECTOR               = 0x00000802,
 	GUEST_SS_SELECTOR               = 0x00000804,
@@ -321,4 +323,8 @@ enum vmcs_field {
 
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
 
+#define VMX_NR_VPIDS				(1 << 16)
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT		1
+#define VMX_VPID_EXTENT_ALL_CONTEXT		2
+
 #endif
-- 
cgit v1.2.3


From f2b4b7ddf633ffa24ce7c89c9e0d8a06463484e3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 31 Jan 2008 14:57:37 +0100
Subject: KVM: make EFER_RESERVED_BITS configurable for architecture code

This patch give the SVM and VMX implementations the ability to add some bits
the guest can set in its EFER register.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b01552bd1f1..ec9265b354b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -41,7 +41,7 @@
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffff2fe;
 
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -428,7 +428,7 @@ static u32 emulated_msrs[] = {
 
 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-	if (efer & EFER_RESERVED_BITS) {
+	if (efer & efer_reserved_bits) {
 		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 		       efer);
 		kvm_inject_gp(vcpu, 0);
@@ -452,6 +452,13 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 #endif
 
+void kvm_enable_efer_bits(u64 mask)
+{
+       efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
-- 
cgit v1.2.3


From 50a37eb4e05efaa7bac6a948fd4db1a48c728b99 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 31 Jan 2008 14:57:38 +0100
Subject: KVM: align valid EFER bits with the features of the host system

This patch aligns the bits the guest can set in the EFER register with the
features in the host processor. Currently it lets EFER.NX disabled if the
processor does not support it and enables EFER.LME and EFER.LMA only for KVM on
64 bit hosts.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c |  3 +++
 arch/x86/kvm/vmx.c |  4 ++++
 arch/x86/kvm/x86.c | 10 +++++++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1a582f1090e8..ff3bc74af728 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -403,6 +403,9 @@ static __init int svm_hardware_setup(void)
 	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
 	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
 
+	if (boot_cpu_has(X86_FEATURE_NX))
+		kvm_enable_efer_bits(EFER_NX);
+
 	for_each_online_cpu(cpu) {
 		r = svm_cpu_init(cpu);
 		if (r)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1157e8a4059b..a509910f6b53 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1117,6 +1117,10 @@ static __init int hardware_setup(void)
 {
 	if (setup_vmcs_config(&vmcs_config) < 0)
 		return -EIO;
+
+	if (boot_cpu_has(X86_FEATURE_NX))
+		kvm_enable_efer_bits(EFER_NX);
+
 	return alloc_kvm_area();
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ec9265b354b0..db16f2353e4b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -41,7 +41,15 @@
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-static u64 __read_mostly efer_reserved_bits = 0xfffffffffffff2fe;
+/* EFER defaults:
+ * - enable syscall per default because its emulated by KVM
+ * - enable LME and LMA per default on 64 bit KVM
+ */
+#ifdef CONFIG_X86_64
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
+#else
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
+#endif
 
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-- 
cgit v1.2.3


From 9f62e19a1107466b9e9501e23a9dd5acb81fdca1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 31 Jan 2008 14:57:39 +0100
Subject: KVM: VMX: unifdef the EFER specific code

To allow access to the EFER register in 32bit KVM the EFER specific code has to
be exported to the x86 generic code. This patch does this in a backwards
compatible manner.

[avi: add check for EFER-less hosts]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a509910f6b53..76944f2c883b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1335,14 +1335,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	vcpu->arch.cr4 = cr4;
 }
 
-#ifdef CONFIG_X86_64
-
 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
 
 	vcpu->arch.shadow_efer = efer;
+	if (!msr)
+		return;
 	if (efer & EFER_LMA) {
 		vmcs_write32(VM_ENTRY_CONTROLS,
 				     vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1359,8 +1359,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	setup_msrs(vmx);
 }
 
-#endif
-
 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
 	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1775,9 +1773,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx->vcpu.arch.cr0 = 0x60000010;
 	vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
 	vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
 	vmx_set_efer(&vmx->vcpu, 0);
-#endif
 	vmx_fpu_activate(&vmx->vcpu);
 	update_exception_bitmap(&vmx->vcpu);
 
@@ -2668,9 +2664,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_cr0 = vmx_set_cr0,
 	.set_cr3 = vmx_set_cr3,
 	.set_cr4 = vmx_set_cr4,
-#ifdef CONFIG_X86_64
 	.set_efer = vmx_set_efer,
-#endif
 	.get_idt = vmx_get_idt,
 	.set_idt = vmx_set_idt,
 	.get_gdt = vmx_get_gdt,
-- 
cgit v1.2.3


From 9457a712a2f464c4b21bb7f78998775c69673a0c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 31 Jan 2008 14:57:40 +0100
Subject: KVM: allow access to EFER in 32bit KVM

This patch makes the EFER register accessible on a 32bit KVM host. This is
necessary to boot 32 bit PAE guests under SVM.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index db16f2353e4b..38edb2f558ea 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -432,8 +432,6 @@ static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
 };
 
-#ifdef CONFIG_X86_64
-
 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	if (efer & efer_reserved_bits) {
@@ -458,8 +456,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	vcpu->arch.shadow_efer = efer;
 }
 
-#endif
-
 void kvm_enable_efer_bits(u64 mask)
 {
        efer_reserved_bits &= ~mask;
@@ -489,11 +485,9 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
-#ifdef CONFIG_X86_64
 	case MSR_EFER:
 		set_efer(vcpu, data);
 		break;
-#endif
 	case MSR_IA32_MC0_STATUS:
 		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
 		       __FUNCTION__, data);
@@ -571,11 +565,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->arch.ia32_misc_enable_msr;
 		break;
-#ifdef CONFIG_X86_64
 	case MSR_EFER:
 		data = vcpu->arch.shadow_efer;
 		break;
-#endif
 	default:
 		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -2880,9 +2872,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	set_cr8(vcpu, sregs->cr8);
 
 	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
-#ifdef CONFIG_X86_64
 	kvm_x86_ops->set_efer(vcpu, sregs->efer);
-#endif
 	kvm_set_apic_base(vcpu, sregs->apic_base);
 
 	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-- 
cgit v1.2.3


From 33bd6a0b3e8baed6469c8e68ea1b16cb50c4f5af Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 7 Feb 2008 13:47:38 +0100
Subject: KVM: SVM: move feature detection to hardware setup code

By moving the SVM feature detection from the each_cpu code to the hardware
setup code it runs only once. As an additional advance the feature check is now
available earlier in the module setup process.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ff3bc74af728..5f527dc0e162 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -302,7 +302,6 @@ static void svm_hardware_enable(void *garbage)
 	svm_data->asid_generation = 1;
 	svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 	svm_data->next_asid = svm_data->max_asid + 1;
-	svm_features = cpuid_edx(SVM_CPUID_FUNC);
 
 	asm volatile ("sgdt %0" : "=m"(gdt_descr));
 	gdt = (struct desc_struct *)gdt_descr.address;
@@ -411,6 +410,9 @@ static __init int svm_hardware_setup(void)
 		if (r)
 			goto err_2;
 	}
+
+	svm_features = cpuid_edx(SVM_CPUID_FUNC);
+
 	return 0;
 
 err_2:
-- 
cgit v1.2.3


From e3da3acdb32c1804a5c853feebcc037b7434076f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 7 Feb 2008 13:47:39 +0100
Subject: KVM: SVM: add detection of Nested Paging feature

Let SVM detect if the Nested Paging feature is available on the hardware.
Disable it to keep this patch series bisectable.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5f527dc0e162..c12a75953b5b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,8 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_DEATURE_SVML (1 << 2)
 
+static bool npt_enabled = false;
+
 static void kvm_reput_irq(struct vcpu_svm *svm);
 
 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
@@ -413,6 +415,12 @@ static __init int svm_hardware_setup(void)
 
 	svm_features = cpuid_edx(SVM_CPUID_FUNC);
 
+	if (!svm_has(SVM_FEATURE_NPT))
+		npt_enabled = false;
+
+	if (npt_enabled)
+		printk(KERN_INFO "kvm: Nested Paging enabled\n");
+
 	return 0;
 
 err_2:
-- 
cgit v1.2.3


From 6c7dac72d5c7dc0e09512dce865398167be9a8f7 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 7 Feb 2008 13:47:40 +0100
Subject: KVM: SVM: add module parameter to disable Nested Paging

To disable the use of the Nested Paging feature even if it is available in
hardware this patch adds a module parameter. Nested Paging can be disabled by
passing npt=0 to the kvm_amd module.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c12a75953b5b..fb5d6c2e6a08 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -48,6 +48,9 @@ MODULE_LICENSE("GPL");
 #define SVM_DEATURE_SVML (1 << 2)
 
 static bool npt_enabled = false;
+static int npt = 1;
+
+module_param(npt, int, S_IRUGO);
 
 static void kvm_reput_irq(struct vcpu_svm *svm);
 
@@ -418,6 +421,11 @@ static __init int svm_hardware_setup(void)
 	if (!svm_has(SVM_FEATURE_NPT))
 		npt_enabled = false;
 
+	if (npt_enabled && !npt) {
+		printk(KERN_INFO "kvm: Nested Paging disabled\n");
+		npt_enabled = false;
+	}
+
 	if (npt_enabled)
 		printk(KERN_INFO "kvm: Nested Paging enabled\n");
 
-- 
cgit v1.2.3


From 1855267210e1a8c9d41fe3a3c7a0d42eca5fb7cd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 7 Feb 2008 13:47:41 +0100
Subject: KVM: export information about NPT to generic x86 code

The generic x86 code has to know if the specific implementation uses Nested
Paging. In the generic code Nested Paging is called Two Dimensional Paging
(TDP) to avoid confusion with (future) TDP implementations of other vendors.
This patch exports the availability of TDP to the generic x86 code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 15 +++++++++++++++
 arch/x86/kvm/svm.c |  4 +++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6651dfadae50..21cfa289d0fe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -32,6 +32,15 @@
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 
+/*
+ * When setting this variable to true it enables Two-Dimensional-Paging
+ * where the hardware walks 2 page tables:
+ * 1. the guest-virtual to guest-physical
+ * 2. while doing 1. it walks guest-physical to host-physical
+ * If the hardware supports that we don't need to do shadow paging.
+ */
+static bool tdp_enabled = false;
+
 #undef MMU_DEBUG
 
 #undef AUDIT
@@ -1582,6 +1591,12 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
+void kvm_enable_tdp(void)
+{
+	tdp_enabled = true;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index fb5d6c2e6a08..9e29a13136c4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -426,8 +426,10 @@ static __init int svm_hardware_setup(void)
 		npt_enabled = false;
 	}
 
-	if (npt_enabled)
+	if (npt_enabled) {
 		printk(KERN_INFO "kvm: Nested Paging enabled\n");
+		kvm_enable_tdp();
+	}
 
 	return 0;
 
-- 
cgit v1.2.3


From 4d9976bbdc09e08b69fc12fee2042c3528187b32 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 7 Feb 2008 13:47:42 +0100
Subject: KVM: MMU: make the __nonpaging_map function generic

The mapping function for the nonpaging case in the softmmu does basically the
same as required for Nested Paging. Make this function generic so it can be
used for both.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 21cfa289d0fe..33cd7c982dd3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -979,10 +979,9 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
-static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
-			   gfn_t gfn, struct page *page)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+			   gfn_t gfn, struct page *page, int level)
 {
-	int level = PT32E_ROOT_LEVEL;
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
 
@@ -1042,7 +1041,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
-	r = __nonpaging_map(vcpu, v, write, gfn, page);
+	r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	up_read(&vcpu->kvm->slots_lock);
-- 
cgit v1.2.3


From cc4b6871e771e76dc1de06adb8aed261a1c66be8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 7 Feb 2008 13:47:43 +0100
Subject: KVM: export the load_pdptrs() function to modules

The load_pdptrs() function is required in the SVM module for NPT support.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 38edb2f558ea..0c910c774a9b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -213,6 +213,7 @@ out:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(load_pdptrs);
 
 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
-- 
cgit v1.2.3


From fb72d1674d860b0c9ef9b66b7f4f01fe5b3d2c00 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 7 Feb 2008 13:47:44 +0100
Subject: KVM: MMU: add TDP support to the KVM MMU

This patch contains the changes to the KVM MMU necessary for support of the
Nested Paging feature in AMD Barcelona and Phenom Processors.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 arch/x86/kvm/mmu.h |  6 +++++
 2 files changed, 82 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 33cd7c982dd3..f7541fe22cd8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1097,6 +1097,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	int i;
 	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
+	int metaphysical = 0;
 
 	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
 
@@ -1105,14 +1106,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		ASSERT(!VALID_PAGE(root));
+		if (tdp_enabled)
+			metaphysical = 1;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-				      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
+				      PT64_ROOT_LEVEL, metaphysical,
+				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		vcpu->arch.mmu.root_hpa = root;
 		return;
 	}
 #endif
+	metaphysical = !is_paging(vcpu);
+	if (tdp_enabled)
+		metaphysical = 1;
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -1126,7 +1133,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		} else if (vcpu->arch.mmu.root_level == 0)
 			root_gfn = 0;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-				      PT32_ROOT_LEVEL, !is_paging(vcpu),
+				      PT32_ROOT_LEVEL, metaphysical,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
@@ -1160,6 +1167,36 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 			     error_code & PFERR_WRITE_MASK, gfn);
 }
 
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+				u32 error_code)
+{
+	struct page *page;
+	int r;
+
+	ASSERT(vcpu);
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	r = mmu_topup_memory_caches(vcpu);
+	if (r)
+		return r;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		up_read(&current->mm->mmap_sem);
+		return 1;
+	}
+	spin_lock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_free_some_pages(vcpu);
+	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+			 gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	up_read(&current->mm->mmap_sem);
+
+	return r;
+}
+
 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
 	mmu_free_roots(vcpu);
@@ -1253,7 +1290,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
 	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
 }
 
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+
+	context->new_cr3 = nonpaging_new_cr3;
+	context->page_fault = tdp_page_fault;
+	context->free = nonpaging_free;
+	context->prefetch_page = nonpaging_prefetch_page;
+	context->shadow_root_level = TDP_ROOT_LEVEL;
+	context->root_hpa = INVALID_PAGE;
+
+	if (!is_paging(vcpu)) {
+		context->gva_to_gpa = nonpaging_gva_to_gpa;
+		context->root_level = 0;
+	} else if (is_long_mode(vcpu)) {
+		context->gva_to_gpa = paging64_gva_to_gpa;
+		context->root_level = PT64_ROOT_LEVEL;
+	} else if (is_pae(vcpu)) {
+		context->gva_to_gpa = paging64_gva_to_gpa;
+		context->root_level = PT32E_ROOT_LEVEL;
+	} else {
+		context->gva_to_gpa = paging32_gva_to_gpa;
+		context->root_level = PT32_ROOT_LEVEL;
+	}
+
+	return 0;
+}
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1268,6 +1333,14 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 		return paging32_init_context(vcpu);
 }
 
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+	if (tdp_enabled)
+		return init_kvm_tdp_mmu(vcpu);
+	else
+		return init_kvm_softmmu(vcpu);
+}
+
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1fce19ec7a23..e64e9f56a65e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -3,6 +3,12 @@
 
 #include <linux/kvm_host.h>
 
+#ifdef CONFIG_X86_64
+#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL
+#else
+#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL
+#endif
+
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
 	if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
-- 
cgit v1.2.3


From 709ddebf81cb40e3c36c6109a7892e8b93a09464 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 7 Feb 2008 13:47:45 +0100
Subject: KVM: SVM: add support for Nested Paging

This patch contains the SVM architecture dependent changes for KVM to enable
support for the Nested Paging feature of AMD Barcelona and Phenom processors.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9e29a13136c4..8e9d4a5dacda 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,7 +47,12 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_DEATURE_SVML (1 << 2)
 
+/* enable NPT for AMD64 and X86 with PAE */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static bool npt_enabled = true;
+#else
 static bool npt_enabled = false;
+#endif
 static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
@@ -187,7 +192,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-	if (!(efer & EFER_LMA))
+	if (!npt_enabled && !(efer & EFER_LMA))
 		efer &= ~EFER_LME;
 
 	to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
@@ -573,6 +578,22 @@ static void init_vmcb(struct vmcb *vmcb)
 	save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
 	save->cr4 = X86_CR4_PAE;
 	/* rdx = ?? */
+
+	if (npt_enabled) {
+		/* Setup VMCB for Nested Paging */
+		control->nested_ctl = 1;
+		control->intercept_exceptions &= ~(1 << PF_VECTOR);
+		control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
+						INTERCEPT_CR3_MASK);
+		control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
+						 INTERCEPT_CR3_MASK);
+		save->g_pat = 0x0007040600070406ULL;
+		/* enable caching because the QEMU Bios doesn't enable it */
+		save->cr0 = X86_CR0_ET;
+		save->cr3 = 0;
+		save->cr4 = 0;
+	}
+
 }
 
 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -807,6 +828,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 		}
 	}
 #endif
+	if (npt_enabled)
+		goto set;
+
 	if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
 		svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
 		vcpu->fpu_active = 1;
@@ -814,18 +838,26 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 	vcpu->arch.cr0 = cr0;
 	cr0 |= X86_CR0_PG | X86_CR0_WP;
-	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
 	if (!vcpu->fpu_active) {
 		svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
 		cr0 |= X86_CR0_TS;
 	}
+set:
+	/*
+	 * re-enable caching here because the QEMU bios
+	 * does not do it - this results in some delay at
+	 * reboot
+	 */
+	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
 	svm->vmcb->save.cr0 = cr0;
 }
 
 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        vcpu->arch.cr4 = cr4;
-       to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
+       if (!npt_enabled)
+	       cr4 |= X86_CR4_PAE;
+       to_svm(vcpu)->vmcb->save.cr4 = cr4;
 }
 
 static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1313,14 +1345,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
 	[SVM_EXIT_MONITOR]			= invalid_op_interception,
 	[SVM_EXIT_MWAIT]			= invalid_op_interception,
+	[SVM_EXIT_NPF]				= pf_interception,
 };
 
-
 static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u32 exit_code = svm->vmcb->control.exit_code;
 
+	if (npt_enabled) {
+		int mmu_reload = 0;
+		if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
+			svm_set_cr0(vcpu, svm->vmcb->save.cr0);
+			mmu_reload = 1;
+		}
+		vcpu->arch.cr0 = svm->vmcb->save.cr0;
+		vcpu->arch.cr3 = svm->vmcb->save.cr3;
+		if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+			if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
+				kvm_inject_gp(vcpu, 0);
+				return 1;
+			}
+		}
+		if (mmu_reload) {
+			kvm_mmu_reset_context(vcpu);
+			kvm_mmu_load(vcpu);
+		}
+	}
+
 	kvm_reput_irq(svm);
 
 	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
@@ -1331,7 +1383,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	}
 
 	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
-	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
+	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+	    exit_code != SVM_EXIT_NPF)
 		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
 		       "exit_code 0x%x\n",
 		       __FUNCTION__, svm->vmcb->control.exit_int_info,
@@ -1522,6 +1575,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	svm->host_dr6 = read_dr6();
 	svm->host_dr7 = read_dr7();
 	svm->vmcb->save.cr2 = vcpu->arch.cr2;
+	/* required for live migration with NPT */
+	if (npt_enabled)
+		svm->vmcb->save.cr3 = vcpu->arch.cr3;
 
 	if (svm->vmcb->save.dr7 & 0xff) {
 		write_dr7(0);
@@ -1665,6 +1721,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (npt_enabled) {
+		svm->vmcb->control.nested_cr3 = root;
+		force_new_asid(vcpu);
+		return;
+	}
+
 	svm->vmcb->save.cr3 = root;
 	force_new_asid(vcpu);
 
-- 
cgit v1.2.3


From 2e11384c2c6f1ce662b1e5b05ba49b216a052f2a Mon Sep 17 00:00:00 2001
From: Ryan Harper <ryanh@us.ibm.com>
Date: Mon, 11 Feb 2008 10:26:38 -0600
Subject: KVM: VMX: fix typo in VMX header define

Looking at Intel Volume 3b, page 148, table 20-11 and noticed
that the field name is 'Deliver' not 'Deliever'.  Attached patch changes
the define name and its user in vmx.c

Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 6 +++---
 arch/x86/kvm/vmx.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 76944f2c883b..2d5ccec3f9e7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -632,7 +632,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 {
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 		     nr | INTR_TYPE_EXCEPTION
-		     | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+		     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
 		     | INTR_INFO_VALID_MASK);
 	if (has_error_code)
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
@@ -1935,7 +1935,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	error_code = 0;
 	rip = vmcs_readl(GUEST_RIP);
-	if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
+	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
@@ -2351,7 +2351,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
 				vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
 
-		if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
+		if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
 			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
 				vmcs_read32(IDT_VECTORING_ERROR_CODE));
 		if (unlikely(has_ext_irq))
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 436ce0f29092..5dff4606b988 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -233,12 +233,12 @@ enum vmcs_field {
  */
 #define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
 #define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
-#define INTR_INFO_DELIEVER_CODE_MASK    0x800           /* 11 */
+#define INTR_INFO_DELIVER_CODE_MASK     0x800           /* 11 */
 #define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
 
 #define VECTORING_INFO_VECTOR_MASK           	INTR_INFO_VECTOR_MASK
 #define VECTORING_INFO_TYPE_MASK        	INTR_INFO_INTR_TYPE_MASK
-#define VECTORING_INFO_DELIEVER_CODE_MASK    	INTR_INFO_DELIEVER_CODE_MASK
+#define VECTORING_INFO_DELIVER_CODE_MASK    	INTR_INFO_DELIVER_CODE_MASK
 #define VECTORING_INFO_VALID_MASK       	INTR_INFO_VALID_MASK
 
 #define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
-- 
cgit v1.2.3


From e6101a96c9efb74c98bba6322d4c5ea89e47e0fe Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 13 Feb 2008 18:58:45 +0100
Subject: KVM: SVM: let init_vmcb() take struct vcpu_svm as parameter

Change the parameter of the init_vmcb() function in the kvm-amd module from
struct vmcb to struct vcpu_svm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8e9d4a5dacda..d934819733ce 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -471,10 +471,10 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
-static void init_vmcb(struct vmcb *vmcb)
+static void init_vmcb(struct vcpu_svm *svm)
 {
-	struct vmcb_control_area *control = &vmcb->control;
-	struct vmcb_save_area *save = &vmcb->save;
+	struct vmcb_control_area *control = &svm->vmcb->control;
+	struct vmcb_save_area *save = &svm->vmcb->save;
 
 	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
@@ -600,7 +600,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	init_vmcb(svm->vmcb);
+	init_vmcb(svm);
 
 	if (vcpu->vcpu_id != 0) {
 		svm->vmcb->save.rip = 0;
@@ -638,7 +638,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
 	memset(svm->db_regs, 0, sizeof(svm->db_regs));
-	init_vmcb(svm->vmcb);
+	init_vmcb(svm);
 
 	fx_init(&svm->vcpu);
 	svm->vcpu.fpu_active = 1;
@@ -1024,7 +1024,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	 * so reinitialize it.
 	 */
 	clear_page(svm->vmcb);
-	init_vmcb(svm->vmcb);
+	init_vmcb(svm);
 
 	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
 	return 0;
-- 
cgit v1.2.3


From f65c229c3e7743c6654c16b9ec6248466b5eef21 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 13 Feb 2008 18:58:46 +0100
Subject: KVM: SVM: allocate the MSR permission map per VCPU

This patch changes the kvm-amd module to allocate the SVM MSR permission map
per VCPU instead of a global map for all VCPUs. With this we have more
flexibility allowing specific guests to access virtualized MSRs. This is
required for LBR virtualization.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/kvm_svm.h |  2 ++
 arch/x86/kvm/svm.c     | 67 ++++++++++++++++++++++++--------------------------
 2 files changed, 34 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index ecdfe97e4635..65ef0fc2c036 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -39,6 +39,8 @@ struct vcpu_svm {
 	unsigned long host_db_regs[NUM_DB_REGS];
 	unsigned long host_dr6;
 	unsigned long host_dr7;
+
+	u32 *msrpm;
 };
 
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d934819733ce..281a2ffe1224 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -65,7 +65,6 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 }
 
 unsigned long iopm_base;
-unsigned long msrpm_base;
 
 struct kvm_ldttss_desc {
 	u16 limit0;
@@ -370,12 +369,29 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
 	BUG();
 }
 
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
+	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+#ifdef CONFIG_X86_64
+	set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
+	set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
+	set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
+	set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
+	set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
+	set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
+#endif
+	set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
+	set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+	set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+	set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+}
+
 static __init int svm_hardware_setup(void)
 {
 	int cpu;
 	struct page *iopm_pages;
-	struct page *msrpm_pages;
-	void *iopm_va, *msrpm_va;
+	void *iopm_va;
 	int r;
 
 	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
@@ -388,37 +404,13 @@ static __init int svm_hardware_setup(void)
 	clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
 	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 
-
-	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
-
-	r = -ENOMEM;
-	if (!msrpm_pages)
-		goto err_1;
-
-	msrpm_va = page_address(msrpm_pages);
-	memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
-	msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
-	set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
-	set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
-	set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
-	set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
-	set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
-	set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
-#endif
-	set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
-	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
-	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
-	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
-
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 
 	for_each_online_cpu(cpu) {
 		r = svm_cpu_init(cpu);
 		if (r)
-			goto err_2;
+			goto err;
 	}
 
 	svm_features = cpuid_edx(SVM_CPUID_FUNC);
@@ -438,10 +430,7 @@ static __init int svm_hardware_setup(void)
 
 	return 0;
 
-err_2:
-	__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-	msrpm_base = 0;
-err_1:
+err:
 	__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
 	iopm_base = 0;
 	return r;
@@ -449,9 +438,8 @@ err_1:
 
 static __exit void svm_hardware_unsetup(void)
 {
-	__free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
 	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
-	iopm_base = msrpm_base = 0;
+	iopm_base = 0;
 }
 
 static void init_seg(struct vmcb_seg *seg)
@@ -536,7 +524,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 				(1ULL << INTERCEPT_MWAIT);
 
 	control->iopm_base_pa = iopm_base;
-	control->msrpm_base_pa = msrpm_base;
+	control->msrpm_base_pa = __pa(svm->msrpm);
 	control->tsc_offset = 0;
 	control->int_ctl = V_INTR_MASKING_MASK;
 
@@ -615,6 +603,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	struct vcpu_svm *svm;
 	struct page *page;
+	struct page *msrpm_pages;
 	int err;
 
 	svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -633,6 +622,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 		goto uninit;
 	}
 
+	err = -ENOMEM;
+	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+	if (!msrpm_pages)
+		goto uninit;
+	svm->msrpm = page_address(msrpm_pages);
+	svm_vcpu_init_msrpm(svm->msrpm);
+
 	svm->vmcb = page_address(page);
 	clear_page(svm->vmcb);
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
@@ -661,6 +657,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, svm);
 }
-- 
cgit v1.2.3


From 24e09cbf480a72f9c952af4ca77b159503dca44b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 13 Feb 2008 18:58:47 +0100
Subject: KVM: SVM: enable LBR virtualization

This patch implements the Last Branch Record Virtualization (LBRV) feature of
the AMD Barcelona and Phenom processors into the kvm-amd module. It will only
be enabled if the guest enables last branch recording in the DEBUG_CTL MSR. So
there is no increased world switch overhead when the guest doesn't use these
MSRs.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Markus Rechberger <markus.rechberger@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 281a2ffe1224..7d73e935dcc1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,8 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_DEATURE_SVML (1 << 2)
 
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
@@ -387,6 +389,28 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
 	set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
 }
 
+static void svm_enable_lbrv(struct vcpu_svm *svm)
+{
+	u32 *msrpm = svm->msrpm;
+
+	svm->vmcb->control.lbr_ctl = 1;
+	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+static void svm_disable_lbrv(struct vcpu_svm *svm)
+{
+	u32 *msrpm = svm->msrpm;
+
+	svm->vmcb->control.lbr_ctl = 0;
+	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+}
+
 static __init int svm_hardware_setup(void)
 {
 	int cpu;
@@ -1231,8 +1255,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		svm->vmcb->save.sysenter_esp = data;
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
-		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
-				__FUNCTION__, data);
+		if (!svm_has(SVM_FEATURE_LBRV)) {
+			pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+					__FUNCTION__, data);
+			break;
+		}
+		if (data & DEBUGCTL_RESERVED_BITS)
+			return 1;
+
+		svm->vmcb->save.dbgctl = data;
+		if (data & (1ULL<<0))
+			svm_enable_lbrv(svm);
+		else
+			svm_disable_lbrv(svm);
 		break;
 	case MSR_K7_EVNTSEL0:
 	case MSR_K7_EVNTSEL1:
-- 
cgit v1.2.3


From 18068523d3a0b41fcee5b53cdb437a0ab4d65e4b Mon Sep 17 00:00:00 2001
From: Glauber de Oliveira Costa <gcosta@redhat.com>
Date: Fri, 15 Feb 2008 17:52:47 -0200
Subject: KVM: paravirtualized clocksource: host part

This is the host part of kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writing to it periodically.

The area is binary compatible with xen, as we use the same shadow_info
structure.

[marcelo: fix bad_page on MSR_KVM_SYSTEM_TIME]
[avi: save full value of the msr, even if enable bit is clear]
[avi: clear previous value of time_page]

Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 112 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c910c774a9b..256c0fc92b67 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@
 #include "irq.h"
 #include "mmu.h"
 
+#include <linux/clocksource.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
@@ -424,7 +425,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TIME_STAMP_COUNTER,
+	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 };
 
 static unsigned num_msrs_to_save;
@@ -482,6 +483,70 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 	return kvm_set_msr(vcpu, index, *data);
 }
 
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+	static int version;
+	struct kvm_wall_clock wc;
+	struct timespec wc_ts;
+
+	if (!wall_clock)
+		return;
+
+	version++;
+
+	down_read(&kvm->slots_lock);
+	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+	wc_ts = current_kernel_time();
+	wc.wc_sec = wc_ts.tv_sec;
+	wc.wc_nsec = wc_ts.tv_nsec;
+	wc.wc_version = version;
+
+	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+	version++;
+	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+	up_read(&kvm->slots_lock);
+}
+
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+	struct timespec ts;
+	unsigned long flags;
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+	void *shared_kaddr;
+
+	if ((!vcpu->time_page))
+		return;
+
+	/* Keep irq disabled to prevent changes to the clock */
+	local_irq_save(flags);
+	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+			  &vcpu->hv_clock.tsc_timestamp);
+	ktime_get_ts(&ts);
+	local_irq_restore(flags);
+
+	/* With all the info we got, fill in the values */
+
+	vcpu->hv_clock.system_time = ts.tv_nsec +
+				     (NSEC_PER_SEC * (u64)ts.tv_sec);
+	/*
+	 * The interface expects us to write an even number signaling that the
+	 * update is finished. Since the guest won't see the intermediate
+	 * state, we just write "2" at the end
+	 */
+	vcpu->hv_clock.version = 2;
+
+	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+
+	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+		sizeof(vcpu->hv_clock));
+
+	kunmap_atomic(shared_kaddr, KM_USER0);
+
+	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+}
+
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -511,6 +576,44 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
+	case MSR_KVM_WALL_CLOCK:
+		vcpu->kvm->arch.wall_clock = data;
+		kvm_write_wall_clock(vcpu->kvm, data);
+		break;
+	case MSR_KVM_SYSTEM_TIME: {
+		if (vcpu->arch.time_page) {
+			kvm_release_page_dirty(vcpu->arch.time_page);
+			vcpu->arch.time_page = NULL;
+		}
+
+		vcpu->arch.time = data;
+
+		/* we verify if the enable bit is set... */
+		if (!(data & 1))
+			break;
+
+		/* ...but clean it before doing the actual write */
+		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+
+		vcpu->arch.hv_clock.tsc_to_system_mul =
+					clocksource_khz2mult(tsc_khz, 22);
+		vcpu->arch.hv_clock.tsc_shift = 22;
+
+		down_read(&current->mm->mmap_sem);
+		down_read(&vcpu->kvm->slots_lock);
+		vcpu->arch.time_page =
+				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+		up_read(&vcpu->kvm->slots_lock);
+		up_read(&current->mm->mmap_sem);
+
+		if (is_error_page(vcpu->arch.time_page)) {
+			kvm_release_page_clean(vcpu->arch.time_page);
+			vcpu->arch.time_page = NULL;
+		}
+
+		kvm_write_guest_time(vcpu);
+		break;
+	}
 	default:
 		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 		return 1;
@@ -569,6 +672,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_EFER:
 		data = vcpu->arch.shadow_efer;
 		break;
+	case MSR_KVM_WALL_CLOCK:
+		data = vcpu->kvm->arch.wall_clock;
+		break;
+	case MSR_KVM_SYSTEM_TIME:
+		data = vcpu->arch.time;
+		break;
 	default:
 		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -696,6 +805,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_SET_TSS_ADDR:
 	case KVM_CAP_EXT_CPUID:
+	case KVM_CAP_CLOCKSOURCE:
 		r = 1;
 		break;
 	case KVM_CAP_VAPIC:
@@ -771,6 +881,7 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
+	kvm_write_guest_time(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From ddcb2885e2902ebfc422eccd763b02c5ee22d68b Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 18 Feb 2008 11:12:48 -0800
Subject: KVM: x86 emulator: add ad_mask static inline

Replaces open-coded mask calculation in macros.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 22900f70172b..f6f6544cf747 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -480,10 +480,15 @@ static u16 group2_table[] = {
 	(_type)_x;							\
 })
 
+static inline unsigned long ad_mask(struct decode_cache *c)
+{
+	return (1UL << (c->ad_bytes << 3)) - 1;
+}
+
 /* Access/update address held in a register, based on addressing mode. */
 #define address_mask(reg)						\
 	((c->ad_bytes == sizeof(unsigned long)) ? 			\
-		(reg) :	((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
+		(reg) :	((reg) & ad_mask(c)))
 #define register_address(base, reg)                                     \
 	((base) + address_mask(reg))
 #define register_address_increment(reg, inc)                            \
@@ -494,9 +499,9 @@ static u16 group2_table[] = {
 			(reg) += _inc;					\
 		else							\
 			(reg) = ((reg) & 				\
-				 ~((1UL << (c->ad_bytes << 3)) - 1)) |	\
+				 ~ad_mask(c)) |	\
 				(((reg) + _inc) &			\
-				 ((1UL << (c->ad_bytes << 3)) - 1));	\
+				 ad_mask(c));	\
 	} while (0)
 
 #define JMP_REL(rel) 							\
-- 
cgit v1.2.3


From e4706772ea46e57cf69a7140c40063a21884c8e0 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 19 Feb 2008 07:40:38 -0800
Subject: KVM: x86 emulator: make register_address, address_mask static inlines

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 48 ++++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f6f6544cf747..008db4dad7b2 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -486,11 +486,21 @@ static inline unsigned long ad_mask(struct decode_cache *c)
 }
 
 /* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg)						\
-	((c->ad_bytes == sizeof(unsigned long)) ? 			\
-		(reg) :	((reg) & ad_mask(c)))
-#define register_address(base, reg)                                     \
-	((base) + address_mask(reg))
+static inline unsigned long
+address_mask(struct decode_cache *c, unsigned long reg)
+{
+	if (c->ad_bytes == sizeof(unsigned long))
+		return reg;
+	else
+		return reg & ad_mask(c);
+}
+
+static inline unsigned long
+register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+{
+	return base + address_mask(c, reg);
+}
+
 #define register_address_increment(reg, inc)                            \
 	do {								\
 		/* signed type ensures sign extension to long */        \
@@ -1056,7 +1066,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 	c->dst.bytes = c->op_bytes;
 	c->dst.val = c->src.val;
 	register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	c->dst.ptr = (void *) register_address(ctxt->ss_base,
+	c->dst.ptr = (void *) register_address(c, ctxt->ss_base,
 					       c->regs[VCPU_REGS_RSP]);
 }
 
@@ -1066,7 +1076,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	rc = ops->read_std(register_address(ctxt->ss_base,
+	rc = ops->read_std(register_address(c, ctxt->ss_base,
 					    c->regs[VCPU_REGS_RSP]),
 			   &c->dst.val, c->dst.bytes, ctxt->vcpu);
 	if (rc != 0)
@@ -1388,11 +1398,11 @@ special_insn:
 		register_address_increment(c->regs[VCPU_REGS_RSP],
 					   -c->op_bytes);
 		c->dst.ptr = (void *) register_address(
-			ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+			c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
 		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
-		if ((rc = ops->read_std(register_address(ctxt->ss_base,
+		if ((rc = ops->read_std(register_address(c, ctxt->ss_base,
 			c->regs[VCPU_REGS_RSP]), c->dst.ptr,
 			c->op_bytes, ctxt->vcpu)) != 0)
 			goto done;
@@ -1417,9 +1427,9 @@ special_insn:
 				1,
 				(c->d & ByteOp) ? 1 : c->op_bytes,
 				c->rep_prefix ?
-				address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+				address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
 				(ctxt->eflags & EFLG_DF),
-				register_address(ctxt->es_base,
+				register_address(c, ctxt->es_base,
 						 c->regs[VCPU_REGS_RDI]),
 				c->rep_prefix,
 				c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1433,9 +1443,9 @@ special_insn:
 				0,
 				(c->d & ByteOp) ? 1 : c->op_bytes,
 				c->rep_prefix ?
-				address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+				address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
 				(ctxt->eflags & EFLG_DF),
-				register_address(c->override_base ?
+				register_address(c, c->override_base ?
 							*c->override_base :
 							ctxt->ds_base,
 						 c->regs[VCPU_REGS_RSI]),
@@ -1525,10 +1535,10 @@ special_insn:
 	case 0xa4 ... 0xa5:	/* movs */
 		c->dst.type = OP_MEM;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(
+		c->dst.ptr = (unsigned long *)register_address(c,
 						   ctxt->es_base,
 						   c->regs[VCPU_REGS_RDI]);
-		if ((rc = ops->read_emulated(register_address(
+		if ((rc = ops->read_emulated(register_address(c,
 		      c->override_base ? *c->override_base :
 					ctxt->ds_base,
 					c->regs[VCPU_REGS_RSI]),
@@ -1545,7 +1555,7 @@ special_insn:
 	case 0xa6 ... 0xa7:	/* cmps */
 		c->src.type = OP_NONE; /* Disable writeback. */
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = (unsigned long *)register_address(
+		c->src.ptr = (unsigned long *)register_address(c,
 				c->override_base ? *c->override_base :
 						   ctxt->ds_base,
 						   c->regs[VCPU_REGS_RSI]);
@@ -1557,7 +1567,7 @@ special_insn:
 
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(
+		c->dst.ptr = (unsigned long *)register_address(c,
 						   ctxt->es_base,
 						   c->regs[VCPU_REGS_RDI]);
 		if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
@@ -1581,7 +1591,7 @@ special_insn:
 	case 0xaa ... 0xab:	/* stos */
 		c->dst.type = OP_MEM;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(
+		c->dst.ptr = (unsigned long *)register_address(c,
 						   ctxt->es_base,
 						   c->regs[VCPU_REGS_RDI]);
 		c->dst.val = c->regs[VCPU_REGS_RAX];
@@ -1593,7 +1603,7 @@ special_insn:
 		c->dst.type = OP_REG;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-		if ((rc = ops->read_emulated(register_address(
+		if ((rc = ops->read_emulated(register_address(c,
 				c->override_base ? *c->override_base :
 						   ctxt->ds_base,
 						 c->regs[VCPU_REGS_RSI]),
-- 
cgit v1.2.3


From 7a95727567f0991751c2db774a110b4f8080de7f Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 19 Feb 2008 07:40:41 -0800
Subject: KVM: x86 emulator: make register_address_increment and JMP_REL static
 inlines

Change jmp_rel() to a function as well.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 56 +++++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 30 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 008db4dad7b2..cacdcf5f32d7 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -501,23 +501,19 @@ register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
 	return base + address_mask(c, reg);
 }
 
-#define register_address_increment(reg, inc)                            \
-	do {								\
-		/* signed type ensures sign extension to long */        \
-		int _inc = (inc);					\
-		if (c->ad_bytes == sizeof(unsigned long))		\
-			(reg) += _inc;					\
-		else							\
-			(reg) = ((reg) & 				\
-				 ~ad_mask(c)) |	\
-				(((reg) + _inc) &			\
-				 ad_mask(c));	\
-	} while (0)
+static inline void
+register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+{
+	if (c->ad_bytes == sizeof(unsigned long))
+		*reg += inc;
+	else
+		*reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+}
 
-#define JMP_REL(rel) 							\
-	do {								\
-		register_address_increment(c->eip, rel);		\
-	} while (0)
+static inline void jmp_rel(struct decode_cache *c, int rel)
+{
+	register_address_increment(c, &c->eip, rel);
+}
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 			      struct x86_emulate_ops *ops,
@@ -1065,7 +1061,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 	c->dst.type  = OP_MEM;
 	c->dst.bytes = c->op_bytes;
 	c->dst.val = c->src.val;
-	register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
 	c->dst.ptr = (void *) register_address(c, ctxt->ss_base,
 					       c->regs[VCPU_REGS_RSP]);
 }
@@ -1082,7 +1078,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 	if (rc != 0)
 		return rc;
 
-	register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
 
 	return 0;
 }
@@ -1395,7 +1391,7 @@ special_insn:
 		c->dst.type  = OP_MEM;
 		c->dst.bytes = c->op_bytes;
 		c->dst.val = c->src.val;
-		register_address_increment(c->regs[VCPU_REGS_RSP],
+		register_address_increment(c, &c->regs[VCPU_REGS_RSP],
 					   -c->op_bytes);
 		c->dst.ptr = (void *) register_address(
 			c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
@@ -1407,7 +1403,7 @@ special_insn:
 			c->op_bytes, ctxt->vcpu)) != 0)
 			goto done;
 
-		register_address_increment(c->regs[VCPU_REGS_RSP],
+		register_address_increment(c, &c->regs[VCPU_REGS_RSP],
 					   c->op_bytes);
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
@@ -1459,7 +1455,7 @@ special_insn:
 		int rel = insn_fetch(s8, 1, c->eip);
 
 		if (test_cc(c->b, ctxt->eflags))
-			JMP_REL(rel);
+			jmp_rel(c, rel);
 		break;
 	}
 	case 0x80 ... 0x83:	/* Grp1 */
@@ -1545,10 +1541,10 @@ special_insn:
 					&c->dst.val,
 					c->dst.bytes, ctxt->vcpu)) != 0)
 			goto done;
-		register_address_increment(c->regs[VCPU_REGS_RSI],
+		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
 				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
 							   : c->dst.bytes);
-		register_address_increment(c->regs[VCPU_REGS_RDI],
+		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
 				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
 							   : c->dst.bytes);
 		break;
@@ -1580,10 +1576,10 @@ special_insn:
 
 		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
 
-		register_address_increment(c->regs[VCPU_REGS_RSI],
+		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
 				       (ctxt->eflags & EFLG_DF) ? -c->src.bytes
 								  : c->src.bytes);
-		register_address_increment(c->regs[VCPU_REGS_RDI],
+		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
 				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
 								  : c->dst.bytes);
 
@@ -1595,7 +1591,7 @@ special_insn:
 						   ctxt->es_base,
 						   c->regs[VCPU_REGS_RDI]);
 		c->dst.val = c->regs[VCPU_REGS_RAX];
-		register_address_increment(c->regs[VCPU_REGS_RDI],
+		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
 				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
 							   : c->dst.bytes);
 		break;
@@ -1611,7 +1607,7 @@ special_insn:
 						 c->dst.bytes,
 						 ctxt->vcpu)) != 0)
 			goto done;
-		register_address_increment(c->regs[VCPU_REGS_RSI],
+		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
 				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
 							   : c->dst.bytes);
 		break;
@@ -1650,14 +1646,14 @@ special_insn:
 			goto cannot_emulate;
 		}
 		c->src.val = (unsigned long) c->eip;
-		JMP_REL(rel);
+		jmp_rel(c, rel);
 		c->op_bytes = c->ad_bytes;
 		emulate_push(ctxt);
 		break;
 	}
 	case 0xe9: /* jmp rel */
 	case 0xeb: /* jmp rel short */
-		JMP_REL(c->src.val);
+		jmp_rel(c, c->src.val);
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		break;
 	case 0xf4:              /* hlt */
@@ -1857,7 +1853,7 @@ twobyte_insn:
 			goto cannot_emulate;
 		}
 		if (test_cc(c->b, ctxt->eflags))
-			JMP_REL(rel);
+			jmp_rel(c, rel);
 		c->dst.type = OP_NONE;
 		break;
 	}
-- 
cgit v1.2.3


From f725230af9ea03f6cc6f4a90e87aa428df46ec19 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Feb 2008 11:53:16 +0200
Subject: KVM: Add API to retrieve the number of supported vcpus per vm

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 256c0fc92b67..955d2eeac964 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -811,6 +811,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_VAPIC:
 		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 		break;
+	case KVM_CAP_NR_VCPUS:
+		r = KVM_MAX_VCPUS;
+		break;
 	default:
 		r = 0;
 		break;
-- 
cgit v1.2.3


From a988b910ef816ed57e1cecbec14e98e906453f91 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Feb 2008 11:59:20 +0200
Subject: KVM: Add API for determining the number of supported memory slots

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 955d2eeac964..b7c32f63671d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -814,6 +814,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_NR_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
+	case KVM_CAP_NR_MEMSLOTS:
+		r = KVM_MEMORY_SLOTS;
+		break;
 	default:
 		r = 0;
 		break;
-- 
cgit v1.2.3


From a5f61300c489e334ddf99781a13a7f8d4b580781 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Feb 2008 17:57:21 +0200
Subject: KVM: Use x86's segment descriptor struct instead of private
 definition

The x86 desc_struct unification allows us to remove segment_descriptor.h.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/segment_descriptor.h | 29 -----------------------------
 arch/x86/kvm/vmx.c                |  3 +--
 arch/x86/kvm/x86.c                | 15 +++++++--------
 3 files changed, 8 insertions(+), 39 deletions(-)
 delete mode 100644 arch/x86/kvm/segment_descriptor.h

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
deleted file mode 100644
index 56fc4c873389..000000000000
--- a/arch/x86/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __SEGMENT_DESCRIPTOR_H
-#define __SEGMENT_DESCRIPTOR_H
-
-struct segment_descriptor {
-	u16 limit_low;
-	u16 base_low;
-	u8  base_mid;
-	u8  type : 4;
-	u8  system : 1;
-	u8  dpl : 2;
-	u8  present : 1;
-	u8  limit_high : 4;
-	u8  avl : 1;
-	u8  long_mode : 1;
-	u8  default_op : 1;
-	u8  granularity : 1;
-	u8  base_high;
-} __attribute__((packed));
-
-#ifdef CONFIG_X86_64
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct segment_descriptor_64 {
-	struct segment_descriptor s;
-	u32 base_higher;
-	u32 pad_zero;
-};
-
-#endif
-#endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2d5ccec3f9e7..f46ad03c2521 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -17,7 +17,6 @@
 
 #include "irq.h"
 #include "vmx.h"
-#include "segment_descriptor.h"
 #include "mmu.h"
 
 #include <linux/kvm_host.h>
@@ -388,7 +387,7 @@ static void reload_tss(void)
 	 * VT restores TR but not its size.  Useless.
 	 */
 	struct descriptor_table gdt;
-	struct segment_descriptor *descs;
+	struct desc_struct *descs;
 
 	get_gdt(&gdt);
 	descs = (void *)gdt.base;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b7c32f63671d..a063f449a12e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/kvm_host.h>
-#include "segment_descriptor.h"
 #include "irq.h"
 #include "mmu.h"
 
@@ -29,6 +28,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
+#include <asm/desc.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -94,7 +94,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 unsigned long segment_base(u16 selector)
 {
 	struct descriptor_table gdt;
-	struct segment_descriptor *d;
+	struct desc_struct *d;
 	unsigned long table_base;
 	unsigned long v;
 
@@ -110,13 +110,12 @@ unsigned long segment_base(u16 selector)
 		asm("sldt %0" : "=g"(ldt_selector));
 		table_base = segment_base(ldt_selector);
 	}
-	d = (struct segment_descriptor *)(table_base + (selector & ~7));
-	v = d->base_low | ((unsigned long)d->base_mid << 16) |
-		((unsigned long)d->base_high << 24);
+	d = (struct desc_struct *)(table_base + (selector & ~7));
+	v = d->base0 | ((unsigned long)d->base1 << 16) |
+		((unsigned long)d->base2 << 24);
 #ifdef CONFIG_X86_64
-	if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
-		v |= ((unsigned long) \
-		      ((struct segment_descriptor_64 *)d)->base_higher) << 32;
+	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 #endif
 	return v;
 }
-- 
cgit v1.2.3


From f11c3a8d84d7bf091bf963edd7104dd4ba6416c3 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Thu, 21 Feb 2008 01:00:30 +0530
Subject: KVM: Add stat counter for hypercalls

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a063f449a12e..15bba5d37931 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -72,6 +72,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "irq_window", VCPU_STAT(irq_window_exits) },
 	{ "halt_exits", VCPU_STAT(halt_exits) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
+	{ "hypercalls", VCPU_STAT(hypercalls) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
@@ -2405,6 +2406,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.regs[VCPU_REGS_RAX] = ret;
 	kvm_x86_ops->decache_regs(vcpu);
+	++vcpu->stat.hypercalls;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
-- 
cgit v1.2.3


From 77cd337f2246ae72915538383e8f5a6b7ffb363d Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 19 Feb 2008 10:43:11 -0800
Subject: KVM: x86 emulator: fix sparse warnings in x86_emulate.c

Nesting __emulate_2op_nobyte inside__emulate_2op produces many shadowed
variable warnings on the internal variable _tmp used by both macros.

Change the outer macro to use __tmp.

Avoids a sparse warning like the following at every call site of __emulate_2op
arch/x86/kvm/x86_emulate.c:1091:3: warning: symbol '_tmp' shadows an earlier one
arch/x86/kvm/x86_emulate.c:1091:3: originally declared here
[18 more warnings suppressed]

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index cacdcf5f32d7..f59ed93f5d24 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -371,7 +371,7 @@ static u16 group2_table[] = {
 
 #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
 	do {								     \
-		unsigned long _tmp;					     \
+		unsigned long __tmp;					     \
 		switch ((_dst).bytes) {				             \
 		case 1:							     \
 			__asm__ __volatile__ (				     \
@@ -379,7 +379,7 @@ static u16 group2_table[] = {
 				_op"b %"_bx"3,%1; "			     \
 				_POST_EFLAGS("0", "4", "2")		     \
 				: "=m" (_eflags), "=m" ((_dst).val),	     \
-				  "=&r" (_tmp)				     \
+				  "=&r" (__tmp)				     \
 				: _by ((_src).val), "i" (EFLAGS_MASK));      \
 			break;						     \
 		default:						     \
-- 
cgit v1.2.3


From 4866d5e3d59c7831c7fa117c246a39165817db0d Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 19 Feb 2008 10:32:02 -0800
Subject: KVM: SVM: make iopm_base static

Fixes sparse warning as well.
arch/x86/kvm/svm.c:69:15: warning: symbol 'iopm_base' was not declared. Should it be static?

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7d73e935dcc1..ff6e5c8da3c6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -66,7 +66,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
-unsigned long iopm_base;
+static unsigned long iopm_base;
 
 struct kvm_ldttss_desc {
 	u16 limit0;
-- 
cgit v1.2.3


From 14af3f3c56103d8c3bb173c255ef5d89fb0c9350 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 19 Feb 2008 10:25:50 -0800
Subject: KVM: sparse fixes for kvm/x86.c

In two case statements, use the ever popular 'i' instead of index:
arch/x86/kvm/x86.c:1063:7: warning: symbol 'index' shadows an earlier one
arch/x86/kvm/x86.c:1000:9: originally declared here
arch/x86/kvm/x86.c:1079:7: warning: symbol 'index' shadows an earlier one
arch/x86/kvm/x86.c:1000:9: originally declared here

Make it static.
arch/x86/kvm/x86.c:1945:24: warning: symbol 'emulate_ops' was not declared. Should it be static?

Drop the return statements.
arch/x86/kvm/x86.c:2878:2: warning: returning void-valued expression
arch/x86/kvm/x86.c:2944:2: warning: returning void-valued expression

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 15bba5d37931..cf6261e3d928 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1083,32 +1083,32 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	}
 	/* function 4 and 0xb have additional index. */
 	case 4: {
-		int index, cache_type;
+		int i, cache_type;
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until cache_type is zero */
-		for (index = 1; *nent < maxnent; ++index) {
-			cache_type = entry[index - 1].eax & 0x1f;
+		for (i = 1; *nent < maxnent; ++i) {
+			cache_type = entry[i - 1].eax & 0x1f;
 			if (!cache_type)
 				break;
-			do_cpuid_1_ent(&entry[index], function, index);
-			entry[index].flags |=
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
 			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 			++*nent;
 		}
 		break;
 	}
 	case 0xb: {
-		int index, level_type;
+		int i, level_type;
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until level_type is zero */
-		for (index = 1; *nent < maxnent; ++index) {
-			level_type = entry[index - 1].ecx & 0xff;
+		for (i = 1; *nent < maxnent; ++i) {
+			level_type = entry[i - 1].ecx & 0xff;
 			if (!level_type)
 				break;
-			do_cpuid_1_ent(&entry[index], function, index);
-			entry[index].flags |=
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
 			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 			++*nent;
 		}
@@ -1965,7 +1965,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 }
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
-struct x86_emulate_ops emulate_ops = {
+static struct x86_emulate_ops emulate_ops = {
 	.read_std            = emulator_read_std,
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
@@ -2899,7 +2899,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 static void get_segment(struct kvm_vcpu *vcpu,
 			struct kvm_segment *var, int seg)
 {
-	return kvm_x86_ops->get_segment(vcpu, var, seg);
+	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -2965,7 +2965,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 static void set_segment(struct kvm_vcpu *vcpu,
 			struct kvm_segment *var, int seg)
 {
-	return kvm_x86_ops->set_segment(vcpu, var, seg);
+	kvm_x86_ops->set_segment(vcpu, var, seg);
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-- 
cgit v1.2.3


From 847f0ad8cbfa70c1af6948025836dfbd9ed6da1e Mon Sep 17 00:00:00 2001
From: Alexander Graf <alex@csgraf.de>
Date: Thu, 21 Feb 2008 12:11:01 +0100
Subject: KVM: Implement dummy values for MSR_PERF_STATUS

Darwin relies on this and ceases to work without.

Signed-off-by: Alexander Graf <alex@csgraf.de>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf6261e3d928..0dd038e7392b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -426,6 +426,7 @@ static u32 msrs_to_save[] = {
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
 	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+	MSR_IA32_PERF_STATUS,
 };
 
 static unsigned num_msrs_to_save;
@@ -653,7 +654,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MC0_MISC+12:
 	case MSR_IA32_MC0_MISC+16:
 	case MSR_IA32_UCODE_REV:
-	case MSR_IA32_PERF_STATUS:
 	case MSR_IA32_EBL_CR_POWERON:
 		/* MTRR registers */
 	case 0xfe:
@@ -669,6 +669,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->arch.ia32_misc_enable_msr;
 		break;
+	case MSR_IA32_PERF_STATUS:
+		/* TSC increment by tick */
+		data = 1000ULL;
+		/* CPU multiplier */
+		data |= (((uint64_t)4ULL) << 40);
+		break;
 	case MSR_EFER:
 		data = vcpu->arch.shadow_efer;
 		break;
-- 
cgit v1.2.3


From 2e53d63acba75795aa226febd140f67c58c6a353 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 20 Feb 2008 14:47:24 -0500
Subject: KVM: MMU: ignore zapped root pagetables

Mark zapped root pagetables as invalid and ignore such pages during lookup.

This is a problem with the cr3-target feature, where a zapped root table fools
the faulting code into creating a read-only mapping. The result is a lockup
if the instruction can't be emulated.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 12 ++++++++++--
 arch/x86/kvm/x86.c | 12 ++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f7541fe22cd8..103d008dab8b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -667,7 +667,8 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.metaphysical) {
+		if (sp->gfn == gfn && !sp->role.metaphysical
+		    && !sp->role.invalid) {
 			pgprintk("%s: found role %x\n",
 				 __FUNCTION__, sp->role.word);
 			return sp;
@@ -792,8 +793,11 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->root_count) {
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
-	} else
+	} else {
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
+		sp->role.invalid = 1;
+		kvm_reload_remote_mmus(kvm);
+	}
 	kvm_mmu_reset_last_pte_updated(kvm);
 }
 
@@ -1073,6 +1077,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
 		sp = page_header(root);
 		--sp->root_count;
+		if (!sp->root_count && sp->role.invalid)
+			kvm_mmu_zap_page(vcpu->kvm, sp);
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
@@ -1085,6 +1091,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 			root &= PT64_BASE_ADDR_MASK;
 			sp = page_header(root);
 			--sp->root_count;
+			if (!sp->root_count && sp->role.invalid)
+				kvm_mmu_zap_page(vcpu->kvm, sp);
 		}
 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0dd038e7392b..e8e64927bddc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2658,6 +2658,10 @@ preempted:
 		kvm_x86_ops->guest_debug_pre(vcpu);
 
 again:
+	if (vcpu->requests)
+		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+			kvm_mmu_unload(vcpu);
+
 	r = kvm_mmu_reload(vcpu);
 	if (unlikely(r))
 		goto out;
@@ -2689,6 +2693,14 @@ again:
 		goto out;
 	}
 
+	if (vcpu->requests)
+		if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
+			local_irq_enable();
+			preempt_enable();
+			r = 1;
+			goto out;
+		}
+
 	if (signal_pending(current)) {
 		local_irq_enable();
 		preempt_enable();
-- 
cgit v1.2.3


From 05da45583de9b383dc81dd695fe248431d6c9f2b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <marcelo@kvack.org>
Date: Sat, 23 Feb 2008 11:44:30 -0300
Subject: KVM: MMU: large page support

Create large pages mappings if the guest PTE's are marked as such and
the underlying memory is hugetlbfs backed.  If the largepage contains
write-protected pages, a large pte is not used.

Gives a consistent 2% improvement for data copies on ram mounted
filesystem, without NPT/EPT.

Anthony measures a 4% improvement on 4-way kernbench, with NPT.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 222 ++++++++++++++++++++++++++++++++++++++++-----
 arch/x86/kvm/paging_tmpl.h |  32 +++++--
 arch/x86/kvm/x86.c         |   1 +
 3 files changed, 224 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 103d008dab8b..1932a3aeda1d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/hugetlb.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -211,6 +212,11 @@ static int is_shadow_present_pte(u64 pte)
 		&& pte != shadow_notrap_nonpresent_pte;
 }
 
+static int is_large_pte(u64 pte)
+{
+	return pte & PT_PAGE_SIZE_MASK;
+}
+
 static int is_writeble_pte(unsigned long pte)
 {
 	return pte & PT_WRITABLE_MASK;
@@ -349,17 +355,101 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 	kfree(rd);
 }
 
+/*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+	unsigned long idx;
+
+	idx = (gfn / KVM_PAGES_PER_HPAGE) -
+	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+	return &slot->lpage_info[idx].write_count;
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+	int *write_count;
+
+	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	*write_count += 1;
+	WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+	int *write_count;
+
+	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	*write_count -= 1;
+	WARN_ON(*write_count < 0);
+}
+
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	int *largepage_idx;
+
+	if (slot) {
+		largepage_idx = slot_largepage_idx(gfn, slot);
+		return *largepage_idx;
+	}
+
+	return 1;
+}
+
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr;
+
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
+		return 0;
+
+	vma = find_vma(current->mm, addr);
+	if (vma && is_vm_hugetlb_page(vma))
+		return 1;
+
+	return 0;
+}
+
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	if (has_wrprotected_page(vcpu->kvm, large_gfn))
+		return 0;
+
+	if (!host_largepage_backed(vcpu->kvm, large_gfn))
+		return 0;
+
+	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+	if (slot && slot->dirty_bitmap)
+		return 0;
+
+	return 1;
+}
+
 /*
  * Take gfn and return the reverse mapping to it.
  * Note: gfn must be unaliased before this function get called
  */
 
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 {
 	struct kvm_memory_slot *slot;
+	unsigned long idx;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return &slot->rmap[gfn - slot->base_gfn];
+	if (!lpage)
+		return &slot->rmap[gfn - slot->base_gfn];
+
+	idx = (gfn / KVM_PAGES_PER_HPAGE) -
+	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+
+	return &slot->lpage_info[idx].rmap_pde;
 }
 
 /*
@@ -371,7 +461,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
  * containing more mappings.
  */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 {
 	struct kvm_mmu_page *sp;
 	struct kvm_rmap_desc *desc;
@@ -383,7 +473,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
 	sp->gfns[spte - sp->spt] = gfn;
-	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
 	if (!*rmapp) {
 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 		*rmapp = (unsigned long)spte;
@@ -449,7 +539,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 		kvm_release_page_dirty(page);
 	else
 		kvm_release_page_clean(page);
-	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 		BUG();
@@ -515,7 +605,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 	int write_protected = 0;
 
 	gfn = unalias_gfn(kvm, gfn);
-	rmapp = gfn_to_rmap(kvm, gfn);
+	rmapp = gfn_to_rmap(kvm, gfn, 0);
 
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
@@ -528,8 +618,27 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 	}
+	/* check for huge page mappings */
+	rmapp = gfn_to_rmap(kvm, gfn, 1);
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!spte);
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+		pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+		if (is_writeble_pte(*spte)) {
+			rmap_remove(kvm, spte);
+			--kvm->stat.lpages;
+			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+			write_protected = 1;
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+
 	if (write_protected)
 		kvm_flush_remote_tlbs(kvm);
+
+	account_shadowed(kvm, gfn);
 }
 
 #ifdef MMU_DEBUG
@@ -747,11 +856,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
 		ent = pt[i];
 
+		if (is_shadow_present_pte(ent)) {
+			if (!is_large_pte(ent)) {
+				ent &= PT64_BASE_ADDR_MASK;
+				mmu_page_remove_parent_pte(page_header(ent),
+							   &pt[i]);
+			} else {
+				--kvm->stat.lpages;
+				rmap_remove(kvm, &pt[i]);
+			}
+		}
 		pt[i] = shadow_trap_nonpresent_pte;
-		if (!is_shadow_present_pte(ent))
-			continue;
-		ent &= PT64_BASE_ADDR_MASK;
-		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
 	}
 	kvm_flush_remote_tlbs(kvm);
 }
@@ -791,6 +906,8 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 	kvm_mmu_page_unlink_children(kvm, sp);
 	if (!sp->root_count) {
+		if (!sp->role.metaphysical)
+			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
@@ -894,7 +1011,8 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, gfn_t gfn, struct page *page)
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 struct page *page)
 {
 	u64 spte;
 	int was_rmapped = 0;
@@ -907,15 +1025,29 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		 write_fault, user_fault, gfn);
 
 	if (is_rmap_pte(*shadow_pte)) {
-		if (host_pfn != page_to_pfn(page)) {
+		/*
+		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+		 * the parent of the now unreachable PTE.
+		 */
+		if (largepage && !is_large_pte(*shadow_pte)) {
+			struct kvm_mmu_page *child;
+			u64 pte = *shadow_pte;
+
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, shadow_pte);
+		} else if (host_pfn != page_to_pfn(page)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 host_pfn, page_to_pfn(page));
 			rmap_remove(vcpu->kvm, shadow_pte);
+		} else {
+			if (largepage)
+				was_rmapped = is_large_pte(*shadow_pte);
+			else
+				was_rmapped = 1;
 		}
-		else
-			was_rmapped = 1;
 	}
 
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -930,6 +1062,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	spte |= PT_PRESENT_MASK;
 	if (pte_access & ACC_USER_MASK)
 		spte |= PT_USER_MASK;
+	if (largepage)
+		spte |= PT_PAGE_SIZE_MASK;
 
 	spte |= page_to_phys(page);
 
@@ -944,7 +1078,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 
 		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow) {
+		if (shadow ||
+		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __FUNCTION__, gfn);
 			pte_access &= ~ACC_WRITE_MASK;
@@ -963,10 +1098,17 @@ unshadowed:
 		mark_page_dirty(vcpu->kvm, gfn);
 
 	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+	pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
+		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
 	set_shadow_pte(shadow_pte, spte);
+	if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+	    && (spte & PT_PRESENT_MASK))
+		++vcpu->kvm->stat.lpages;
+
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
 	if (!was_rmapped) {
-		rmap_add(vcpu, shadow_pte, gfn);
+		rmap_add(vcpu, shadow_pte, gfn, largepage);
 		if (!is_rmap_pte(*shadow_pte))
 			kvm_release_page_clean(page);
 	} else {
@@ -984,7 +1126,8 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			   gfn_t gfn, struct page *page, int level)
+			   int largepage, gfn_t gfn, struct page *page,
+			   int level)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
@@ -998,7 +1141,13 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, gfn, page);
+				     0, write, 1, &pt_write, 0, gfn, page);
+			return pt_write;
+		}
+
+		if (largepage && level == 2) {
+			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+				    0, write, 1, &pt_write, 1, gfn, page);
 			return pt_write;
 		}
 
@@ -1027,12 +1176,18 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
+	int largepage = 0;
 
 	struct page *page;
 
 	down_read(&vcpu->kvm->slots_lock);
 
 	down_read(&current->mm->mmap_sem);
+	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		largepage = 1;
+	}
+
 	page = gfn_to_page(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -1045,7 +1200,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL);
+	r = __direct_map(vcpu, v, write, largepage, gfn, page,
+			 PT32E_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	up_read(&vcpu->kvm->slots_lock);
@@ -1180,6 +1336,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 {
 	struct page *page;
 	int r;
+	int largepage = 0;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1189,7 +1347,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		return r;
 
 	down_read(&current->mm->mmap_sem);
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		largepage = 1;
+	}
+	page = gfn_to_page(vcpu->kvm, gfn);
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
 		up_read(&current->mm->mmap_sem);
@@ -1198,7 +1360,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL);
+			 largepage, gfn, page, TDP_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	up_read(&current->mm->mmap_sem);
 
@@ -1397,7 +1559,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
-		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+		if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+		    is_large_pte(pte))
 			rmap_remove(vcpu->kvm, spte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1405,6 +1568,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 		}
 	}
 	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	if (is_large_pte(pte))
+		--vcpu->kvm->stat.lpages;
 }
 
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
@@ -1412,7 +1577,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  u64 *spte,
 				  const void *new)
 {
-	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+	if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+	    && !vcpu->arch.update_pte.largepage) {
 		++vcpu->kvm->stat.mmu_pde_zapped;
 		return;
 	}
@@ -1460,6 +1626,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	u64 gpte = 0;
 	struct page *page;
 
+	vcpu->arch.update_pte.largepage = 0;
+
 	if (bytes != 4 && bytes != 8)
 		return;
 
@@ -1487,9 +1655,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-	down_read(&vcpu->kvm->slots_lock);
+	down_read(&current->mm->mmap_sem);
+	if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		vcpu->arch.update_pte.largepage = 1;
+	}
 	page = gfn_to_page(vcpu->kvm, gfn);
-	up_read(&vcpu->kvm->slots_lock);
+	up_read(&current->mm->mmap_sem);
 
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4b55f462e2b3..17f9d160ca34 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -248,6 +248,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pt_element_t gpte;
 	unsigned pte_access;
 	struct page *npage;
+	int largepage = vcpu->arch.update_pte.largepage;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -264,7 +265,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 		return;
 	get_page(npage);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+		     npage);
 }
 
 /*
@@ -272,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
  */
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *walker,
-			 int user_fault, int write_fault, int *ptwrite,
-			 struct page *page)
+			 int user_fault, int write_fault, int largepage,
+			 int *ptwrite, struct page *page)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -301,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
 		if (level == PT_PAGE_TABLE_LEVEL)
 			break;
-		if (is_shadow_present_pte(*shadow_ent)) {
+
+		if (largepage && level == PT_DIRECTORY_LEVEL)
+			break;
+
+		if (is_shadow_present_pte(*shadow_ent)
+		    && !is_large_pte(*shadow_ent)) {
 			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
 			continue;
 		}
 
+		if (is_large_pte(*shadow_ent))
+			rmap_remove(vcpu->kvm, shadow_ent);
+
 		if (level - 1 == PT_PAGE_TABLE_LEVEL
 		    && walker->level == PT_DIRECTORY_LEVEL) {
 			metaphysical = 1;
@@ -339,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, walker->gfn, page);
+		     ptwrite, largepage, walker->gfn, page);
 
 	return shadow_ent;
 }
@@ -369,6 +379,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int write_pt = 0;
 	int r;
 	struct page *page;
+	int largepage = 0;
 
 	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -396,6 +407,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	}
 
 	down_read(&current->mm->mmap_sem);
+	if (walker.level == PT_DIRECTORY_LEVEL) {
+		gfn_t large_gfn;
+		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+		if (is_largepage_backed(vcpu, large_gfn)) {
+			walker.gfn = large_gfn;
+			largepage = 1;
+		}
+	}
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -410,7 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  &write_pt, page);
+				  largepage, &write_pt, page);
+
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
 		 shadow_pte, *shadow_pte, write_pt);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e8e64927bddc..0458bd516185 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
 };
 
-- 
cgit v1.2.3


From 2d3ad1f40c841bd3e97d30d423eea53915d085dc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 24 Feb 2008 11:20:43 +0200
Subject: KVM: Prefix control register accessors with kvm_ to avoid namespace
 pollution

Names like 'set_cr3()' look dangerously close to affecting the host.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 14 +++++++-------
 arch/x86/kvm/x86.c | 46 +++++++++++++++++++++++-----------------------
 2 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f46ad03c2521..50345032974d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1683,7 +1683,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx->vcpu.arch.rmode.active = 0;
 
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-	set_cr8(&vmx->vcpu, 0);
+	kvm_set_cr8(&vmx->vcpu, 0);
 	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
 	if (vmx->vcpu.vcpu_id == 0)
 		msr |= MSR_IA32_APICBASE_BSP;
@@ -2026,22 +2026,22 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		switch (cr) {
 		case 0:
 			vcpu_load_rsp_rip(vcpu);
-			set_cr0(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 3:
 			vcpu_load_rsp_rip(vcpu);
-			set_cr3(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 4:
 			vcpu_load_rsp_rip(vcpu);
-			set_cr4(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 8:
 			vcpu_load_rsp_rip(vcpu);
-			set_cr8(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
 			skip_emulated_instruction(vcpu);
 			if (irqchip_in_kernel(vcpu->kvm))
 				return 1;
@@ -2067,14 +2067,14 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			return 1;
 		case 8:
 			vcpu_load_rsp_rip(vcpu);
-			vcpu->arch.regs[reg] = get_cr8(vcpu);
+			vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
 			vcpu_put_rsp_rip(vcpu);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		}
 		break;
 	case 3: /* lmsw */
-		lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+		kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
 
 		skip_emulated_instruction(vcpu);
 		return 1;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0458bd516185..dbcff38dfcc3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -237,7 +237,7 @@ out:
 	return changed;
 }
 
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	if (cr0 & CR0_RESERVED_BITS) {
 		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
@@ -295,15 +295,15 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	kvm_mmu_reset_context(vcpu);
 	return;
 }
-EXPORT_SYMBOL_GPL(set_cr0);
+EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-	set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 }
-EXPORT_SYMBOL_GPL(lmsw);
+EXPORT_SYMBOL_GPL(kvm_lmsw);
 
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	if (cr4 & CR4_RESERVED_BITS) {
 		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
@@ -334,9 +334,9 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	vcpu->arch.cr4 = cr4;
 	kvm_mmu_reset_context(vcpu);
 }
-EXPORT_SYMBOL_GPL(set_cr4);
+EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 		kvm_mmu_flush_tlb(vcpu);
@@ -388,9 +388,9 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	}
 	up_read(&vcpu->kvm->slots_lock);
 }
-EXPORT_SYMBOL_GPL(set_cr3);
+EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 	if (cr8 & CR8_RESERVED_BITS) {
 		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
@@ -402,16 +402,16 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 	else
 		vcpu->arch.cr8 = cr8;
 }
-EXPORT_SYMBOL_GPL(set_cr8);
+EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
-unsigned long get_cr8(struct kvm_vcpu *vcpu)
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 {
 	if (irqchip_in_kernel(vcpu->kvm))
 		return kvm_lapic_get_cr8(vcpu);
 	else
 		return vcpu->arch.cr8;
 }
-EXPORT_SYMBOL_GPL(get_cr8);
+EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -2462,7 +2462,7 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 		   unsigned long *rflags)
 {
-	lmsw(vcpu, msw);
+	kvm_lmsw(vcpu, msw);
 	*rflags = kvm_x86_ops->get_rflags(vcpu);
 }
 
@@ -2479,7 +2479,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 	case 4:
 		return vcpu->arch.cr4;
 	case 8:
-		return get_cr8(vcpu);
+		return kvm_get_cr8(vcpu);
 	default:
 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
 		return 0;
@@ -2491,20 +2491,20 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
 {
 	switch (cr) {
 	case 0:
-		set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+		kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
 		*rflags = kvm_x86_ops->get_rflags(vcpu);
 		break;
 	case 2:
 		vcpu->arch.cr2 = val;
 		break;
 	case 3:
-		set_cr3(vcpu, val);
+		kvm_set_cr3(vcpu, val);
 		break;
 	case 4:
-		set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+		kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
 		break;
 	case 8:
-		set_cr8(vcpu, val & 0xfUL);
+		kvm_set_cr8(vcpu, val & 0xfUL);
 		break;
 	default:
 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
@@ -2602,7 +2602,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
 	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-	kvm_run->cr8 = get_cr8(vcpu);
+	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
 	if (irqchip_in_kernel(vcpu->kvm))
 		kvm_run->ready_for_interrupt_injection = 1;
@@ -2803,7 +2803,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	/* re-sync apic's tpr */
 	if (!irqchip_in_kernel(vcpu->kvm))
-		set_cr8(vcpu, kvm_run->cr8);
+		kvm_set_cr8(vcpu, kvm_run->cr8);
 
 	if (vcpu->arch.pio.cur_count) {
 		r = complete_pio(vcpu);
@@ -2961,7 +2961,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->cr2 = vcpu->arch.cr2;
 	sregs->cr3 = vcpu->arch.cr3;
 	sregs->cr4 = vcpu->arch.cr4;
-	sregs->cr8 = get_cr8(vcpu);
+	sregs->cr8 = kvm_get_cr8(vcpu);
 	sregs->efer = vcpu->arch.shadow_efer;
 	sregs->apic_base = kvm_get_apic_base(vcpu);
 
@@ -3007,7 +3007,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
 	vcpu->arch.cr3 = sregs->cr3;
 
-	set_cr8(vcpu, sregs->cr8);
+	kvm_set_cr8(vcpu, sregs->cr8);
 
 	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
 	kvm_x86_ops->set_efer(vcpu, sregs->efer);
-- 
cgit v1.2.3


From 71c4dfafc0932d92cc99c7e839d25174b0ce10a1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 26 Feb 2008 16:49:16 +0100
Subject: KVM: detect if VCPU triple faults

In the current inject_page_fault path KVM only checks if there is another PF
pending and injects a DF then. But it has to check for a pending DF too to
detect a shutdown condition in the VCPU.  If this is not detected the VCPU goes
to a PF -> DF -> PF loop when it should triple fault. This patch detects this
condition and handles it with an KVM_SHUTDOWN exit to userspace.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dbcff38dfcc3..491eda308289 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -155,11 +155,16 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 			   u32 error_code)
 {
 	++vcpu->stat.pf_guest;
-	if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
-		printk(KERN_DEBUG "kvm: inject_page_fault:"
-		       " double fault 0x%lx\n", addr);
-		vcpu->arch.exception.nr = DF_VECTOR;
-		vcpu->arch.exception.error_code = 0;
+	if (vcpu->arch.exception.pending) {
+		if (vcpu->arch.exception.nr == PF_VECTOR) {
+			printk(KERN_DEBUG "kvm: inject_page_fault:"
+					" double fault 0x%lx\n", addr);
+			vcpu->arch.exception.nr = DF_VECTOR;
+			vcpu->arch.exception.error_code = 0;
+		} else if (vcpu->arch.exception.nr == DF_VECTOR) {
+			/* triple fault -> shutdown */
+			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		}
 		return;
 	}
 	vcpu->arch.cr2 = addr;
@@ -2676,6 +2681,11 @@ again:
 			r = 0;
 			goto out;
 		}
+		if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+			kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+			r = 0;
+			goto out;
+		}
 	}
 
 	kvm_inject_pending_timer_irqs(vcpu);
-- 
cgit v1.2.3


From b8688d51bbe4872fbcec751e04369606082ac610 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 3 Mar 2008 12:59:56 -0800
Subject: KVM: replace remaining __FUNCTION__ occurances

__FUNCTION__ is gcc-specific, use __func__

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/lapic.c       |  8 ++++----
 arch/x86/kvm/mmu.c         | 35 +++++++++++++++++------------------
 arch/x86/kvm/paging_tmpl.h | 14 +++++++-------
 arch/x86/kvm/svm.c         | 14 +++++++-------
 arch/x86/kvm/vmx.c         |  6 +++---
 arch/x86/kvm/x86.c         | 12 ++++++------
 6 files changed, 44 insertions(+), 45 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 68a6b1511934..31280df7d2e3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -658,7 +658,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
 			   PRIx64 ", "
 			   "timer initial count 0x%x, period %lldns, "
-			   "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
+			   "expire @ 0x%016" PRIx64 ".\n", __func__,
 			   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
 			   apic_get_reg(apic, APIC_TMICT),
 			   apic->timer.period,
@@ -691,7 +691,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 	/* too common printing */
 	if (offset != APIC_EOI)
 		apic_debug("%s: offset 0x%x with length 0x%x, and value is "
-			   "0x%x\n", __FUNCTION__, offset, len, val);
+			   "0x%x\n", __func__, offset, len, val);
 
 	offset &= 0xff0;
 
@@ -869,7 +869,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic;
 	int i;
 
-	apic_debug("%s\n", __FUNCTION__);
+	apic_debug("%s\n", __func__);
 
 	ASSERT(vcpu);
 	apic = vcpu->arch.apic;
@@ -907,7 +907,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	apic_update_ppr(apic);
 
 	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
-		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
+		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
 		   vcpu, kvm_apic_id(apic),
 		   vcpu->arch.apic_base, apic->base_address);
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1932a3aeda1d..414405b6ec13 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -649,7 +649,7 @@ static int is_empty_shadow_page(u64 *spt)
 
 	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
 		if (*pos != shadow_trap_nonpresent_pte) {
-			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+			printk(KERN_ERR "%s: %p %llx\n", __func__,
 			       pos, *pos);
 			return 0;
 		}
@@ -772,14 +772,14 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node;
 
-	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry(sp, node, bucket, hash_link)
 		if (sp->gfn == gfn && !sp->role.metaphysical
 		    && !sp->role.invalid) {
 			pgprintk("%s: found role %x\n",
-				 __FUNCTION__, sp->role.word);
+				 __func__, sp->role.word);
 			return sp;
 		}
 	return NULL;
@@ -810,21 +810,21 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
 	}
-	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+	pgprintk("%s: looking gfn %lx role %x\n", __func__,
 		 gfn, role.word);
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry(sp, node, bucket, hash_link)
 		if (sp->gfn == gfn && sp->role.word == role.word) {
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
-			pgprintk("%s: found\n", __FUNCTION__);
+			pgprintk("%s: found\n", __func__);
 			return sp;
 		}
 	++vcpu->kvm->stat.mmu_cache_miss;
 	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
 	if (!sp)
 		return sp;
-	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+	pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
 	sp->gfn = gfn;
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
@@ -960,13 +960,13 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	struct hlist_node *node, *n;
 	int r;
 
-	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
 	r = 0;
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
 		if (sp->gfn == gfn && !sp->role.metaphysical) {
-			pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
 			kvm_mmu_zap_page(kvm, sp);
 			r = 1;
@@ -979,7 +979,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 	struct kvm_mmu_page *sp;
 
 	while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
-		pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+		pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
 		kvm_mmu_zap_page(kvm, sp);
 	}
 }
@@ -1021,7 +1021,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
 		 " user_fault %d gfn %lx\n",
-		 __FUNCTION__, *shadow_pte, pt_access,
+		 __func__, *shadow_pte, pt_access,
 		 write_fault, user_fault, gfn);
 
 	if (is_rmap_pte(*shadow_pte)) {
@@ -1047,7 +1047,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 
-
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -1081,7 +1080,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		if (shadow ||
 		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
-				 __FUNCTION__, gfn);
+				 __func__, gfn);
 			pte_access &= ~ACC_WRITE_MASK;
 			if (is_writeble_pte(spte)) {
 				spte &= ~PT_WRITABLE_MASK;
@@ -1097,7 +1096,7 @@ unshadowed:
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
 
-	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+	pgprintk("%s: setting spte %llx\n", __func__, spte);
 	pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
 		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
 		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
@@ -1317,7 +1316,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn_t gfn;
 	int r;
 
-	pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -1395,7 +1394,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
-	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+	pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
 	mmu_free_roots(vcpu);
 }
 
@@ -1691,7 +1690,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int npte;
 	int r;
 
-	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
@@ -2139,7 +2138,7 @@ static void audit_rmap(struct kvm_vcpu *vcpu)
 
 	if (n_rmap != n_actual)
 		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
-		       __FUNCTION__, audit_msg, n_rmap, n_actual);
+		       __func__, audit_msg, n_rmap, n_actual);
 }
 
 static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -2159,7 +2158,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		if (*rmapp)
 			printk(KERN_ERR "%s: (%s) shadow page has writable"
 			       " mappings: gfn %lx role %x\n",
-			       __FUNCTION__, audit_msg, sp->gfn,
+			       __func__, audit_msg, sp->gfn,
 			       sp->role.word);
 	}
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 17f9d160ca34..57abbd091143 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -130,7 +130,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 	unsigned index, pt_access, pte_access;
 	gpa_t pte_gpa;
 
-	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+	pgprintk("%s: addr %lx\n", __func__, addr);
 walk:
 	walker->level = vcpu->arch.mmu.root_level;
 	pte = vcpu->arch.cr3;
@@ -155,7 +155,7 @@ walk:
 		pte_gpa += index * sizeof(pt_element_t);
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
-		pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+		pgprintk("%s: table_gfn[%d] %lx\n", __func__,
 			 walker->level - 1, table_gfn);
 
 		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
@@ -222,7 +222,7 @@ walk:
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;
 	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
-		 __FUNCTION__, (u64)pte, pt_access, pte_access);
+		 __func__, (u64)pte, pt_access, pte_access);
 	return 1;
 
 not_present:
@@ -256,7 +256,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 			set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
 		return;
 	}
-	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
 	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
 		return;
@@ -381,7 +381,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	struct page *page;
 	int largepage = 0;
 
-	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
 
 	r = mmu_topup_memory_caches(vcpu);
@@ -399,7 +399,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	 * The page is not mapped by the guest.  Let the guest handle it.
 	 */
 	if (!r) {
-		pgprintk("%s: guest page fault\n", __FUNCTION__);
+		pgprintk("%s: guest page fault\n", __func__);
 		inject_page_fault(vcpu, addr, walker.error_code);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 		up_read(&vcpu->kvm->slots_lock);
@@ -431,7 +431,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 				  largepage, &write_pt, page);
 
-	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 shadow_pte, *shadow_pte, write_pt);
 
 	if (!write_pt)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ff6e5c8da3c6..b2c667fe6832 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -230,12 +230,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	if (!svm->next_rip) {
-		printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
+		printk(KERN_DEBUG "%s: NOP\n", __func__);
 		return;
 	}
 	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
 		printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
-		       __FUNCTION__,
+		       __func__,
 		       svm->vmcb->save.rip,
 		       svm->next_rip);
 
@@ -996,7 +996,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
 	}
 	default:
 		printk(KERN_DEBUG "%s: unexpected dr %u\n",
-		       __FUNCTION__, dr);
+		       __func__, dr);
 		*exception = UD_VECTOR;
 		return;
 	}
@@ -1109,7 +1109,7 @@ static int invalid_op_interception(struct vcpu_svm *svm,
 static int task_switch_interception(struct vcpu_svm *svm,
 				    struct kvm_run *kvm_run)
 {
-	pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
+	pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __func__);
 	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
 	return 0;
 }
@@ -1125,7 +1125,7 @@ static int emulate_on_interception(struct vcpu_svm *svm,
 				   struct kvm_run *kvm_run)
 {
 	if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
-		pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
+		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
 	return 1;
 }
 
@@ -1257,7 +1257,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!svm_has(SVM_FEATURE_LBRV)) {
 			pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
-					__FUNCTION__, data);
+					__func__, data);
 			break;
 		}
 		if (data & DEBUGCTL_RESERVED_BITS)
@@ -1419,7 +1419,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	    exit_code != SVM_EXIT_NPF)
 		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
 		       "exit_code 0x%x\n",
-		       __FUNCTION__, svm->vmcb->control.exit_int_info,
+		       __func__, svm->vmcb->control.exit_int_info,
 		       exit_code);
 
 	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 50345032974d..7ef710afceba 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1254,7 +1254,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
 	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
 		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
-		       __FUNCTION__);
+		       __func__);
 		vmcs_write32(GUEST_TR_AR_BYTES,
 			     (guest_tr_ar & ~AR_TYPE_MASK)
 			     | AR_TYPE_BUSY_64_TSS);
@@ -1909,7 +1909,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
 						!is_page_fault(intr_info))
 		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
-		       "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
+		       "intr info 0x%x\n", __func__, vect_info, intr_info);
 
 	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
 		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
@@ -2275,7 +2275,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 				exit_reason != EXIT_REASON_EXCEPTION_NMI)
 		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
-		       "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
+		       "exit reason is 0x%x\n", __func__, exit_reason);
 	if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason])
 		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 491eda308289..bf78d6522d3d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -563,15 +563,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case MSR_IA32_MC0_STATUS:
 		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-		       __FUNCTION__, data);
+		       __func__, data);
 		break;
 	case MSR_IA32_MCG_STATUS:
 		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-			__FUNCTION__, data);
+			__func__, data);
 		break;
 	case MSR_IA32_MCG_CTL:
 		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
-			__FUNCTION__, data);
+			__func__, data);
 		break;
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_UCODE_WRITE:
@@ -1939,7 +1939,7 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
 		*dest = kvm_x86_ops->get_dr(vcpu, dr);
 		return X86EMUL_CONTINUE;
 	default:
-		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
+		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
 		return X86EMUL_UNHANDLEABLE;
 	}
 }
@@ -2486,7 +2486,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 	case 8:
 		return kvm_get_cr8(vcpu);
 	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
 		return 0;
 	}
 }
@@ -2512,7 +2512,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
 		kvm_set_cr8(vcpu, val & 0xfUL);
 		break;
 	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
 	}
 }
 
-- 
cgit v1.2.3


From 019960ae9933161c2809fa4ee608ba30d9639fd2 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 4 Mar 2008 10:44:51 +0200
Subject: KVM: VMX: Don't adjust tsc offset forward

Most Intel hosts have a stable tsc, and playing with the offset only
reduces accuracy.  By limiting tsc offset adjustment only to forward updates,
we effectively disable tsc offset adjustment on these hosts.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7ef710afceba..fb0389d3a8d1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -519,7 +519,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 phys_addr = __pa(vmx->vmcs);
-	u64 tsc_this, delta;
+	u64 tsc_this, delta, new_offset;
 
 	if (vcpu->cpu != cpu) {
 		vcpu_clear(vmx);
@@ -559,8 +559,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * Make sure the time stamp counter is monotonous.
 		 */
 		rdtscll(tsc_this);
-		delta = vcpu->arch.host_tsc - tsc_this;
-		vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
+		if (tsc_this < vcpu->arch.host_tsc) {
+			delta = vcpu->arch.host_tsc - tsc_this;
+			new_offset = vmcs_read64(TSC_OFFSET) + delta;
+			vmcs_write64(TSC_OFFSET, new_offset);
+		}
 	}
 }
 
-- 
cgit v1.2.3


From 4fcaa98267efc4d39ded9b0bc33c6b4a2f62fecd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 5 Mar 2008 09:33:44 +0200
Subject: KVM: Remove pointless desc_ptr #ifdef

The desc_struct changes left an unnecessary #ifdef; remove it.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b2c667fe6832..51741f96e7fb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -290,11 +290,7 @@ static void svm_hardware_enable(void *garbage)
 
 	struct svm_cpu_data *svm_data;
 	uint64_t efer;
-#ifdef CONFIG_X86_64
-	struct desc_ptr gdt_descr;
-#else
 	struct desc_ptr gdt_descr;
-#endif
 	struct desc_struct *gdt;
 	int me = raw_smp_processor_id();
 
-- 
cgit v1.2.3


From 7837699fa6d7adf81f26ab73a5f6897ea1ab9d6a Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Mon, 28 Jan 2008 05:10:22 +0800
Subject: KVM: In kernel PIT model

The patch moves the PIT model from userspace to kernel, and increases
the timer accuracy greatly.

[marcelo: make last_injected_time per-guest]

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Tested-and-Acked-by: Alex Davis <alex14641@yahoo.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/Makefile |   3 +-
 arch/x86/kvm/i8254.c  | 586 ++++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/i8254.h  |  61 ++++++
 arch/x86/kvm/irq.c    |   3 +
 arch/x86/kvm/x86.c    |   9 +
 5 files changed, 661 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kvm/i8254.c
 create mode 100644 arch/x86/kvm/i8254.h

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index ffdd0b310784..4d0c22e11f1a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
+	i8254.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
new file mode 100644
index 000000000000..c7435093bbee
--- /dev/null
+++ b/arch/x86/kvm/i8254.c
@@ -0,0 +1,586 @@
+/*
+ * 8253/8254 interval timer emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2006 Intel Corporation
+ * Copyright (c) 2007 Keir Fraser, XenSource Inc
+ * Copyright (c) 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * Authors:
+ *   Sheng Yang <sheng.yang@intel.com>
+ *   Based on QEMU and Xen.
+ */
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "i8254.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
+/* Compute with 96 bit intermediate result: (a*b)/c */
+static u64 muldiv64(u64 a, u32 b, u32 c)
+{
+	union {
+		u64 ll;
+		struct {
+			u32 low, high;
+		} l;
+	} u, res;
+	u64 rl, rh;
+
+	u.ll = a;
+	rl = (u64)u.l.low * (u64)b;
+	rh = (u64)u.l.high * (u64)b;
+	rh += (rl >> 32);
+	res.l.high = div64_64(rh, c);
+	res.l.low = div64_64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
+	return res.ll;
+}
+
+static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
+{
+	struct kvm_kpit_channel_state *c =
+		&kvm->arch.vpit->pit_state.channels[channel];
+
+	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+	switch (c->mode) {
+	default:
+	case 0:
+	case 4:
+		/* XXX: just disable/enable counting */
+		break;
+	case 1:
+	case 2:
+	case 3:
+	case 5:
+		/* Restart counting on rising edge. */
+		if (c->gate < val)
+			c->count_load_time = ktime_get();
+		break;
+	}
+
+	c->gate = val;
+}
+
+int pit_get_gate(struct kvm *kvm, int channel)
+{
+	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+	return kvm->arch.vpit->pit_state.channels[channel].gate;
+}
+
+static int pit_get_count(struct kvm *kvm, int channel)
+{
+	struct kvm_kpit_channel_state *c =
+		&kvm->arch.vpit->pit_state.channels[channel];
+	s64 d, t;
+	int counter;
+
+	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+	t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+	d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+	switch (c->mode) {
+	case 0:
+	case 1:
+	case 4:
+	case 5:
+		counter = (c->count - d) & 0xffff;
+		break;
+	case 3:
+		/* XXX: may be incorrect for odd counts */
+		counter = c->count - (mod_64((2 * d), c->count));
+		break;
+	default:
+		counter = c->count - mod_64(d, c->count);
+		break;
+	}
+	return counter;
+}
+
+static int pit_get_out(struct kvm *kvm, int channel)
+{
+	struct kvm_kpit_channel_state *c =
+		&kvm->arch.vpit->pit_state.channels[channel];
+	s64 d, t;
+	int out;
+
+	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+	t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+	d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+	switch (c->mode) {
+	default:
+	case 0:
+		out = (d >= c->count);
+		break;
+	case 1:
+		out = (d < c->count);
+		break;
+	case 2:
+		out = ((mod_64(d, c->count) == 0) && (d != 0));
+		break;
+	case 3:
+		out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+		break;
+	case 4:
+	case 5:
+		out = (d == c->count);
+		break;
+	}
+
+	return out;
+}
+
+static void pit_latch_count(struct kvm *kvm, int channel)
+{
+	struct kvm_kpit_channel_state *c =
+		&kvm->arch.vpit->pit_state.channels[channel];
+
+	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+	if (!c->count_latched) {
+		c->latched_count = pit_get_count(kvm, channel);
+		c->count_latched = c->rw_mode;
+	}
+}
+
+static void pit_latch_status(struct kvm *kvm, int channel)
+{
+	struct kvm_kpit_channel_state *c =
+		&kvm->arch.vpit->pit_state.channels[channel];
+
+	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+	if (!c->status_latched) {
+		/* TODO: Return NULL COUNT (bit 6). */
+		c->status = ((pit_get_out(kvm, channel) << 7) |
+				(c->rw_mode << 4) |
+				(c->mode << 1) |
+				c->bcd);
+		c->status_latched = 1;
+	}
+}
+
+int __pit_timer_fn(struct kvm_kpit_state *ps)
+{
+	struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
+	struct kvm_kpit_timer *pt = &ps->pit_timer;
+
+	atomic_inc(&pt->pending);
+	smp_mb__after_atomic_inc();
+	/* FIXME: handle case where the guest is in guest mode */
+	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
+		vcpu0->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+		wake_up_interruptible(&vcpu0->wq);
+	}
+
+	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
+	pt->scheduled = ktime_to_ns(pt->timer.expires);
+
+	return (pt->period == 0 ? 0 : 1);
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+	struct kvm_kpit_state *ps;
+	int restart_timer = 0;
+
+	ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
+
+	restart_timer = __pit_timer_fn(ps);
+
+	if (restart_timer)
+		return HRTIMER_RESTART;
+	else
+		return HRTIMER_NORESTART;
+}
+
+static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+{
+	pr_debug("pit: execute del timer!\n");
+	hrtimer_cancel(&pt->timer);
+}
+
+static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+{
+	s64 interval;
+
+	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+	pr_debug("pit: create pit timer, interval is %llu nsec\n", interval);
+
+	/* TODO The new value only affected after the retriggered */
+	hrtimer_cancel(&pt->timer);
+	pt->period = (is_period == 0) ? 0 : interval;
+	pt->timer.function = pit_timer_fn;
+	atomic_set(&pt->pending, 0);
+
+	hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+		      HRTIMER_MODE_ABS);
+}
+
+static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+	WARN_ON(!mutex_is_locked(&ps->lock));
+
+	pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
+
+	/*
+	 * Though spec said the state of 8254 is undefined after power-up,
+	 * seems some tricky OS like Windows XP depends on IRQ0 interrupt
+	 * when booting up.
+	 * So here setting initialize rate for it, and not a specific number
+	 */
+	if (val == 0)
+		val = 0x10000;
+
+	ps->channels[channel].count_load_time = ktime_get();
+	ps->channels[channel].count = val;
+
+	if (channel != 0)
+		return;
+
+	/* Two types of timer
+	 * mode 1 is one shot, mode 2 is period, otherwise del timer */
+	switch (ps->channels[0].mode) {
+	case 1:
+		create_pit_timer(&ps->pit_timer, val, 0);
+		break;
+	case 2:
+		create_pit_timer(&ps->pit_timer, val, 1);
+		break;
+	default:
+		destroy_pit_timer(&ps->pit_timer);
+	}
+}
+
+static void pit_ioport_write(struct kvm_io_device *this,
+			     gpa_t addr, int len, const void *data)
+{
+	struct kvm_pit *pit = (struct kvm_pit *)this->private;
+	struct kvm_kpit_state *pit_state = &pit->pit_state;
+	struct kvm *kvm = pit->kvm;
+	int channel, access;
+	struct kvm_kpit_channel_state *s;
+	u32 val = *(u32 *) data;
+
+	val  &= 0xff;
+	addr &= KVM_PIT_CHANNEL_MASK;
+
+	mutex_lock(&pit_state->lock);
+
+	if (val != 0)
+		pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n",
+			  (unsigned int)addr, len, val);
+
+	if (addr == 3) {
+		channel = val >> 6;
+		if (channel == 3) {
+			/* Read-Back Command. */
+			for (channel = 0; channel < 3; channel++) {
+				s = &pit_state->channels[channel];
+				if (val & (2 << channel)) {
+					if (!(val & 0x20))
+						pit_latch_count(kvm, channel);
+					if (!(val & 0x10))
+						pit_latch_status(kvm, channel);
+				}
+			}
+		} else {
+			/* Select Counter <channel>. */
+			s = &pit_state->channels[channel];
+			access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+			if (access == 0) {
+				pit_latch_count(kvm, channel);
+			} else {
+				s->rw_mode = access;
+				s->read_state = access;
+				s->write_state = access;
+				s->mode = (val >> 1) & 7;
+				if (s->mode > 5)
+					s->mode -= 4;
+				s->bcd = val & 1;
+			}
+		}
+	} else {
+		/* Write Count. */
+		s = &pit_state->channels[addr];
+		switch (s->write_state) {
+		default:
+		case RW_STATE_LSB:
+			pit_load_count(kvm, addr, val);
+			break;
+		case RW_STATE_MSB:
+			pit_load_count(kvm, addr, val << 8);
+			break;
+		case RW_STATE_WORD0:
+			s->write_latch = val;
+			s->write_state = RW_STATE_WORD1;
+			break;
+		case RW_STATE_WORD1:
+			pit_load_count(kvm, addr, s->write_latch | (val << 8));
+			s->write_state = RW_STATE_WORD0;
+			break;
+		}
+	}
+
+	mutex_unlock(&pit_state->lock);
+}
+
+static void pit_ioport_read(struct kvm_io_device *this,
+			    gpa_t addr, int len, void *data)
+{
+	struct kvm_pit *pit = (struct kvm_pit *)this->private;
+	struct kvm_kpit_state *pit_state = &pit->pit_state;
+	struct kvm *kvm = pit->kvm;
+	int ret, count;
+	struct kvm_kpit_channel_state *s;
+
+	addr &= KVM_PIT_CHANNEL_MASK;
+	s = &pit_state->channels[addr];
+
+	mutex_lock(&pit_state->lock);
+
+	if (s->status_latched) {
+		s->status_latched = 0;
+		ret = s->status;
+	} else if (s->count_latched) {
+		switch (s->count_latched) {
+		default:
+		case RW_STATE_LSB:
+			ret = s->latched_count & 0xff;
+			s->count_latched = 0;
+			break;
+		case RW_STATE_MSB:
+			ret = s->latched_count >> 8;
+			s->count_latched = 0;
+			break;
+		case RW_STATE_WORD0:
+			ret = s->latched_count & 0xff;
+			s->count_latched = RW_STATE_MSB;
+			break;
+		}
+	} else {
+		switch (s->read_state) {
+		default:
+		case RW_STATE_LSB:
+			count = pit_get_count(kvm, addr);
+			ret = count & 0xff;
+			break;
+		case RW_STATE_MSB:
+			count = pit_get_count(kvm, addr);
+			ret = (count >> 8) & 0xff;
+			break;
+		case RW_STATE_WORD0:
+			count = pit_get_count(kvm, addr);
+			ret = count & 0xff;
+			s->read_state = RW_STATE_WORD1;
+			break;
+		case RW_STATE_WORD1:
+			count = pit_get_count(kvm, addr);
+			ret = (count >> 8) & 0xff;
+			s->read_state = RW_STATE_WORD0;
+			break;
+		}
+	}
+
+	if (len > sizeof(ret))
+		len = sizeof(ret);
+	memcpy(data, (char *)&ret, len);
+
+	mutex_unlock(&pit_state->lock);
+}
+
+static int pit_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+	return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+		(addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static void speaker_ioport_write(struct kvm_io_device *this,
+				 gpa_t addr, int len, const void *data)
+{
+	struct kvm_pit *pit = (struct kvm_pit *)this->private;
+	struct kvm_kpit_state *pit_state = &pit->pit_state;
+	struct kvm *kvm = pit->kvm;
+	u32 val = *(u32 *) data;
+
+	mutex_lock(&pit_state->lock);
+	pit_state->speaker_data_on = (val >> 1) & 1;
+	pit_set_gate(kvm, 2, val & 1);
+	mutex_unlock(&pit_state->lock);
+}
+
+static void speaker_ioport_read(struct kvm_io_device *this,
+				gpa_t addr, int len, void *data)
+{
+	struct kvm_pit *pit = (struct kvm_pit *)this->private;
+	struct kvm_kpit_state *pit_state = &pit->pit_state;
+	struct kvm *kvm = pit->kvm;
+	unsigned int refresh_clock;
+	int ret;
+
+	/* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
+	refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
+
+	mutex_lock(&pit_state->lock);
+	ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
+		(pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+	if (len > sizeof(ret))
+		len = sizeof(ret);
+	memcpy(data, (char *)&ret, len);
+	mutex_unlock(&pit_state->lock);
+}
+
+static int speaker_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+	return (addr == KVM_SPEAKER_BASE_ADDRESS);
+}
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+{
+	int i;
+	struct kvm_pit *pit;
+	struct kvm_kpit_state *pit_state;
+	struct kvm_kpit_channel_state *c;
+
+	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+	if (!pit)
+		return NULL;
+
+	mutex_init(&pit->pit_state.lock);
+	mutex_lock(&pit->pit_state.lock);
+
+	/* Initialize PIO device */
+	pit->dev.read = pit_ioport_read;
+	pit->dev.write = pit_ioport_write;
+	pit->dev.in_range = pit_in_range;
+	pit->dev.private = pit;
+	kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+
+	pit->speaker_dev.read = speaker_ioport_read;
+	pit->speaker_dev.write = speaker_ioport_write;
+	pit->speaker_dev.in_range = speaker_in_range;
+	pit->speaker_dev.private = pit;
+	kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+
+	kvm->arch.vpit = pit;
+	pit->kvm = kvm;
+
+	pit_state = &pit->pit_state;
+	pit_state->pit = pit;
+	hrtimer_init(&pit_state->pit_timer.timer,
+		     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	atomic_set(&pit_state->pit_timer.pending, 0);
+	for (i = 0; i < 3; i++) {
+		c = &pit_state->channels[i];
+		c->mode = 0xff;
+		c->gate = (i != 2);
+		pit_load_count(kvm, i, 0);
+	}
+
+	mutex_unlock(&pit->pit_state.lock);
+
+	pit->pit_state.inject_pending = 1;
+
+	return pit;
+}
+
+void kvm_free_pit(struct kvm *kvm)
+{
+	struct hrtimer *timer;
+
+	if (kvm->arch.vpit) {
+		mutex_lock(&kvm->arch.vpit->pit_state.lock);
+		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+		hrtimer_cancel(timer);
+		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+		kfree(kvm->arch.vpit);
+	}
+}
+
+void __inject_pit_timer_intr(struct kvm *kvm)
+{
+	mutex_lock(&kvm->lock);
+	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
+	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
+	kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
+	kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+	mutex_unlock(&kvm->lock);
+}
+
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_kpit_state *ps;
+
+	if (vcpu && pit) {
+		ps = &pit->pit_state;
+
+		/* Try to inject pending interrupts when:
+		 * 1. Pending exists
+		 * 2. Last interrupt was accepted or waited for too long time*/
+		if (atomic_read(&ps->pit_timer.pending) &&
+		    (ps->inject_pending ||
+		    (jiffies - ps->last_injected_time
+				>= KVM_MAX_PIT_INTR_INTERVAL))) {
+			ps->inject_pending = 0;
+			__inject_pit_timer_intr(kvm);
+			ps->last_injected_time = jiffies;
+		}
+	}
+}
+
+void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+	struct kvm_arch *arch = &vcpu->kvm->arch;
+	struct kvm_kpit_state *ps;
+
+	if (vcpu && arch->vpit) {
+		ps = &arch->vpit->pit_state;
+		if (atomic_read(&ps->pit_timer.pending) &&
+		(((arch->vpic->pics[0].imr & 1) == 0 &&
+		  arch->vpic->pics[0].irq_base == vec) ||
+		  (arch->vioapic->redirtbl[0].fields.vector == vec &&
+		  arch->vioapic->redirtbl[0].fields.mask != 1))) {
+			ps->inject_pending = 1;
+			atomic_dec(&ps->pit_timer.pending);
+			ps->channels[0].count_load_time = ktime_get();
+		}
+	}
+}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
new file mode 100644
index 000000000000..d77d6b795a1f
--- /dev/null
+++ b/arch/x86/kvm/i8254.h
@@ -0,0 +1,61 @@
+#ifndef __I8254_H
+#define __I8254_H
+
+#include "iodev.h"
+
+struct kvm_kpit_timer {
+	struct hrtimer timer;
+	int irq;
+	s64 period; /* unit: ns */
+	s64 scheduled;
+	ktime_t last_update;
+	atomic_t pending;
+};
+
+struct kvm_kpit_channel_state {
+	u32 count; /* can be 65536 */
+	u16 latched_count;
+	u8 count_latched;
+	u8 status_latched;
+	u8 status;
+	u8 read_state;
+	u8 write_state;
+	u8 write_latch;
+	u8 rw_mode;
+	u8 mode;
+	u8 bcd; /* not supported */
+	u8 gate; /* timer start */
+	ktime_t count_load_time;
+};
+
+struct kvm_kpit_state {
+	struct kvm_kpit_channel_state channels[3];
+	struct kvm_kpit_timer pit_timer;
+	u32    speaker_data_on;
+	struct mutex lock;
+	struct kvm_pit *pit;
+	bool inject_pending; /* if inject pending interrupts */
+	unsigned long last_injected_time;
+};
+
+struct kvm_pit {
+	unsigned long base_addresss;
+	struct kvm_io_device dev;
+	struct kvm_io_device speaker_dev;
+	struct kvm *kvm;
+	struct kvm_kpit_state pit_state;
+};
+
+#define KVM_PIT_BASE_ADDRESS	    0x40
+#define KVM_SPEAKER_BASE_ADDRESS    0x61
+#define KVM_PIT_MEM_LENGTH	    4
+#define KVM_PIT_FREQ		    1193181
+#define KVM_MAX_PIT_INTR_INTERVAL   HZ / 100
+#define KVM_PIT_CHANNEL_MASK	    0x3
+
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+void kvm_free_pit(struct kvm *kvm);
+
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index e5714759e97f..dbfe21c99c48 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -23,6 +23,7 @@
 #include <linux/kvm_host.h>
 
 #include "irq.h"
+#include "i8254.h"
 
 /*
  * check if there is pending interrupt without
@@ -66,6 +67,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	kvm_inject_apic_timer_irqs(vcpu);
+	kvm_inject_pit_timer_irqs(vcpu);
 	/* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
@@ -73,6 +75,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
 	kvm_apic_timer_intr_post(vcpu, vec);
+	kvm_pit_timer_intr_post(vcpu, vec);
 	/* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bf78d6522d3d..c33a4578132c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -17,6 +17,7 @@
 #include <linux/kvm_host.h>
 #include "irq.h"
 #include "mmu.h"
+#include "i8254.h"
 
 #include <linux/clocksource.h>
 #include <linux/kvm.h>
@@ -818,6 +819,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SET_TSS_ADDR:
 	case KVM_CAP_EXT_CPUID:
 	case KVM_CAP_CLOCKSOURCE:
+	case KVM_CAP_PIT:
 		r = 1;
 		break;
 	case KVM_CAP_VAPIC:
@@ -1594,6 +1596,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		} else
 			goto out;
 		break;
+	case KVM_CREATE_PIT:
+		r = -ENOMEM;
+		kvm->arch.vpit = kvm_create_pit(kvm);
+		if (kvm->arch.vpit)
+			r = 0;
+		break;
 	case KVM_IRQ_LINE: {
 		struct kvm_irq_level irq_event;
 
@@ -3372,6 +3380,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
 	kvm_free_vcpus(kvm);
-- 
cgit v1.2.3


From e0f63cb9277b64850854aee301762beeeb463473 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Tue, 4 Mar 2008 00:50:59 +0800
Subject: KVM: Add save/restore supporting of in kernel PIT

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c |  7 +++++++
 arch/x86/kvm/i8254.h |  1 +
 arch/x86/kvm/x86.c   | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c7435093bbee..8642f9d1206a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -288,6 +288,13 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	}
 }
 
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+	mutex_lock(&kvm->arch.vpit->pit_state.lock);
+	pit_load_count(kvm, channel, val);
+	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+}
+
 static void pit_ioport_write(struct kvm_io_device *this,
 			     gpa_t addr, int len, const void *data)
 {
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d77d6b795a1f..fe09eced7783 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -55,6 +55,7 @@ struct kvm_pit {
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm);
 void kvm_free_pit(struct kvm *kvm);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c33a4578132c..621a8e362fe7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1504,6 +1504,23 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 	return r;
 }
 
+static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+	int r = 0;
+
+	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+	return r;
+}
+
+static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+	int r = 0;
+
+	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+	kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+	return r;
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
@@ -1657,6 +1674,37 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_GET_PIT: {
+		struct kvm_pit_state ps;
+		r = -EFAULT;
+		if (copy_from_user(&ps, argp, sizeof ps))
+			goto out;
+		r = -ENXIO;
+		if (!kvm->arch.vpit)
+			goto out;
+		r = kvm_vm_ioctl_get_pit(kvm, &ps);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &ps, sizeof ps))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_PIT: {
+		struct kvm_pit_state ps;
+		r = -EFAULT;
+		if (copy_from_user(&ps, argp, sizeof ps))
+			goto out;
+		r = -ENXIO;
+		if (!kvm->arch.vpit)
+			goto out;
+		r = kvm_vm_ioctl_set_pit(kvm, &ps);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
-- 
cgit v1.2.3


From 308b0f239e8d6754b8b903d279e5b5b987e257ac Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Thu, 13 Mar 2008 10:22:26 +0800
Subject: KVM: Add reset support for in kernel PIT

Separate the reset part and prepare for reset support.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c | 30 +++++++++++++++++++-----------
 arch/x86/kvm/i8254.h |  1 +
 2 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 8642f9d1206a..9f118e2f350d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -478,12 +478,28 @@ static int speaker_in_range(struct kvm_io_device *this, gpa_t addr)
 	return (addr == KVM_SPEAKER_BASE_ADDRESS);
 }
 
-struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+void kvm_pit_reset(struct kvm_pit *pit)
 {
 	int i;
+	struct kvm_kpit_channel_state *c;
+
+	mutex_lock(&pit->pit_state.lock);
+	for (i = 0; i < 3; i++) {
+		c = &pit->pit_state.channels[i];
+		c->mode = 0xff;
+		c->gate = (i != 2);
+		pit_load_count(pit->kvm, i, 0);
+	}
+	mutex_unlock(&pit->pit_state.lock);
+
+	atomic_set(&pit->pit_state.pit_timer.pending, 0);
+	pit->pit_state.inject_pending = 1;
+}
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+{
 	struct kvm_pit *pit;
 	struct kvm_kpit_state *pit_state;
-	struct kvm_kpit_channel_state *c;
 
 	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
 	if (!pit)
@@ -512,17 +528,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	pit_state->pit = pit;
 	hrtimer_init(&pit_state->pit_timer.timer,
 		     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	atomic_set(&pit_state->pit_timer.pending, 0);
-	for (i = 0; i < 3; i++) {
-		c = &pit_state->channels[i];
-		c->mode = 0xff;
-		c->gate = (i != 2);
-		pit_load_count(kvm, i, 0);
-	}
-
 	mutex_unlock(&pit->pit_state.lock);
 
-	pit->pit_state.inject_pending = 1;
+	kvm_pit_reset(pit);
 
 	return pit;
 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index fe09eced7783..db25c2a6c8c4 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -58,5 +58,6 @@ void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm);
 void kvm_free_pit(struct kvm *kvm);
+void kvm_pit_reset(struct kvm_pit *pit);
 
 #endif
-- 
cgit v1.2.3


From a28e4f5a621289fe0d9c8a461b0c256f9e17f3bc Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 22 Feb 2008 12:21:36 -0500
Subject: KVM: add basic paravirt support

Add basic KVM paravirt support. Avoid vm-exits on IO delays.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 621a8e362fe7..1b9e695cc641 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -820,6 +820,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_EXT_CPUID:
 	case KVM_CAP_CLOCKSOURCE:
 	case KVM_CAP_PIT:
+	case KVM_CAP_NOP_IO_DELAY:
 		r = 1;
 		break;
 	case KVM_CAP_VAPIC:
-- 
cgit v1.2.3


From 9f81128591ca1e9907f2e7a7b195e33232167d60 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 2 Mar 2008 14:06:05 +0200
Subject: KVM: Provide unlocked version of emulator_write_phys()

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b9e695cc641..03ba402c476a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1840,22 +1840,29 @@ mmio:
 	return X86EMUL_UNHANDLEABLE;
 }
 
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-			       const void *val, int bytes)
+int __emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+			  const void *val, int bytes)
 {
 	int ret;
 
-	down_read(&vcpu->kvm->slots_lock);
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-	if (ret < 0) {
-		up_read(&vcpu->kvm->slots_lock);
+	if (ret < 0)
 		return 0;
-	}
 	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
-	up_read(&vcpu->kvm->slots_lock);
 	return 1;
 }
 
+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+			const void *val, int bytes)
+{
+	int ret;
+
+	down_read(&vcpu->kvm->slots_lock);
+	ret =__emulator_write_phys(vcpu, gpa, val, bytes);
+	up_read(&vcpu->kvm->slots_lock);
+	return ret;
+}
+
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
-- 
cgit v1.2.3


From 2f333bcb4edd8daef99dabe4e7df8277af73cff1 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 22 Feb 2008 12:21:37 -0500
Subject: KVM: MMU: hypercall based pte updates and TLB flushes

Hypercall based pte updates are faster than faults, and also allow use
of the lazy MMU mode to batch operations.

Don't report the feature if two dimensional paging is enabled.

[avi:
 - one mmu_op hypercall instead of one per op
 - allow 64-bit gpa on hypercall
 - don't pass host errors (-ENOMEM) to guest]

[akpm: warning fix on i386]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |  18 ++++++-
 2 files changed, 152 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 414405b6ec13..072e9422c914 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -28,6 +28,7 @@
 #include <linux/module.h>
 #include <linux/swap.h>
 #include <linux/hugetlb.h>
+#include <linux/compiler.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -40,7 +41,7 @@
  * 2. while doing 1. it walks guest-physical to host-physical
  * If the hardware supports that we don't need to do shadow paging.
  */
-static bool tdp_enabled = false;
+bool tdp_enabled = false;
 
 #undef MMU_DEBUG
 
@@ -167,6 +168,13 @@ static int dbg = 1;
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+struct kvm_pv_mmu_op_buffer {
+	void *ptr;
+	unsigned len;
+	unsigned processed;
+	char buf[512] __aligned(sizeof(long));
+};
+
 struct kvm_rmap_desc {
 	u64 *shadow_ptes[RMAP_EXT];
 	struct kvm_rmap_desc *more;
@@ -2003,6 +2011,132 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	return nr_mmu_pages;
 }
 
+static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+				unsigned len)
+{
+	if (len > buffer->len)
+		return NULL;
+	return buffer->ptr;
+}
+
+static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+				unsigned len)
+{
+	void *ret;
+
+	ret = pv_mmu_peek_buffer(buffer, len);
+	if (!ret)
+		return ret;
+	buffer->ptr += len;
+	buffer->len -= len;
+	buffer->processed += len;
+	return ret;
+}
+
+static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
+			     gpa_t addr, gpa_t value)
+{
+	int bytes = 8;
+	int r;
+
+	if (!is_long_mode(vcpu) && !is_pae(vcpu))
+		bytes = 4;
+
+	r = mmu_topup_memory_caches(vcpu);
+	if (r)
+		return r;
+
+	if (!__emulator_write_phys(vcpu, addr, &value, bytes))
+		return -EFAULT;
+
+	return 1;
+}
+
+static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->tlb_flush(vcpu);
+	return 1;
+}
+
+static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	return 1;
+}
+
+static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
+			     struct kvm_pv_mmu_op_buffer *buffer)
+{
+	struct kvm_mmu_op_header *header;
+
+	header = pv_mmu_peek_buffer(buffer, sizeof *header);
+	if (!header)
+		return 0;
+	switch (header->op) {
+	case KVM_MMU_OP_WRITE_PTE: {
+		struct kvm_mmu_op_write_pte *wpte;
+
+		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
+		if (!wpte)
+			return 0;
+		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
+					wpte->pte_val);
+	}
+	case KVM_MMU_OP_FLUSH_TLB: {
+		struct kvm_mmu_op_flush_tlb *ftlb;
+
+		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
+		if (!ftlb)
+			return 0;
+		return kvm_pv_mmu_flush_tlb(vcpu);
+	}
+	case KVM_MMU_OP_RELEASE_PT: {
+		struct kvm_mmu_op_release_pt *rpt;
+
+		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
+		if (!rpt)
+			return 0;
+		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
+	}
+	default: return 0;
+	}
+}
+
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+		  gpa_t addr, unsigned long *ret)
+{
+	int r;
+	struct kvm_pv_mmu_op_buffer buffer;
+
+	down_read(&vcpu->kvm->slots_lock);
+	down_read(&current->mm->mmap_sem);
+
+	buffer.ptr = buffer.buf;
+	buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
+	buffer.processed = 0;
+
+	r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+	if (r)
+		goto out;
+
+	while (buffer.len) {
+		r = kvm_pv_mmu_op_one(vcpu, &buffer);
+		if (r < 0)
+			goto out;
+		if (r == 0)
+			break;
+	}
+
+	r = 1;
+out:
+	*ret = buffer.processed;
+	up_read(&current->mm->mmap_sem);
+	up_read(&vcpu->kvm->slots_lock);
+	return r;
+}
+
 #ifdef AUDIT
 
 static const char *audit_msg;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03ba402c476a..63afca1c295f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -832,6 +832,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_NR_MEMSLOTS:
 		r = KVM_MEMORY_SLOTS;
 		break;
+	case KVM_CAP_PV_MMU:
+		r = !tdp_enabled;
+		break;
 	default:
 		r = 0;
 		break;
@@ -2452,9 +2455,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+			   unsigned long a1)
+{
+	if (is_long_mode(vcpu))
+		return a0;
+	else
+		return a0 | ((gpa_t)a1 << 32);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
+	int r = 1;
 
 	kvm_x86_ops->cache_regs(vcpu);
 
@@ -2476,6 +2489,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_VAPIC_POLL_IRQ:
 		ret = 0;
 		break;
+	case KVM_HC_MMU_OP:
+		r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
@@ -2483,7 +2499,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs[VCPU_REGS_RAX] = ret;
 	kvm_x86_ops->decache_regs(vcpu);
 	++vcpu->stat.hypercalls;
-	return 0;
+	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
-- 
cgit v1.2.3


From 947da53830690cbd77d7f2b625d0df1f161ffd54 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 18 Mar 2008 11:05:52 +0200
Subject: KVM: MMU: Set the accessed bit on non-speculative shadow ptes

If we populate a shadow pte due to a fault (and not speculatively due to a
pte write) then we can set the accessed bit on it, as we know it will be
set immediately on the next guest instruction.  This saves a read-modify-write
operation.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 8 +++++---
 arch/x86/kvm/paging_tmpl.h | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 072e9422c914..a5872b3c466d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1020,7 +1020,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
 			 int *ptwrite, int largepage, gfn_t gfn,
-			 struct page *page)
+			 struct page *page, bool speculative)
 {
 	u64 spte;
 	int was_rmapped = 0;
@@ -1061,6 +1061,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	 * demand paging).
 	 */
 	spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+	if (!speculative)
+		pte_access |= PT_ACCESSED_MASK;
 	if (!dirty)
 		pte_access &= ~ACC_WRITE_MASK;
 	if (!(pte_access & ACC_EXEC_MASK))
@@ -1148,13 +1150,13 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 0, gfn, page);
+				     0, write, 1, &pt_write, 0, gfn, page, false);
 			return pt_write;
 		}
 
 		if (largepage && level == 2) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				    0, write, 1, &pt_write, 1, gfn, page);
+				     0, write, 1, &pt_write, 1, gfn, page, false);
 			return pt_write;
 		}
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 57abbd091143..e9ae5dba724e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -266,7 +266,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	get_page(npage);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
-		     npage);
+		     npage, true);
 }
 
 /*
@@ -349,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, largepage, walker->gfn, page);
+		     ptwrite, largepage, walker->gfn, page, false);
 
 	return shadow_ent;
 }
-- 
cgit v1.2.3


From 855149aaa90016c576a0e684361a34f8047307d0 Mon Sep 17 00:00:00 2001
From: Izik Eidus <izike@qumranet.com>
Date: Thu, 20 Mar 2008 18:17:24 +0200
Subject: KVM: MMU: fix dirty bit setting when removing write permissions

When mmu_set_spte() checks if a page related to spte should be release as
dirty or clean, it check if the shadow pte was writeble, but in case
rmap_write_protect() is called called it is possible for shadow ptes that were
writeble to become readonly and therefor mmu_set_spte will release the pages
as clean.

This patch fix this issue by marking the page as dirty inside
rmap_write_protect().

Signed-off-by: Izik Eidus <izike@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a5872b3c466d..dd4b95b3896b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -626,6 +626,14 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 	}
+	if (write_protected) {
+		struct page *page;
+
+		spte = rmap_next(kvm, rmapp, NULL);
+		page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+		SetPageDirty(page);
+	}
+
 	/* check for huge page mappings */
 	rmapp = gfn_to_rmap(kvm, gfn, 1);
 	spte = rmap_next(kvm, rmapp, NULL);
-- 
cgit v1.2.3


From 0b49ea8659fd3b5005823e02d2d0a775521770e5 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 23 Mar 2008 15:06:23 +0200
Subject: KVM: MMU: Introduce and use spte_to_page()

Encapsulate the pte mask'n'shift in a function.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dd4b95b3896b..6fc342194dda 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -240,6 +240,13 @@ static int is_rmap_pte(u64 pte)
 	return is_shadow_present_pte(pte);
 }
 
+static struct page *spte_to_page(u64 pte)
+{
+	hfn_t hfn = (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+	return pfn_to_page(hfn);
+}
+
 static gfn_t pse36_gfn_delta(u32 gpte)
 {
 	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -541,7 +548,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	if (!is_rmap_pte(*spte))
 		return;
 	sp = page_header(__pa(spte));
-	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	page = spte_to_page(*spte);
 	mark_page_accessed(page);
 	if (is_writeble_pte(*spte))
 		kvm_release_page_dirty(page);
@@ -630,7 +637,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		struct page *page;
 
 		spte = rmap_next(kvm, rmapp, NULL);
-		page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+		page = spte_to_page(*spte);
 		SetPageDirty(page);
 	}
 
@@ -1033,7 +1040,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	u64 spte;
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
-	hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
 		 " user_fault %d gfn %lx\n",
@@ -1051,9 +1057,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, shadow_pte);
-		} else if (host_pfn != page_to_pfn(page)) {
+		} else if (page != spte_to_page(*shadow_pte)) {
 			pgprintk("hfn old %lx new %lx\n",
-				 host_pfn, page_to_pfn(page));
+				 page_to_pfn(spte_to_page(*shadow_pte)),
+				 page_to_pfn(page));
 			rmap_remove(vcpu->kvm, shadow_pte);
 		} else {
 			if (largepage)
-- 
cgit v1.2.3


From 268fe02ae058c0c5e84ad678d67e5d7b013e664f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 23 Mar 2008 18:36:30 +0200
Subject: KVM: no longer EXPERIMENTAL

Long overdue.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 41962e793c0f..76c70ab44382 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -19,7 +19,7 @@ if VIRTUALIZATION
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on HAVE_KVM && EXPERIMENTAL
+	depends on HAVE_KVM
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
-- 
cgit v1.2.3


From 4c9fc8ef501790732ed035585b491756b75ea4c6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 24 Mar 2008 18:15:14 +0200
Subject: KVM: VMX: Add module option to disable flexpriority

Useful for debugging.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fb0389d3a8d1..01559311df8c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -39,6 +39,9 @@ module_param(bypass_guest_pf, bool, 0);
 static int enable_vpid = 1;
 module_param(enable_vpid, bool, 0);
 
+static int flexpriority_enabled = 1;
+module_param(flexpriority_enabled, bool, 0);
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -200,8 +203,9 @@ static inline int cpu_has_secondary_exec_ctrls(void)
 
 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 {
-	return (vmcs_config.cpu_based_2nd_exec_ctrl &
-		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+	return flexpriority_enabled
+		&& (vmcs_config.cpu_based_2nd_exec_ctrl &
+		    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 }
 
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
-- 
cgit v1.2.3


From 2e4d2653497856b102c90153f970c9e344ba96c6 Mon Sep 17 00:00:00 2001
From: Izik Eidus <izike@qumranet.com>
Date: Mon, 24 Mar 2008 19:38:34 +0200
Subject: KVM: x86: add functions to get the cpl of vcpu

Signed-off-by: Izik Eidus <izike@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c |  8 ++++++++
 arch/x86/kvm/vmx.c | 15 +++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 51741f96e7fb..c1c1b973e80a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -792,6 +792,13 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	var->unusable = !var->present;
 }
 
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+	return save->cpl;
+}
+
 static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1822,6 +1829,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.get_segment_base = svm_get_segment_base,
 	.get_segment = svm_get_segment,
 	.set_segment = svm_set_segment,
+	.get_cpl = svm_get_cpl,
 	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
 	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
 	.set_cr0 = svm_set_cr0,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 01559311df8c..9b560325b127 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1395,6 +1395,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 	var->unusable = (ar >> 16) & 1;
 }
 
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment kvm_seg;
+
+	if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
+		return 0;
+
+	if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
+		return 3;
+
+	vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
+	return kvm_seg.selector & 3;
+}
+
 static u32 vmx_segment_access_rights(struct kvm_segment *var)
 {
 	u32 ar;
@@ -2665,6 +2679,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_segment_base = vmx_get_segment_base,
 	.get_segment = vmx_get_segment,
 	.set_segment = vmx_set_segment,
+	.get_cpl = vmx_get_cpl,
 	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
 	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
 	.set_cr0 = vmx_set_cr0,
-- 
cgit v1.2.3


From 37817f2982d0f559f90cecc66e150dd9d2c2df05 Mon Sep 17 00:00:00 2001
From: Izik Eidus <izike@qumranet.com>
Date: Mon, 24 Mar 2008 23:14:53 +0200
Subject: KVM: x86: hardware task switching support

This emulates the x86 hardware task switch mechanism in software, as it is
unsupported by either vmx or svm.  It allows operating systems which use it,
like freedos, to run as kvm guests.

Signed-off-by: Izik Eidus <izike@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c |  15 +-
 arch/x86/kvm/svm.h |   3 +
 arch/x86/kvm/tss.h |  59 ++++++++
 arch/x86/kvm/vmx.c |  15 ++
 arch/x86/kvm/x86.c | 409 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 498 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kvm/tss.h

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c1c1b973e80a..ad273468c08a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1112,9 +1112,18 @@ static int invalid_op_interception(struct vcpu_svm *svm,
 static int task_switch_interception(struct vcpu_svm *svm,
 				    struct kvm_run *kvm_run)
 {
-	pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __func__);
-	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-	return 0;
+	u16 tss_selector;
+
+	tss_selector = (u16)svm->vmcb->control.exit_info_1;
+	if (svm->vmcb->control.exit_info_2 &
+	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+		return kvm_task_switch(&svm->vcpu, tss_selector,
+				       TASK_SWITCH_IRET);
+	if (svm->vmcb->control.exit_info_2 &
+	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+		return kvm_task_switch(&svm->vcpu, tss_selector,
+				       TASK_SWITCH_JMP);
+	return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
 }
 
 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
index 5fd50491b555..1b8afa78e869 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/kvm/svm.h
@@ -238,6 +238,9 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
 #define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
 
+#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
+#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+
 #define	SVM_EXIT_READ_CR0 	0x000
 #define	SVM_EXIT_READ_CR3 	0x003
 #define	SVM_EXIT_READ_CR4 	0x004
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
new file mode 100644
index 000000000000..622aa10f692f
--- /dev/null
+++ b/arch/x86/kvm/tss.h
@@ -0,0 +1,59 @@
+#ifndef __TSS_SEGMENT_H
+#define __TSS_SEGMENT_H
+
+struct tss_segment_32 {
+	u32 prev_task_link;
+	u32 esp0;
+	u32 ss0;
+	u32 esp1;
+	u32 ss1;
+	u32 esp2;
+	u32 ss2;
+	u32 cr3;
+	u32 eip;
+	u32 eflags;
+	u32 eax;
+	u32 ecx;
+	u32 edx;
+	u32 ebx;
+	u32 esp;
+	u32 ebp;
+	u32 esi;
+	u32 edi;
+	u32 es;
+	u32 cs;
+	u32 ss;
+	u32 ds;
+	u32 fs;
+	u32 gs;
+	u32 ldt_selector;
+	u16 t;
+	u16 io_map;
+};
+
+struct tss_segment_16 {
+	u16 prev_task_link;
+	u16 sp0;
+	u16 ss0;
+	u16 sp1;
+	u16 ss1;
+	u16 sp2;
+	u16 ss2;
+	u16 ip;
+	u16 flag;
+	u16 ax;
+	u16 cx;
+	u16 dx;
+	u16 bx;
+	u16 sp;
+	u16 bp;
+	u16 si;
+	u16 di;
+	u16 es;
+	u16 cs;
+	u16 ss;
+	u16 ds;
+	u16 ldt;
+};
+
+#endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9b560325b127..cbca46acfac3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2249,6 +2249,20 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	unsigned long exit_qualification;
+	u16 tss_selector;
+	int reason;
+
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	reason = (u32)exit_qualification >> 30;
+	tss_selector = exit_qualification;
+
+	return kvm_task_switch(vcpu, tss_selector, reason);
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2271,6 +2285,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
+	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 };
 
 static const int kvm_vmx_max_exit_handlers =
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63afca1c295f..32d910044f85 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -18,6 +18,7 @@
 #include "irq.h"
 #include "mmu.h"
 #include "i8254.h"
+#include "tss.h"
 
 #include <linux/clocksource.h>
 #include <linux/kvm.h>
@@ -3077,6 +3078,414 @@ static void set_segment(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->set_segment(vcpu, var, seg);
 }
 
+static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
+				   struct kvm_segment *kvm_desct)
+{
+	kvm_desct->base = seg_desc->base0;
+	kvm_desct->base |= seg_desc->base1 << 16;
+	kvm_desct->base |= seg_desc->base2 << 24;
+	kvm_desct->limit = seg_desc->limit0;
+	kvm_desct->limit |= seg_desc->limit << 16;
+	kvm_desct->selector = selector;
+	kvm_desct->type = seg_desc->type;
+	kvm_desct->present = seg_desc->p;
+	kvm_desct->dpl = seg_desc->dpl;
+	kvm_desct->db = seg_desc->d;
+	kvm_desct->s = seg_desc->s;
+	kvm_desct->l = seg_desc->l;
+	kvm_desct->g = seg_desc->g;
+	kvm_desct->avl = seg_desc->avl;
+	if (!selector)
+		kvm_desct->unusable = 1;
+	else
+		kvm_desct->unusable = 0;
+	kvm_desct->padding = 0;
+}
+
+static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
+					   u16 selector,
+					   struct descriptor_table *dtable)
+{
+	if (selector & 1 << 2) {
+		struct kvm_segment kvm_seg;
+
+		get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
+
+		if (kvm_seg.unusable)
+			dtable->limit = 0;
+		else
+			dtable->limit = kvm_seg.limit;
+		dtable->base = kvm_seg.base;
+	}
+	else
+		kvm_x86_ops->get_gdt(vcpu, dtable);
+}
+
+/* allowed just for 8 bytes segments */
+static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+					 struct desc_struct *seg_desc)
+{
+	struct descriptor_table dtable;
+	u16 index = selector >> 3;
+
+	get_segment_descritptor_dtable(vcpu, selector, &dtable);
+
+	if (dtable.limit < index * 8 + 7) {
+		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
+		return 1;
+	}
+	return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+}
+
+/* allowed just for 8 bytes segments */
+static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+					 struct desc_struct *seg_desc)
+{
+	struct descriptor_table dtable;
+	u16 index = selector >> 3;
+
+	get_segment_descritptor_dtable(vcpu, selector, &dtable);
+
+	if (dtable.limit < index * 8 + 7)
+		return 1;
+	return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+}
+
+static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
+			     struct desc_struct *seg_desc)
+{
+	u32 base_addr;
+
+	base_addr = seg_desc->base0;
+	base_addr |= (seg_desc->base1 << 16);
+	base_addr |= (seg_desc->base2 << 24);
+
+	return base_addr;
+}
+
+static int load_tss_segment32(struct kvm_vcpu *vcpu,
+			      struct desc_struct *seg_desc,
+			      struct tss_segment_32 *tss)
+{
+	u32 base_addr;
+
+	base_addr = get_tss_base_addr(vcpu, seg_desc);
+
+	return kvm_read_guest(vcpu->kvm, base_addr, tss,
+			      sizeof(struct tss_segment_32));
+}
+
+static int save_tss_segment32(struct kvm_vcpu *vcpu,
+			      struct desc_struct *seg_desc,
+			      struct tss_segment_32 *tss)
+{
+	u32 base_addr;
+
+	base_addr = get_tss_base_addr(vcpu, seg_desc);
+
+	return kvm_write_guest(vcpu->kvm, base_addr, tss,
+			       sizeof(struct tss_segment_32));
+}
+
+static int load_tss_segment16(struct kvm_vcpu *vcpu,
+			      struct desc_struct *seg_desc,
+			      struct tss_segment_16 *tss)
+{
+	u32 base_addr;
+
+	base_addr = get_tss_base_addr(vcpu, seg_desc);
+
+	return kvm_read_guest(vcpu->kvm, base_addr, tss,
+			      sizeof(struct tss_segment_16));
+}
+
+static int save_tss_segment16(struct kvm_vcpu *vcpu,
+			      struct desc_struct *seg_desc,
+			      struct tss_segment_16 *tss)
+{
+	u32 base_addr;
+
+	base_addr = get_tss_base_addr(vcpu, seg_desc);
+
+	return kvm_write_guest(vcpu->kvm, base_addr, tss,
+			       sizeof(struct tss_segment_16));
+}
+
+static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
+{
+	struct kvm_segment kvm_seg;
+
+	get_segment(vcpu, &kvm_seg, seg);
+	return kvm_seg.selector;
+}
+
+static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
+						u16 selector,
+						struct kvm_segment *kvm_seg)
+{
+	struct desc_struct seg_desc;
+
+	if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
+		return 1;
+	seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
+	return 0;
+}
+
+static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+				   int type_bits, int seg)
+{
+	struct kvm_segment kvm_seg;
+
+	if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
+		return 1;
+	kvm_seg.type |= type_bits;
+
+	if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
+	    seg != VCPU_SREG_LDTR)
+		if (!kvm_seg.s)
+			kvm_seg.unusable = 1;
+
+	set_segment(vcpu, &kvm_seg, seg);
+	return 0;
+}
+
+static void save_state_to_tss32(struct kvm_vcpu *vcpu,
+				struct tss_segment_32 *tss)
+{
+	tss->cr3 = vcpu->arch.cr3;
+	tss->eip = vcpu->arch.rip;
+	tss->eflags = kvm_x86_ops->get_rflags(vcpu);
+	tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
+	tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+	tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
+	tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
+	tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
+	tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
+	tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
+	tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
+
+	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
+	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
+	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+	tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
+}
+
+static int load_state_from_tss32(struct kvm_vcpu *vcpu,
+				  struct tss_segment_32 *tss)
+{
+	kvm_set_cr3(vcpu, tss->cr3);
+
+	vcpu->arch.rip = tss->eip;
+	kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
+
+	vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
+	vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
+	vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
+	vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
+	vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
+	vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
+	vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
+	vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
+
+	if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
+		return 1;
+	return 0;
+}
+
+static void save_state_to_tss16(struct kvm_vcpu *vcpu,
+				struct tss_segment_16 *tss)
+{
+	tss->ip = vcpu->arch.rip;
+	tss->flag = kvm_x86_ops->get_rflags(vcpu);
+	tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
+	tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
+	tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
+	tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
+	tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
+	tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
+	tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
+	tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
+
+	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+	tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+	tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
+}
+
+static int load_state_from_tss16(struct kvm_vcpu *vcpu,
+				 struct tss_segment_16 *tss)
+{
+	vcpu->arch.rip = tss->ip;
+	kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
+	vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
+	vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
+	vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
+	vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
+	vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
+	vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
+	vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
+	vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
+
+	if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+		return 1;
+
+	if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+		return 1;
+	return 0;
+}
+
+int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
+		       struct desc_struct *cseg_desc,
+		       struct desc_struct *nseg_desc)
+{
+	struct tss_segment_16 tss_segment_16;
+	int ret = 0;
+
+	if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16))
+		goto out;
+
+	save_state_to_tss16(vcpu, &tss_segment_16);
+	save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
+
+	if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16))
+		goto out;
+	if (load_state_from_tss16(vcpu, &tss_segment_16))
+		goto out;
+
+	ret = 1;
+out:
+	return ret;
+}
+
+int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
+		       struct desc_struct *cseg_desc,
+		       struct desc_struct *nseg_desc)
+{
+	struct tss_segment_32 tss_segment_32;
+	int ret = 0;
+
+	if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32))
+		goto out;
+
+	save_state_to_tss32(vcpu, &tss_segment_32);
+	save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
+
+	if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32))
+		goto out;
+	if (load_state_from_tss32(vcpu, &tss_segment_32))
+		goto out;
+
+	ret = 1;
+out:
+	return ret;
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+{
+	struct kvm_segment tr_seg;
+	struct desc_struct cseg_desc;
+	struct desc_struct nseg_desc;
+	int ret = 0;
+
+	get_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+
+	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+		goto out;
+
+	if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc))
+		goto out;
+
+
+	if (reason != TASK_SWITCH_IRET) {
+		int cpl;
+
+		cpl = kvm_x86_ops->get_cpl(vcpu);
+		if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
+			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+			return 1;
+		}
+	}
+
+	if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
+		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+		return 1;
+	}
+
+	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+		cseg_desc.type &= ~(1 << 8); //clear the B flag
+		save_guest_segment_descriptor(vcpu, tr_seg.selector,
+					      &cseg_desc);
+	}
+
+	if (reason == TASK_SWITCH_IRET) {
+		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
+		kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+	}
+
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	kvm_x86_ops->cache_regs(vcpu);
+
+	if (nseg_desc.type & 8)
+		ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc,
+					 &nseg_desc);
+	else
+		ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc,
+					 &nseg_desc);
+
+	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
+		kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+	}
+
+	if (reason != TASK_SWITCH_IRET) {
+		nseg_desc.type |= (1 << 8);
+		save_guest_segment_descriptor(vcpu, tss_selector,
+					      &nseg_desc);
+	}
+
+	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
+	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+	tr_seg.type = 11;
+	set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+	kvm_x86_ops->decache_regs(vcpu);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_task_switch);
+
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-- 
cgit v1.2.3


From 25c5f225beda4fbea878ed8b6203ab4ecc7de2d1 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Fri, 28 Mar 2008 13:18:56 +0800
Subject: KVM: VMX: Enable MSR Bitmap feature

MSR Bitmap controls whether the accessing of an MSR causes VM Exit.
Eliminating exits on automatically saved and restored MSRs yields a
small performance gain.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cbca46acfac3..87eee7a7f16e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -91,6 +91,7 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 
 static struct page *vmx_io_bitmap_a;
 static struct page *vmx_io_bitmap_b;
+static struct page *vmx_msr_bitmap;
 
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -185,6 +186,11 @@ static inline int is_external_interrupt(u32 intr_info)
 		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
+static inline int cpu_has_vmx_msr_bitmap(void)
+{
+	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
+}
+
 static inline int cpu_has_vmx_tpr_shadow(void)
 {
 	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
@@ -1001,6 +1007,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_MOV_DR_EXITING |
 	      CPU_BASED_USE_TSC_OFFSETING;
 	opt = CPU_BASED_TPR_SHADOW |
+	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
 				&_cpu_based_exec_control) < 0)
@@ -1575,6 +1582,30 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
+void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
+{
+	void *va;
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	va = kmap(msr_bitmap);
+	if (msr <= 0x1fff) {
+		__clear_bit(msr, va + 0x000); /* read-low */
+		__clear_bit(msr, va + 0x800); /* write-low */
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		__clear_bit(msr, va + 0x400); /* read-high */
+		__clear_bit(msr, va + 0xc00); /* write-high */
+	}
+	kunmap(msr_bitmap);
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -1592,6 +1623,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
 	vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
 
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
+
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
 	/* Control */
@@ -2728,7 +2762,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-	void *iova;
+	void *va;
 	int r;
 
 	vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2741,30 +2775,48 @@ static int __init vmx_init(void)
 		goto out;
 	}
 
+	vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	if (!vmx_msr_bitmap) {
+		r = -ENOMEM;
+		goto out1;
+	}
+
 	/*
 	 * Allow direct access to the PC debug port (it is often used for I/O
 	 * delays, but the vmexits simply slow things down).
 	 */
-	iova = kmap(vmx_io_bitmap_a);
-	memset(iova, 0xff, PAGE_SIZE);
-	clear_bit(0x80, iova);
+	va = kmap(vmx_io_bitmap_a);
+	memset(va, 0xff, PAGE_SIZE);
+	clear_bit(0x80, va);
 	kunmap(vmx_io_bitmap_a);
 
-	iova = kmap(vmx_io_bitmap_b);
-	memset(iova, 0xff, PAGE_SIZE);
+	va = kmap(vmx_io_bitmap_b);
+	memset(va, 0xff, PAGE_SIZE);
 	kunmap(vmx_io_bitmap_b);
 
+	va = kmap(vmx_msr_bitmap);
+	memset(va, 0xff, PAGE_SIZE);
+	kunmap(vmx_msr_bitmap);
+
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
 	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
 	if (r)
-		goto out1;
+		goto out2;
+
+	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
+	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
+	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
+	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
+	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
 
 	if (bypass_guest_pf)
 		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
 
 	return 0;
 
+out2:
+	__free_page(vmx_msr_bitmap);
 out1:
 	__free_page(vmx_io_bitmap_b);
 out:
@@ -2774,6 +2826,7 @@ out:
 
 static void __exit vmx_exit(void)
 {
+	__free_page(vmx_msr_bitmap);
 	__free_page(vmx_io_bitmap_b);
 	__free_page(vmx_io_bitmap_a);
 
-- 
cgit v1.2.3


From 3200f405a1e8e06c8634f11d33614455baa4e6be Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <marcelo@kvack.org>
Date: Sat, 29 Mar 2008 20:17:59 -0300
Subject: KVM: MMU: unify slots_lock usage

Unify slots_lock acquision around vcpu_run(). This is simpler and less
error-prone.

Also fix some callsites that were not grabbing the lock properly.

[avi: drop slots_lock while in guest mode to avoid holding the lock
      for indefinite periods]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 13 ++----------
 arch/x86/kvm/paging_tmpl.h |  4 ----
 arch/x86/kvm/vmx.c         |  6 +++---
 arch/x86/kvm/x86.c         | 53 +++++++++++++++++-----------------------------
 4 files changed, 25 insertions(+), 51 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6fc342194dda..c563283cb982 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1204,8 +1204,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	struct page *page;
 
-	down_read(&vcpu->kvm->slots_lock);
-
 	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
@@ -1218,7 +1216,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	/* mmio */
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
-		up_read(&vcpu->kvm->slots_lock);
 		return 1;
 	}
 
@@ -1228,7 +1225,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 			 PT32E_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-	up_read(&vcpu->kvm->slots_lock);
 
 	return r;
 }
@@ -1376,9 +1372,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		largepage = 1;
 	}
 	page = gfn_to_page(vcpu->kvm, gfn);
+	up_read(&current->mm->mmap_sem);
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
-		up_read(&current->mm->mmap_sem);
 		return 1;
 	}
 	spin_lock(&vcpu->kvm->mmu_lock);
@@ -1386,7 +1382,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
 			 largepage, gfn, page, TDP_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	up_read(&current->mm->mmap_sem);
 
 	return r;
 }
@@ -1808,9 +1803,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t gpa;
 	int r;
 
-	down_read(&vcpu->kvm->slots_lock);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-	up_read(&vcpu->kvm->slots_lock);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2063,7 +2056,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
 	if (r)
 		return r;
 
-	if (!__emulator_write_phys(vcpu, addr, &value, bytes))
+	if (!emulator_write_phys(vcpu, addr, &value, bytes))
 		return -EFAULT;
 
 	return 1;
@@ -2127,7 +2120,6 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 	int r;
 	struct kvm_pv_mmu_op_buffer buffer;
 
-	down_read(&vcpu->kvm->slots_lock);
 	down_read(&current->mm->mmap_sem);
 
 	buffer.ptr = buffer.buf;
@@ -2150,7 +2142,6 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 out:
 	*ret = buffer.processed;
 	up_read(&current->mm->mmap_sem);
-	up_read(&vcpu->kvm->slots_lock);
 	return r;
 }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e9ae5dba724e..57d872aec663 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -388,7 +388,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (r)
 		return r;
 
-	down_read(&vcpu->kvm->slots_lock);
 	/*
 	 * Look up the shadow pte for the faulting address.
 	 */
@@ -402,7 +401,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		pgprintk("%s: guest page fault\n", __func__);
 		inject_page_fault(vcpu, addr, walker.error_code);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-		up_read(&vcpu->kvm->slots_lock);
 		return 0;
 	}
 
@@ -422,7 +420,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (is_error_page(page)) {
 		pgprintk("gfn %x is mmio\n", walker.gfn);
 		kvm_release_page_clean(page);
-		up_read(&vcpu->kvm->slots_lock);
 		return 1;
 	}
 
@@ -440,7 +437,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	up_read(&vcpu->kvm->slots_lock);
 
 	return write_pt;
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 87eee7a7f16e..6249810b2155 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1505,7 +1505,6 @@ static int init_rmode_tss(struct kvm *kvm)
 	int ret = 0;
 	int r;
 
-	down_read(&kvm->slots_lock);
 	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
 	if (r < 0)
 		goto out;
@@ -1528,7 +1527,6 @@ static int init_rmode_tss(struct kvm *kvm)
 
 	ret = 1;
 out:
-	up_read(&kvm->slots_lock);
 	return ret;
 }
 
@@ -1730,6 +1728,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	u64 msr;
 	int ret;
 
+	down_read(&vcpu->kvm->slots_lock);
 	if (!init_rmode_tss(vmx->vcpu.kvm)) {
 		ret = -ENOMEM;
 		goto out;
@@ -1833,9 +1832,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vpid_sync_vcpu_all(vmx);
 
-	return 0;
+	ret = 0;
 
 out:
+	up_read(&vcpu->kvm->slots_lock);
 	return ret;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 32d910044f85..e6a38bf9a45e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -201,7 +201,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	int ret;
 	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-	down_read(&vcpu->kvm->slots_lock);
 	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 				  offset * sizeof(u64), sizeof(pdpte));
 	if (ret < 0) {
@@ -218,7 +217,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-	up_read(&vcpu->kvm->slots_lock);
 
 	return ret;
 }
@@ -233,13 +231,11 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 	if (is_long_mode(vcpu) || !is_pae(vcpu))
 		return false;
 
-	down_read(&vcpu->kvm->slots_lock);
 	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 	if (r < 0)
 		goto out;
 	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 out:
-	up_read(&vcpu->kvm->slots_lock);
 
 	return changed;
 }
@@ -377,7 +373,6 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		 */
 	}
 
-	down_read(&vcpu->kvm->slots_lock);
 	/*
 	 * Does the new cr3 value map to physical memory? (Note, we
 	 * catch an invalid cr3 even in real-mode, because it would
@@ -393,7 +388,6 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		vcpu->arch.cr3 = cr3;
 		vcpu->arch.mmu.new_cr3(vcpu);
 	}
-	up_read(&vcpu->kvm->slots_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
@@ -503,7 +497,6 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
 	version++;
 
-	down_read(&kvm->slots_lock);
 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 
 	wc_ts = current_kernel_time();
@@ -515,7 +508,6 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
 	version++;
 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
-	up_read(&kvm->slots_lock);
 }
 
 static void kvm_write_guest_time(struct kvm_vcpu *v)
@@ -609,10 +601,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		vcpu->arch.hv_clock.tsc_shift = 22;
 
 		down_read(&current->mm->mmap_sem);
-		down_read(&vcpu->kvm->slots_lock);
 		vcpu->arch.time_page =
 				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
-		up_read(&vcpu->kvm->slots_lock);
 		up_read(&current->mm->mmap_sem);
 
 		if (is_error_page(vcpu->arch.time_page)) {
@@ -715,9 +705,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 
 	vcpu_load(vcpu);
 
+	down_read(&vcpu->kvm->slots_lock);
 	for (i = 0; i < msrs->nmsrs; ++i)
 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 			break;
+	up_read(&vcpu->kvm->slots_lock);
 
 	vcpu_put(vcpu);
 
@@ -1768,7 +1760,6 @@ int emulator_read_std(unsigned long addr,
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
-	down_read(&vcpu->kvm->slots_lock);
 	while (bytes) {
 		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 		unsigned offset = addr & (PAGE_SIZE-1);
@@ -1790,7 +1781,6 @@ int emulator_read_std(unsigned long addr,
 		addr += tocopy;
 	}
 out:
-	up_read(&vcpu->kvm->slots_lock);
 	return r;
 }
 EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1809,9 +1799,7 @@ static int emulator_read_emulated(unsigned long addr,
 		return X86EMUL_CONTINUE;
 	}
 
-	down_read(&vcpu->kvm->slots_lock);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-	up_read(&vcpu->kvm->slots_lock);
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1844,7 +1832,7 @@ mmio:
 	return X86EMUL_UNHANDLEABLE;
 }
 
-int __emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes)
 {
 	int ret;
@@ -1856,17 +1844,6 @@ int __emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 	return 1;
 }
 
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-			const void *val, int bytes)
-{
-	int ret;
-
-	down_read(&vcpu->kvm->slots_lock);
-	ret =__emulator_write_phys(vcpu, gpa, val, bytes);
-	up_read(&vcpu->kvm->slots_lock);
-	return ret;
-}
-
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
@@ -1875,9 +1852,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 	struct kvm_io_device *mmio_dev;
 	gpa_t                 gpa;
 
-	down_read(&vcpu->kvm->slots_lock);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
-	up_read(&vcpu->kvm->slots_lock);
 
 	if (gpa == UNMAPPED_GVA) {
 		kvm_inject_page_fault(vcpu, addr, 2);
@@ -1954,7 +1929,6 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 		char *kaddr;
 		u64 val;
 
-		down_read(&vcpu->kvm->slots_lock);
 		gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 
 		if (gpa == UNMAPPED_GVA ||
@@ -1974,9 +1948,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
 		kunmap_atomic(kaddr, KM_USER0);
 		kvm_release_page_dirty(page);
-	emul_write:
-		up_read(&vcpu->kvm->slots_lock);
 	}
+emul_write:
 #endif
 
 	return emulator_write_emulated(addr, new, bytes, vcpu);
@@ -2368,10 +2341,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		kvm_x86_ops->skip_emulated_instruction(vcpu);
 
 	for (i = 0; i < nr_pages; ++i) {
-		down_read(&vcpu->kvm->slots_lock);
 		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
 		vcpu->arch.pio.guest_pages[i] = page;
-		up_read(&vcpu->kvm->slots_lock);
 		if (!page) {
 			kvm_inject_gp(vcpu, 0);
 			free_pio_guest_pages(vcpu);
@@ -2445,7 +2416,9 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 	++vcpu->stat.halt_exits;
 	if (irqchip_in_kernel(vcpu->kvm)) {
 		vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+		up_read(&vcpu->kvm->slots_lock);
 		kvm_vcpu_block(vcpu);
+		down_read(&vcpu->kvm->slots_lock);
 		if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
 			return -EINTR;
 		return 1;
@@ -2738,6 +2711,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 	}
 
+	down_read(&vcpu->kvm->slots_lock);
 	vapic_enter(vcpu);
 
 preempted:
@@ -2811,6 +2785,8 @@ again:
 
 	kvm_lapic_sync_to_vapic(vcpu);
 
+	up_read(&vcpu->kvm->slots_lock);
+
 	vcpu->guest_mode = 1;
 	kvm_guest_enter();
 
@@ -2837,6 +2813,8 @@ again:
 
 	preempt_enable();
 
+	down_read(&vcpu->kvm->slots_lock);
+
 	/*
 	 * Profile KVM exit RIPs:
 	 */
@@ -2864,14 +2842,18 @@ again:
 	}
 
 out:
+	up_read(&vcpu->kvm->slots_lock);
 	if (r > 0) {
 		kvm_resched(vcpu);
+		down_read(&vcpu->kvm->slots_lock);
 		goto preempted;
 	}
 
 	post_kvm_run_save(vcpu, kvm_run);
 
+	down_read(&vcpu->kvm->slots_lock);
 	vapic_exit(vcpu);
+	up_read(&vcpu->kvm->slots_lock);
 
 	return r;
 }
@@ -2906,9 +2888,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 		vcpu->mmio_read_completed = 1;
 		vcpu->mmio_needed = 0;
+
+		down_read(&vcpu->kvm->slots_lock);
 		r = emulate_instruction(vcpu, kvm_run,
 					vcpu->arch.mmio_fault_cr2, 0,
 					EMULTYPE_NO_DECODE);
+		up_read(&vcpu->kvm->slots_lock);
 		if (r == EMULATE_DO_MMIO) {
 			/*
 			 * Read-modify-write.  Back to userspace.
@@ -3817,7 +3802,9 @@ fail:
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	kvm_free_lapic(vcpu);
+	down_read(&vcpu->kvm->slots_lock);
 	kvm_mmu_destroy(vcpu);
+	up_read(&vcpu->kvm->slots_lock);
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
-- 
cgit v1.2.3


From 3ee16c814511cd58f956b47b9c7654f57f674688 Mon Sep 17 00:00:00 2001
From: Izik Eidus <izike@qumranet.com>
Date: Sun, 30 Mar 2008 15:17:21 +0300
Subject: KVM: MMU: allow the vm to shrink the kvm mmu shadow caches

Allow the Linux memory manager to reclaim memory in the kvm shadow cache.

Signed-off-by: Izik Eidus <izike@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c563283cb982..1594ee06c920 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1966,7 +1966,53 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 	kvm_flush_remote_tlbs(kvm);
 }
 
-void kvm_mmu_module_exit(void)
+void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
+{
+	struct kvm_mmu_page *page;
+
+	page = container_of(kvm->arch.active_mmu_pages.prev,
+			    struct kvm_mmu_page, link);
+	kvm_mmu_zap_page(kvm, page);
+}
+
+static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
+{
+	struct kvm *kvm;
+	struct kvm *kvm_freed = NULL;
+	int cache_count = 0;
+
+	spin_lock(&kvm_lock);
+
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		int npages;
+
+		spin_lock(&kvm->mmu_lock);
+		npages = kvm->arch.n_alloc_mmu_pages -
+			 kvm->arch.n_free_mmu_pages;
+		cache_count += npages;
+		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+			kvm_mmu_remove_one_alloc_mmu_page(kvm);
+			cache_count--;
+			kvm_freed = kvm;
+		}
+		nr_to_scan--;
+
+		spin_unlock(&kvm->mmu_lock);
+	}
+	if (kvm_freed)
+		list_move_tail(&kvm_freed->vm_list, &vm_list);
+
+	spin_unlock(&kvm_lock);
+
+	return cache_count;
+}
+
+static struct shrinker mmu_shrinker = {
+	.shrink = mmu_shrink,
+	.seeks = DEFAULT_SEEKS * 10,
+};
+
+void mmu_destroy_caches(void)
 {
 	if (pte_chain_cache)
 		kmem_cache_destroy(pte_chain_cache);
@@ -1976,6 +2022,12 @@ void kvm_mmu_module_exit(void)
 		kmem_cache_destroy(mmu_page_header_cache);
 }
 
+void kvm_mmu_module_exit(void)
+{
+	mmu_destroy_caches();
+	unregister_shrinker(&mmu_shrinker);
+}
+
 int kvm_mmu_module_init(void)
 {
 	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -1995,10 +2047,12 @@ int kvm_mmu_module_init(void)
 	if (!mmu_page_header_cache)
 		goto nomem;
 
+	register_shrinker(&mmu_shrinker);
+
 	return 0;
 
 nomem:
-	kvm_mmu_module_exit();
+	mmu_destroy_caches();
 	return -ENOMEM;
 }
 
-- 
cgit v1.2.3


From 3d45830c2b11a9d756faae161742b7d1ec417f7e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 25 Mar 2008 11:26:13 +0200
Subject: KVM: Free apic access page on vm destruction

Noticed by Marcelo Tosatti.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e6a38bf9a45e..c7ad2352227a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3853,6 +3853,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kfree(kvm->arch.vioapic);
 	kvm_free_vcpus(kvm);
 	kvm_free_physmem(kvm);
+	if (kvm->arch.apic_access_page)
+		put_page(kvm->arch.apic_access_page);
 	kfree(kvm);
 }
 
-- 
cgit v1.2.3


From fcd6dbac9267c1c06a205ad8bb4bd027c0ace7f7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Apr 2008 12:02:21 +0300
Subject: KVM: MMU: Only mark_page_accessed() if the page was accessed by the
 guest

If the accessed bit is not set, the guest has never accessed this page
(at least through this spte), so there's no need to mark the page
accessed.  This provides more accurate data for the eviction algortithm.

Noted by Andrea Arcangeli.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1594ee06c920..5c4c16662c68 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -549,7 +549,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 		return;
 	sp = page_header(__pa(spte));
 	page = spte_to_page(*spte);
-	mark_page_accessed(page);
+	if (*spte & PT_ACCESSED_MASK)
+		mark_page_accessed(page);
 	if (is_writeble_pte(*spte))
 		kvm_release_page_dirty(page);
 	else
-- 
cgit v1.2.3


From bed1d1dfc4a458d82bcd258082638cbba860190d Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 4 Apr 2008 14:56:44 -0300
Subject: KVM: MMU: prepopulate guest pages after write-protecting

Zdenek reported a bug where a looping "dmsetup status" eventually hangs
on SMP guests.

The problem is that kvm_mmu_get_page() prepopulates the shadow MMU
before write protecting the guest page tables. By doing so, it leaves a
window open where the guest can mark a pte as present while the host has
shadow cached such pte as "notrap". Accesses to such address will fault
in the guest without the host having a chance to fix the situation.

Fix by moving the write protection before the pte prefetch.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5c4c16662c68..c89bf230af67 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -852,9 +852,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	sp->gfn = gfn;
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
-	vcpu->arch.mmu.prefetch_page(vcpu, sp);
 	if (!metaphysical)
 		rmap_write_protect(vcpu->kvm, gfn);
+	vcpu->arch.mmu.prefetch_page(vcpu, sp);
 	return sp;
 }
 
-- 
cgit v1.2.3


From 35149e2129fe34fc8cb5917e1ecf5156b0fa3415 Mon Sep 17 00:00:00 2001
From: Anthony Liguori <aliguori@us.ibm.com>
Date: Wed, 2 Apr 2008 14:46:56 -0500
Subject: KVM: MMU: Don't assume struct page for x86

This patch introduces a gfn_to_pfn() function and corresponding functions like
kvm_release_pfn_dirty().  Using these new functions, we can modify the x86
MMU to no longer assume that it can always get a struct page for any given gfn.

We don't want to eliminate gfn_to_page() entirely because a number of places
assume they can do gfn_to_page() and then kmap() the results.  When we support
IO memory, gfn_to_page() will fail for IO pages although gfn_to_pfn() will
succeed.

This does not implement support for avoiding reference counting for reserved
RAM or for IO memory.  However, it should make those things pretty straight
forward.

Since we're only introducing new common symbols, I don't think it will break
the non-x86 architectures but I haven't tested those.  I've tested Intel,
AMD, NPT, and hugetlbfs with Windows and Linux guests.

[avi: fix overflow when shifting left pfns by adding casts]

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 89 ++++++++++++++++++++++------------------------
 arch/x86/kvm/paging_tmpl.h | 26 +++++++-------
 2 files changed, 56 insertions(+), 59 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c89bf230af67..078a7f1ac34c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -240,11 +240,9 @@ static int is_rmap_pte(u64 pte)
 	return is_shadow_present_pte(pte);
 }
 
-static struct page *spte_to_page(u64 pte)
+static pfn_t spte_to_pfn(u64 pte)
 {
-	hfn_t hfn = (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-
-	return pfn_to_page(hfn);
+	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
 
 static gfn_t pse36_gfn_delta(u32 gpte)
@@ -541,20 +539,20 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	struct kvm_rmap_desc *desc;
 	struct kvm_rmap_desc *prev_desc;
 	struct kvm_mmu_page *sp;
-	struct page *page;
+	pfn_t pfn;
 	unsigned long *rmapp;
 	int i;
 
 	if (!is_rmap_pte(*spte))
 		return;
 	sp = page_header(__pa(spte));
-	page = spte_to_page(*spte);
+	pfn = spte_to_pfn(*spte);
 	if (*spte & PT_ACCESSED_MASK)
-		mark_page_accessed(page);
+		kvm_set_pfn_accessed(pfn);
 	if (is_writeble_pte(*spte))
-		kvm_release_page_dirty(page);
+		kvm_release_pfn_dirty(pfn);
 	else
-		kvm_release_page_clean(page);
+		kvm_release_pfn_clean(pfn);
 	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -635,11 +633,11 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		spte = rmap_next(kvm, rmapp, spte);
 	}
 	if (write_protected) {
-		struct page *page;
+		pfn_t pfn;
 
 		spte = rmap_next(kvm, rmapp, NULL);
-		page = spte_to_page(*spte);
-		SetPageDirty(page);
+		pfn = spte_to_pfn(*spte);
+		kvm_set_pfn_dirty(pfn);
 	}
 
 	/* check for huge page mappings */
@@ -1036,7 +1034,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
 			 int *ptwrite, int largepage, gfn_t gfn,
-			 struct page *page, bool speculative)
+			 pfn_t pfn, bool speculative)
 {
 	u64 spte;
 	int was_rmapped = 0;
@@ -1058,10 +1056,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, shadow_pte);
-		} else if (page != spte_to_page(*shadow_pte)) {
+		} else if (pfn != spte_to_pfn(*shadow_pte)) {
 			pgprintk("hfn old %lx new %lx\n",
-				 page_to_pfn(spte_to_page(*shadow_pte)),
-				 page_to_pfn(page));
+				 spte_to_pfn(*shadow_pte), pfn);
 			rmap_remove(vcpu->kvm, shadow_pte);
 		} else {
 			if (largepage)
@@ -1090,7 +1087,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	if (largepage)
 		spte |= PT_PAGE_SIZE_MASK;
 
-	spte |= page_to_phys(page);
+	spte |= (u64)pfn << PAGE_SHIFT;
 
 	if ((pte_access & ACC_WRITE_MASK)
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
@@ -1135,12 +1132,12 @@ unshadowed:
 	if (!was_rmapped) {
 		rmap_add(vcpu, shadow_pte, gfn, largepage);
 		if (!is_rmap_pte(*shadow_pte))
-			kvm_release_page_clean(page);
+			kvm_release_pfn_clean(pfn);
 	} else {
 		if (was_writeble)
-			kvm_release_page_dirty(page);
+			kvm_release_pfn_dirty(pfn);
 		else
-			kvm_release_page_clean(page);
+			kvm_release_pfn_clean(pfn);
 	}
 	if (!ptwrite || !*ptwrite)
 		vcpu->arch.last_pte_updated = shadow_pte;
@@ -1151,7 +1148,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			   int largepage, gfn_t gfn, struct page *page,
+			   int largepage, gfn_t gfn, pfn_t pfn,
 			   int level)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
@@ -1166,13 +1163,13 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 0, gfn, page, false);
+				     0, write, 1, &pt_write, 0, gfn, pfn, false);
 			return pt_write;
 		}
 
 		if (largepage && level == 2) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 1, gfn, page, false);
+				     0, write, 1, &pt_write, 1, gfn, pfn, false);
 			return pt_write;
 		}
 
@@ -1187,7 +1184,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 						     1, ACC_ALL, &table[index]);
 			if (!new_table) {
 				pgprintk("nonpaging_map: ENOMEM\n");
-				kvm_release_page_clean(page);
+				kvm_release_pfn_clean(pfn);
 				return -ENOMEM;
 			}
 
@@ -1202,8 +1199,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
 	int largepage = 0;
-
-	struct page *page;
+	pfn_t pfn;
 
 	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1211,18 +1207,18 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 		largepage = 1;
 	}
 
-	page = gfn_to_page(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
-	if (is_error_page(page)) {
-		kvm_release_page_clean(page);
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
 		return 1;
 	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, largepage, gfn, page,
+	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
 			 PT32E_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
@@ -1355,7 +1351,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 				u32 error_code)
 {
-	struct page *page;
+	pfn_t pfn;
 	int r;
 	int largepage = 0;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -1372,16 +1368,16 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
-	page = gfn_to_page(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
-	if (is_error_page(page)) {
-		kvm_release_page_clean(page);
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
 		return 1;
 	}
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 largepage, gfn, page, TDP_ROOT_LEVEL);
+			 largepage, gfn, pfn, TDP_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
@@ -1525,6 +1521,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.update_pte.pfn = bad_pfn;
+
 	if (tdp_enabled)
 		return init_kvm_tdp_mmu(vcpu);
 	else
@@ -1644,7 +1642,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	gfn_t gfn;
 	int r;
 	u64 gpte = 0;
-	struct page *page;
+	pfn_t pfn;
 
 	vcpu->arch.update_pte.largepage = 0;
 
@@ -1680,15 +1678,15 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
-	page = gfn_to_page(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
-	if (is_error_page(page)) {
-		kvm_release_page_clean(page);
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
 		return;
 	}
 	vcpu->arch.update_pte.gfn = gfn;
-	vcpu->arch.update_pte.page = page;
+	vcpu->arch.update_pte.pfn = pfn;
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1793,9 +1791,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	kvm_mmu_audit(vcpu, "post pte write");
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	if (vcpu->arch.update_pte.page) {
-		kvm_release_page_clean(vcpu->arch.update_pte.page);
-		vcpu->arch.update_pte.page = NULL;
+	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
+		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
+		vcpu->arch.update_pte.pfn = bad_pfn;
 	}
 }
 
@@ -2236,8 +2234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 			audit_mappings_page(vcpu, ent, va, level - 1);
 		} else {
 			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
-			struct page *page = gpa_to_page(vcpu, gpa);
-			hpa_t hpa = page_to_phys(page);
+			hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
 
 			if (is_shadow_present_pte(ent)
 			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
@@ -2250,7 +2247,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 				 && !is_error_hpa(hpa))
 				printk(KERN_ERR "audit: (%s) notrap shadow,"
 				       " valid guest gva %lx\n", audit_msg, va);
-			kvm_release_page_clean(page);
+			kvm_release_pfn_clean(pfn);
 
 		}
 	}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 57d872aec663..156fe10288ae 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -247,7 +247,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 {
 	pt_element_t gpte;
 	unsigned pte_access;
-	struct page *npage;
+	pfn_t pfn;
 	int largepage = vcpu->arch.update_pte.largepage;
 
 	gpte = *(const pt_element_t *)pte;
@@ -260,13 +260,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
 	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
 		return;
-	npage = vcpu->arch.update_pte.page;
-	if (!npage)
+	pfn = vcpu->arch.update_pte.pfn;
+	if (is_error_pfn(pfn))
 		return;
-	get_page(npage);
+	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
-		     npage, true);
+		     pfn, true);
 }
 
 /*
@@ -275,7 +275,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *walker,
 			 int user_fault, int write_fault, int largepage,
-			 int *ptwrite, struct page *page)
+			 int *ptwrite, pfn_t pfn)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -336,7 +336,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 						  walker->pte_gpa[level - 2],
 						  &curr_pte, sizeof(curr_pte));
 			if (r || curr_pte != walker->ptes[level - 2]) {
-				kvm_release_page_clean(page);
+				kvm_release_pfn_clean(pfn);
 				return NULL;
 			}
 		}
@@ -349,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, largepage, walker->gfn, page, false);
+		     ptwrite, largepage, walker->gfn, pfn, false);
 
 	return shadow_ent;
 }
@@ -378,7 +378,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	u64 *shadow_pte;
 	int write_pt = 0;
 	int r;
-	struct page *page;
+	pfn_t pfn;
 	int largepage = 0;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -413,20 +413,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 			largepage = 1;
 		}
 	}
-	page = gfn_to_page(vcpu->kvm, walker.gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
-	if (is_error_page(page)) {
+	if (is_error_pfn(pfn)) {
 		pgprintk("gfn %x is mmio\n", walker.gfn);
-		kvm_release_page_clean(page);
+		kvm_release_pfn_clean(pfn);
 		return 1;
 	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  largepage, &write_pt, page);
+				  largepage, &write_pt, pfn);
 
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 shadow_pte, *shadow_pte, write_pt);
-- 
cgit v1.2.3


From ec077263b2bb841d973d82342b7fbc07bbad4246 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 9 Apr 2008 14:15:28 +0200
Subject: KVM: SVM: indent svm_set_cr4 with tabs instead of spaces

The svm_set_cr4 function is indented with spaces. This patch replaces
them with tabs.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ad273468c08a..d7439ceb2ac2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -878,10 +878,10 @@ set:
 
 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       vcpu->arch.cr4 = cr4;
-       if (!npt_enabled)
-	       cr4 |= X86_CR4_PAE;
-       to_svm(vcpu)->vmcb->save.cr4 = cr4;
+	vcpu->arch.cr4 = cr4;
+	if (!npt_enabled)
+		cr4 |= X86_CR4_PAE;
+	to_svm(vcpu)->vmcb->save.cr4 = cr4;
 }
 
 static void svm_set_segment(struct kvm_vcpu *vcpu,
-- 
cgit v1.2.3


From 6394b6494c0a352a2db3ea3e891ba7aeea7c1441 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 9 Apr 2008 14:15:29 +0200
Subject: KVM: SVM: align shadow CR4.MCE with host

This patch aligns the host version of the CR4.MCE bit with the CR4 active in
the guest. This is necessary to get MCE exceptions when the guest is running.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d7439ceb2ac2..8af463b91526 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -878,9 +878,12 @@ set:
 
 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
+	unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+
 	vcpu->arch.cr4 = cr4;
 	if (!npt_enabled)
 		cr4 |= X86_CR4_PAE;
+	cr4 |= host_cr4_mce;
 	to_svm(vcpu)->vmcb->save.cr4 = cr4;
 }
 
-- 
cgit v1.2.3


From 53371b5098543ab09dcb0c7ce31da887dbe58c62 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 9 Apr 2008 14:15:30 +0200
Subject: KVM: SVM: add intercept for machine check exception

To properly forward a MCE occured while the guest is running to the host, we
have to intercept this exception and call the host handler by hand. This is
implemented by this patch.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8af463b91526..da3ddef47605 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -507,7 +507,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 					INTERCEPT_DR7_MASK;
 
 	control->intercept_exceptions = (1 << PF_VECTOR) |
-					(1 << UD_VECTOR);
+					(1 << UD_VECTOR) |
+					(1 << MC_VECTOR);
 
 
 	control->intercept = 	(1ULL << INTERCEPT_INTR) |
@@ -1044,6 +1045,19 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	/*
+	 * On an #MC intercept the MCE handler is not called automatically in
+	 * the host. So do it by hand here.
+	 */
+	asm volatile (
+		"int $0x12\n");
+	/* not sure if we ever come back to this point */
+
+	return 1;
+}
+
 static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	/*
@@ -1367,6 +1381,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
 	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
 	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
+	[SVM_EXIT_EXCP_BASE + MC_VECTOR] 	= mc_interception,
 	[SVM_EXIT_INTR] 			= nop_on_interception,
 	[SVM_EXIT_NMI]				= nop_on_interception,
 	[SVM_EXIT_SMI]				= nop_on_interception,
-- 
cgit v1.2.3


From 2714d1d3d6be882b97cd0125140fccf9976a460a Mon Sep 17 00:00:00 2001
From: "Feng (Eric) Liu" <eric.e.liu@intel.com>
Date: Thu, 10 Apr 2008 15:31:10 -0400
Subject: KVM: Add trace markers

Trace markers allow userspace to trace execution of a virtual machine
in order to monitor its performance.

Signed-off-by: Feng (Eric) Liu <eric.e.liu@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 35 ++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6249810b2155..8e5d6645b90d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1843,6 +1843,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
+
 	if (vcpu->arch.rmode.active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = irq;
@@ -1993,6 +1995,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
+		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
+			    (u32)((u64)cr2 >> 32), handler);
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
 
@@ -2021,6 +2025,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
 				     struct kvm_run *kvm_run)
 {
 	++vcpu->stat.irq_exits;
+	KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
 	return 1;
 }
 
@@ -2078,6 +2083,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	reg = (exit_qualification >> 8) & 15;
 	switch ((exit_qualification >> 4) & 3) {
 	case 0: /* mov to cr */
+		KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
+			    (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
 		switch (cr) {
 		case 0:
 			vcpu_load_rsp_rip(vcpu);
@@ -2110,6 +2117,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		vcpu->arch.cr0 &= ~X86_CR0_TS;
 		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
 		vmx_fpu_activate(vcpu);
+		KVMTRACE_0D(CLTS, vcpu, handler);
 		skip_emulated_instruction(vcpu);
 		return 1;
 	case 1: /*mov from cr*/
@@ -2118,12 +2126,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			vcpu_load_rsp_rip(vcpu);
 			vcpu->arch.regs[reg] = vcpu->arch.cr3;
 			vcpu_put_rsp_rip(vcpu);
+			KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
+				    (u32)vcpu->arch.regs[reg],
+				    (u32)((u64)vcpu->arch.regs[reg] >> 32),
+				    handler);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 8:
 			vcpu_load_rsp_rip(vcpu);
 			vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
 			vcpu_put_rsp_rip(vcpu);
+			KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
+				    (u32)vcpu->arch.regs[reg], handler);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		}
@@ -2169,6 +2183,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			val = 0;
 		}
 		vcpu->arch.regs[reg] = val;
+		KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
 	} else {
 		/* mov to dr */
 	}
@@ -2193,6 +2208,9 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		return 1;
 	}
 
+	KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
+		    handler);
+
 	/* FIXME: handling of bits 32:63 of rax, rdx */
 	vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
 	vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
@@ -2206,6 +2224,9 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
 		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
+	KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
+		    handler);
+
 	if (vmx_set_msr(vcpu, ecx, data) != 0) {
 		kvm_inject_gp(vcpu, 0);
 		return 1;
@@ -2230,6 +2251,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+	KVMTRACE_0D(PEND_INTR, vcpu, handler);
+
 	/*
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
@@ -2272,6 +2296,8 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
 	offset = exit_qualification & 0xffful;
 
+	KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
+
 	er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
 
 	if (er !=  EMULATE_DONE) {
@@ -2335,6 +2361,9 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
+	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
+		    (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
+
 	if (unlikely(vmx->fail)) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2416,6 +2445,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 			return;
 		}
 
+		KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
+
 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
 				vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
@@ -2601,8 +2632,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
 	/* We need to handle NMIs before interrupts are enabled */
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
+		KVMTRACE_0D(NMI, vcpu, handler);
 		asm("int $2");
+	}
 }
 
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c7ad2352227a..f070f0a9adee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -303,6 +303,9 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
 	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+	KVMTRACE_1D(LMSW, vcpu,
+		    (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
+		    handler);
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
@@ -2269,6 +2272,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	vcpu->arch.pio.guest_page_offset = 0;
 	vcpu->arch.pio.rep = 0;
 
+	if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
+		KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
+			    handler);
+	else
+		KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
+			    handler);
+
 	kvm_x86_ops->cache_regs(vcpu);
 	memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
 	kvm_x86_ops->decache_regs(vcpu);
@@ -2307,6 +2317,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	vcpu->arch.pio.guest_page_offset = offset_in_page(address);
 	vcpu->arch.pio.rep = rep;
 
+	if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
+		KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
+			    handler);
+	else
+		KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
+			    handler);
+
 	if (!count) {
 		kvm_x86_ops->skip_emulated_instruction(vcpu);
 		return 1;
@@ -2414,6 +2431,7 @@ void kvm_arch_exit(void)
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.halt_exits;
+	KVMTRACE_0D(HLT, vcpu, handler);
 	if (irqchip_in_kernel(vcpu->kvm)) {
 		vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
 		up_read(&vcpu->kvm->slots_lock);
@@ -2451,6 +2469,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	a2 = vcpu->arch.regs[VCPU_REGS_RDX];
 	a3 = vcpu->arch.regs[VCPU_REGS_RSI];
 
+	KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+
 	if (!is_long_mode(vcpu)) {
 		nr &= 0xFFFFFFFF;
 		a0 &= 0xFFFFFFFF;
@@ -2639,6 +2659,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 	}
 	kvm_x86_ops->decache_regs(vcpu);
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	KVMTRACE_5D(CPUID, vcpu, function,
+		    (u32)vcpu->arch.regs[VCPU_REGS_RAX],
+		    (u32)vcpu->arch.regs[VCPU_REGS_RBX],
+		    (u32)vcpu->arch.regs[VCPU_REGS_RCX],
+		    (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 
@@ -2794,6 +2819,7 @@ again:
 		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
 			kvm_x86_ops->tlb_flush(vcpu);
 
+	KVMTRACE_0D(VMENTRY, vcpu, entryexit);
 	kvm_x86_ops->run(vcpu, kvm_run);
 
 	vcpu->guest_mode = 0;
-- 
cgit v1.2.3


From d4c9ff2d1b78e385471b3f4d80c0596909926ef7 Mon Sep 17 00:00:00 2001
From: "Feng(Eric) Liu" <eric.e.liu@intel.com>
Date: Thu, 10 Apr 2008 08:47:53 -0400
Subject: KVM: Add kvm trace userspace interface

This interface allows user a space application to read the trace of kvm
related events through relayfs.

Signed-off-by: Feng (Eric) Liu <eric.e.liu@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/Kconfig  | 11 +++++++++++
 arch/x86/kvm/Makefile |  3 +++
 2 files changed, 14 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 76c70ab44382..8d45fabc5f3b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,17 @@ config KVM_AMD
 	  Provides support for KVM on AMD processors equipped with the AMD-V
 	  (SVM) extensions.
 
+config KVM_TRACE
+	bool "KVM trace support"
+	depends on KVM && MARKERS && SYSFS
+	select RELAY
+	select DEBUG_FS
+	default n
+	---help---
+	  This option allows reading a trace of kvm-related events through
+	  relayfs.  Note the ABI is not considered stable and will be
+	  modified in future updates.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4d0c22e11f1a..c97d35c218db 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,6 +3,9 @@
 #
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+ifeq ($(CONFIG_KVM_TRACE),y)
+common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+endif
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
-- 
cgit v1.2.3


From 3564990af1b9f77a63692c1079e9c41af229f066 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 9 Apr 2008 16:04:32 +0200
Subject: KVM: SVM: do not intercept task switch with NPT

When KVM uses NPT there is no reason to intercept task switches. This patch
removes the intercept for it in that case.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index da3ddef47605..8d04aed72f3a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -591,6 +591,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	if (npt_enabled) {
 		/* Setup VMCB for Nested Paging */
 		control->nested_ctl = 1;
+		control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
 		control->intercept_exceptions &= ~(1 << PF_VECTOR);
 		control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
 						INTERCEPT_CR3_MASK);
-- 
cgit v1.2.3


From 3d80840d96127401ba6aeadd813c3a15b84e70fe Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 11 Apr 2008 14:53:26 -0300
Subject: KVM: hlt emulation should take in-kernel APIC/PIT timers into account

Timers that fire between guest hlt and vcpu_block's add_wait_queue() are
ignored, possibly resulting in hangs.

Also make sure that atomic_inc and waitqueue_active tests happen in the
specified order, otherwise the following race is open:

CPU0                                        CPU1
                                            if (waitqueue_active(wq))
add_wait_queue()
if (!atomic_read(pit_timer->pending))
    schedule()
                                            atomic_inc(pit_timer->pending)

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c | 10 ++++++++++
 arch/x86/kvm/irq.c   | 15 +++++++++++++++
 arch/x86/kvm/irq.h   |  3 +++
 arch/x86/kvm/lapic.c | 10 ++++++++++
 4 files changed, 38 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 9f118e2f350d..ed1af80432b3 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -212,6 +212,16 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
 	return (pt->period == 0 ? 0 : 1);
 }
 
+int pit_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+
+	if (pit && vcpu->vcpu_id == 0)
+		return atomic_read(&pit->pit_state.pit_timer.pending);
+
+	return 0;
+}
+
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
 	struct kvm_kpit_state *ps;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index dbfe21c99c48..ce1f583459b1 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -25,6 +25,21 @@
 #include "irq.h"
 #include "i8254.h"
 
+/*
+ * check if there are pending timer events
+ * to be processed.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	ret = pit_has_pending_timer(vcpu);
+	ret |= apic_has_pending_timer(vcpu);
+
+	return ret;
+}
+EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+
 /*
  * check if there is pending interrupt without
  * intack.
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index fa5ed5d59b5d..1802134b836f 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -85,4 +85,7 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
 
+int pit_has_pending_timer(struct kvm_vcpu *vcpu);
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 31280df7d2e3..debf58211bdd 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -952,6 +952,16 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 	return result;
 }
 
+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *lapic = vcpu->arch.apic;
+
+	if (lapic)
+		return atomic_read(&lapic->timer.pending);
+
+	return 0;
+}
+
 static int __inject_apic_timer_irq(struct kvm_lapic *apic)
 {
 	int vector;
-- 
cgit v1.2.3


From a45352908b88d383bc40e1e4d1a6cc5bbcefc895 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 13 Apr 2008 17:54:35 +0300
Subject: KVM: Rename VCPU_MP_STATE_* to KVM_MP_STATE_*

We wish to export it to userspace, so move it into the kvm namespace.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c |  2 +-
 arch/x86/kvm/lapic.c | 16 ++++++++--------
 arch/x86/kvm/x86.c   | 18 +++++++++---------
 3 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ed1af80432b3..361e31611276 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -202,7 +202,7 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
 	smp_mb__after_atomic_inc();
 	/* FIXME: handle case where the guest is in guest mode */
 	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-		vcpu0->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+		vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 		wake_up_interruptible(&vcpu0->wq);
 	}
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index debf58211bdd..2ccf994dfc16 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -338,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
 			kvm_vcpu_kick(vcpu);
-		else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
-			vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+		else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
+			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 			if (waitqueue_active(&vcpu->wq))
 				wake_up_interruptible(&vcpu->wq);
 		}
@@ -362,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
 	case APIC_DM_INIT:
 		if (level) {
-			if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+			if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
 				printk(KERN_DEBUG
 				       "INIT on a runnable vcpu %d\n",
 				       vcpu->vcpu_id);
-			vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_vcpu_kick(vcpu);
 		} else {
 			printk(KERN_DEBUG
@@ -379,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_STARTUP:
 		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
 		       vcpu->vcpu_id, vector);
-		if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			vcpu->arch.sipi_vector = vector;
-			vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
 			if (waitqueue_active(&vcpu->wq))
 				wake_up_interruptible(&vcpu->wq);
 		}
@@ -940,7 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 
 	atomic_inc(&apic->timer.pending);
 	if (waitqueue_active(q)) {
-		apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+		apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 		wake_up_interruptible(q);
 	}
 	if (apic_lvtt_period(apic)) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f070f0a9adee..b364d192896c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2433,11 +2433,11 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 	++vcpu->stat.halt_exits;
 	KVMTRACE_0D(HLT, vcpu, handler);
 	if (irqchip_in_kernel(vcpu->kvm)) {
-		vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
 		up_read(&vcpu->kvm->slots_lock);
 		kvm_vcpu_block(vcpu);
 		down_read(&vcpu->kvm->slots_lock);
-		if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
+		if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
 			return -EINTR;
 		return 1;
 	} else {
@@ -2726,14 +2726,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
 
-	if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
 		pr_debug("vcpu %d received sipi with vector # %x\n",
 		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
 		kvm_lapic_reset(vcpu);
 		r = kvm_x86_ops->vcpu_reset(vcpu);
 		if (r)
 			return r;
-		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	}
 
 	down_read(&vcpu->kvm->slots_lock);
@@ -2891,7 +2891,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	vcpu_load(vcpu);
 
-	if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
 		vcpu_put(vcpu);
 		return -EAGAIN;
@@ -3794,9 +3794,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
-		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
-		vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
+		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
 
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
@@ -3936,8 +3936,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
-	       || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
+	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
+	       || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
 }
 
 static void vcpu_kick_intr(void *info)
-- 
cgit v1.2.3


From 62d9f0dbc92d7e398fde53fc6021338393522e68 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 11 Apr 2008 13:24:45 -0300
Subject: KVM: add ioctls to save/store mpstate

So userspace can save/restore the mpstate during migration.

[avi: export the #define constants describing the value]
[christian: add s390 stubs]
[avi: ditto for ia64]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b364d192896c..5c3c9d38c780 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -817,6 +817,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_CLOCKSOURCE:
 	case KVM_CAP_PIT:
 	case KVM_CAP_NOP_IO_DELAY:
+	case KVM_CAP_MP_STATE:
 		r = 1;
 		break;
 	case KVM_CAP_VAPIC:
@@ -3083,6 +3084,24 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+				    struct kvm_mp_state *mp_state)
+{
+	vcpu_load(vcpu);
+	mp_state->mp_state = vcpu->arch.mp_state;
+	vcpu_put(vcpu);
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+				    struct kvm_mp_state *mp_state)
+{
+	vcpu_load(vcpu);
+	vcpu->arch.mp_state = mp_state->mp_state;
+	vcpu_put(vcpu);
+	return 0;
+}
+
 static void set_segment(struct kvm_vcpu *vcpu,
 			struct kvm_segment *var, int seg)
 {
-- 
cgit v1.2.3


From e9571ed54b2a290d61b98ad6f369f963159fe6da Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 11 Apr 2008 15:01:22 -0300
Subject: KVM: fix kvm_vcpu_kick vs __vcpu_run race

There is a window open between testing of pending IRQ's
and assignment of guest_mode in __vcpu_run.

Injection of IRQ's can race with __vcpu_run as follows:

CPU0                                CPU1
kvm_x86_ops->run()
vcpu->guest_mode = 0                SET_IRQ_LINE ioctl
..
kvm_x86_ops->inject_pending_irq
kvm_cpu_has_interrupt()

                                    apic_test_and_set_irr()
                                    kvm_vcpu_kick
                                    if (vcpu->guest_mode)
                                        send_ipi()

vcpu->guest_mode = 1

So move guest_mode=1 assignment before ->inject_pending_irq, and make
sure that it won't reorder after it.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5c3c9d38c780..0ce556372a4d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2802,6 +2802,13 @@ again:
 		goto out;
 	}
 
+	vcpu->guest_mode = 1;
+	/*
+	 * Make sure that guest_mode assignment won't happen after
+	 * testing the pending IRQ vector bitmap.
+	 */
+	smp_wmb();
+
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else if (irqchip_in_kernel(vcpu->kvm))
@@ -2813,7 +2820,6 @@ again:
 
 	up_read(&vcpu->kvm->slots_lock);
 
-	vcpu->guest_mode = 1;
 	kvm_guest_enter();
 
 	if (vcpu->requests)
@@ -3970,11 +3976,17 @@ static void vcpu_kick_intr(void *info)
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
 	int ipi_pcpu = vcpu->cpu;
+	int cpu = get_cpu();
 
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		++vcpu->stat.halt_wakeup;
 	}
-	if (vcpu->guest_mode)
+	/*
+	 * We may be called synchronously with irqs disabled in guest mode,
+	 * So need not to call smp_call_function_single() in that case.
+	 */
+	if (vcpu->guest_mode && vcpu->cpu != cpu)
 		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+	put_cpu();
 }
-- 
cgit v1.2.3


From a79d2f1805da02d7837ec2240f0093c53272fb3a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 14 Apr 2008 13:10:21 +0300
Subject: KVM: SVM: force a new asid when initializing the vmcb

Shutdown interception clears the vmcb, leaving the asid at zero (which is
illegal.  so force a new asid on vmcb initialization.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8d04aed72f3a..3379e13d9b2c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -603,7 +603,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 		save->cr3 = 0;
 		save->cr4 = 0;
 	}
-
+	force_new_asid(&svm->vcpu);
 }
 
 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 66b85505736dbd3a3a0ed5ae38c12bb218b231c0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 14 Apr 2008 23:27:07 +0300
Subject: KVM: x86 emulator: initialize src.val and dst.val for register
 operands

This lets us treat the case where mod == 3 in the same manner as other cases.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f59ed93f5d24..8e1b32f2cd5e 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1001,6 +1001,7 @@ done_prefixes:
 		 */
 		if ((c->d & ModRM) && c->modrm_mod == 3) {
 			c->src.type = OP_REG;
+			c->src.val = c->modrm_val;
 			break;
 		}
 		c->src.type = OP_MEM;
@@ -1044,6 +1045,7 @@ done_prefixes:
 	case DstMem:
 		if ((c->d & ModRM) && c->modrm_mod == 3) {
 			c->dst.type = OP_REG;
+			c->dst.val = c->dst.orig_val = c->modrm_val;
 			break;
 		}
 		c->dst.type = OP_MEM;
-- 
cgit v1.2.3


From 16286d082d99cb41e16938fa6ba84604229f4b77 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 14 Apr 2008 14:40:50 +0300
Subject: KVM: x86 emulator: fix smsw and lmsw with a memory operand

lmsw and smsw were implemented only with a register operand.  Extend them
to support a memory operand as well.  Fixes Windows running some display
compatibility test on AMD hosts.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 8e1b32f2cd5e..46ef78f8bb35 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -275,12 +275,15 @@ static u16 group_table[] = {
 	SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem, ModRM | SrcMem,
-	SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, SrcMem | ModRM | ByteOp,
+	SrcNone | ModRM | DstMem | Mov, 0,
+	SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
 };
 
 static u16 group2_table[] = {
 	[Group7*8] =
-	SrcNone | ModRM, 0, 0, 0, SrcNone | ModRM | DstMem, 0, SrcMem | ModRM, 0,
+	SrcNone | ModRM, 0, 0, 0,
+	SrcNone | ModRM | DstMem | Mov, 0,
+	SrcMem16 | ModRM | Mov, 0,
 };
 
 /* EFLAGS bit definitions. */
@@ -1722,6 +1725,8 @@ twobyte_insn:
 				goto done;
 
 			kvm_emulate_hypercall(ctxt->vcpu);
+			/* Disable writeback. */
+			c->dst.type = OP_NONE;
 			break;
 		case 2: /* lgdt */
 			rc = read_descriptor(ctxt, ops, c->src.ptr,
@@ -1729,6 +1734,8 @@ twobyte_insn:
 			if (rc)
 				goto done;
 			realmode_lgdt(ctxt->vcpu, size, address);
+			/* Disable writeback. */
+			c->dst.type = OP_NONE;
 			break;
 		case 3: /* lidt/vmmcall */
 			if (c->modrm_mod == 3 && c->modrm_rm == 1) {
@@ -1744,27 +1751,25 @@ twobyte_insn:
 					goto done;
 				realmode_lidt(ctxt->vcpu, size, address);
 			}
+			/* Disable writeback. */
+			c->dst.type = OP_NONE;
 			break;
 		case 4: /* smsw */
-			if (c->modrm_mod != 3)
-				goto cannot_emulate;
-			*(u16 *)&c->regs[c->modrm_rm]
-				= realmode_get_cr(ctxt->vcpu, 0);
+			c->dst.bytes = 2;
+			c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
 			break;
 		case 6: /* lmsw */
-			if (c->modrm_mod != 3)
-				goto cannot_emulate;
-			realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
-						  &ctxt->eflags);
+			realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
+				      &ctxt->eflags);
 			break;
 		case 7: /* invlpg*/
 			emulate_invlpg(ctxt->vcpu, memop);
+			/* Disable writeback. */
+			c->dst.type = OP_NONE;
 			break;
 		default:
 			goto cannot_emulate;
 		}
-		/* Disable writeback. */
-		c->dst.type = OP_NONE;
 		break;
 	case 0x06:
 		emulate_clts(ctxt->vcpu);
-- 
cgit v1.2.3


From f9b7aab35cc6c3542203354d9fc4ec8572074abc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 14 Apr 2008 23:46:37 +0300
Subject: KVM: x86 emulator: fix lea to really get the effective address

We never hit this, since there is currently no reason to emulate lea.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 46ef78f8bb35..2ca08386f993 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1512,7 +1512,7 @@ special_insn:
 	case 0x88 ... 0x8b:	/* mov */
 		goto mov;
 	case 0x8d: /* lea r16/r32, m */
-		c->dst.val = c->modrm_val;
+		c->dst.val = c->modrm_ea;
 		break;
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = emulate_grp1a(ctxt, ops);
-- 
cgit v1.2.3


From 649d68643ebf02f31859ffbb16676aa44c72e6e9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 16 Apr 2008 16:51:15 +0200
Subject: KVM: SVM: sync TPR value to V_TPR field in the VMCB

This patch adds syncing of the lapic.tpr field to the V_TPR field of the VMCB.
With this change we can safely remove the CR8 read intercept.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3379e13d9b2c..f8ce36e6690c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -486,8 +486,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 
 	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
-					INTERCEPT_CR4_MASK |
-					INTERCEPT_CR8_MASK;
+					INTERCEPT_CR4_MASK;
 
 	control->intercept_cr_write = 	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
@@ -1621,6 +1620,19 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
 }
 
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	u64 cr8;
+
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return;
+
+	cr8 = kvm_get_cr8(vcpu);
+	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
+
 static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1630,6 +1642,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	pre_svm_run(svm);
 
+	sync_lapic_to_cr8(vcpu);
+
 	save_host_msrs(vcpu);
 	fs_selector = read_fs();
 	gs_selector = read_gs();
-- 
cgit v1.2.3


From ec7cf6903ffced20098e2bcc27a184172836dfb9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 16 Apr 2008 16:51:16 +0200
Subject: KVM: export kvm_lapic_set_tpr() to modules

This patch exports the kvm_lapic_set_tpr() function from the lapic code to
modules. It is required in the kvm-amd module to optimize CR8 intercepts.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/lapic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2ccf994dfc16..57ac4e4c556a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -822,6 +822,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 	apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
 		     | (apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
+EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
 
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
-- 
cgit v1.2.3


From d7bf8221a3037d0d0760a1ccf1833bda03213abf Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 16 Apr 2008 16:51:17 +0200
Subject: KVM: SVM: sync V_TPR with LAPIC.TPR if CR8 write intercept is
 disabled

If the CR8 write intercept is disabled the V_TPR field of the VMCB needs to be
synced with the TPR field in the local apic.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f8ce36e6690c..ee2ee83f3c48 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1620,6 +1620,16 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
 }
 
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+		kvm_lapic_set_tpr(vcpu, cr8);
+	}
+}
+
 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1791,6 +1801,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	stgi();
 
+	sync_cr8_to_lapic(vcpu);
+
 	svm->next_rip = 0;
 }
 
-- 
cgit v1.2.3


From aaacfc9ae225e88695e610a35627d2256dc08633 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 16 Apr 2008 16:51:18 +0200
Subject: KVM: SVM: disable CR8 intercept when tpr is not masking interrupts

This patch disables the intercept of CR8 writes if the TPR is not masking
interrupts. This reduces the total number CR8 intercepts to below 1 percent of
what we have without this patch using Windows 64 bit guests.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ee2ee83f3c48..61bb2cb51215 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1502,6 +1502,27 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
 	svm_inject_irq(svm, irq);
 }
 
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *vmcb = svm->vmcb;
+	int max_irr, tpr;
+
+	if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
+		return;
+
+	vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+
+	max_irr = kvm_lapic_find_highest_irr(vcpu);
+	if (max_irr == -1)
+		return;
+
+	tpr = kvm_lapic_get_cr8(vcpu) << 4;
+
+	if (tpr >= (max_irr & 0xf0))
+		vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+}
+
 static void svm_intr_assist(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1514,14 +1535,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
 			      SVM_EVTINJ_VEC_MASK;
 		vmcb->control.exit_int_info = 0;
 		svm_inject_irq(svm, intr_vector);
-		return;
+		goto out;
 	}
 
 	if (vmcb->control.int_ctl & V_IRQ_MASK)
-		return;
+		goto out;
 
 	if (!kvm_cpu_has_interrupt(vcpu))
-		return;
+		goto out;
 
 	if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
 	    (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
@@ -1529,12 +1550,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
 		/* unable to deliver irq, set pending irq */
 		vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
 		svm_inject_irq(svm, 0x0);
-		return;
+		goto out;
 	}
 	/* Okay, we can deliver the interrupt: grab it and update PIC state. */
 	intr_vector = kvm_cpu_get_interrupt(vcpu);
 	svm_inject_irq(svm, intr_vector);
 	kvm_timer_intr_post(vcpu, intr_vector);
+out:
+	update_cr8_intercept(vcpu);
 }
 
 static void kvm_reput_irq(struct vcpu_svm *svm)
-- 
cgit v1.2.3


From aaf697e4e02bf6f7dd6105877bc58ebdbf612d66 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 16 Apr 2008 16:51:19 +0200
Subject: KVM: SVM: remove now obsolete FIXME comment

With the usage of the V_TPR field this comment is now obsolete.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 61bb2cb51215..d643605c5aeb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -916,13 +916,6 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 
 }
 
-/* FIXME:
-
-	svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
-	svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
-
-*/
-
 static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 1336028b9a1fb33537eab8caec66e812eb8cad63 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 16 Apr 2008 17:01:05 +0200
Subject: KVM: SVM: remove selective CR0 comment

There is not selective cr0 intercept bug. The code in the comment sets the
CR0.PG bit. But KVM sets the CR4.PG bit for SVM always to implement the paged
real mode. So the 'mov %eax,%cr0' instruction does not change the CR0.PG bit.
Selective CR0 intercepts only occur when a bit is actually changed. So its the
right behavior that there is no intercept on this instruction.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d643605c5aeb..89e0be2c10d0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -513,17 +513,6 @@ static void init_vmcb(struct vcpu_svm *svm)
 	control->intercept = 	(1ULL << INTERCEPT_INTR) |
 				(1ULL << INTERCEPT_NMI) |
 				(1ULL << INTERCEPT_SMI) |
-		/*
-		 * selective cr0 intercept bug?
-		 *    	0:   0f 22 d8                mov    %eax,%cr3
-		 *	3:   0f 20 c0                mov    %cr0,%eax
-		 *	6:   0d 00 00 00 80          or     $0x80000000,%eax
-		 *	b:   0f 22 c0                mov    %eax,%cr0
-		 * set cr3 ->interception
-		 * get cr0 ->interception
-		 * set cr0 -> no interception
-		 */
-		/*              (1ULL << INTERCEPT_SELECTIVE_CR0) | */
 				(1ULL << INTERCEPT_CPUID) |
 				(1ULL << INTERCEPT_INVD) |
 				(1ULL << INTERCEPT_HLT) |
-- 
cgit v1.2.3


From 960b3991698872f68f09d51f4c2794ad484fe1fd Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 16 Apr 2008 17:19:06 -0300
Subject: KVM: MMU: kvm_pv_mmu_op should not take mmap_sem

kvm_pv_mmu_op should not take mmap_sem. All gfn_to_page() callers down
in the MMU processing will take it if necessary, so as it is it can
deadlock.

Apparently a leftover from the days before slots_lock.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 078a7f1ac34c..2ad6f5481671 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2173,8 +2173,6 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 	int r;
 	struct kvm_pv_mmu_op_buffer buffer;
 
-	down_read(&current->mm->mmap_sem);
-
 	buffer.ptr = buffer.buf;
 	buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
 	buffer.processed = 0;
@@ -2194,7 +2192,6 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 	r = 1;
 out:
 	*ret = buffer.processed;
-	up_read(&current->mm->mmap_sem);
 	return r;
 }
 
-- 
cgit v1.2.3