-rw-r--r--  arch/x86/include/asm/kvm_para.h |   4
-rw-r--r--  arch/x86/kernel/kvm.c           | 201
-rw-r--r--  arch/x86/kvm/mmu/mmu.c          |   2
3 files changed, 144 insertions, 63 deletions
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 5261363adda3..118e5c2379f9 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
-void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
+void kvm_async_pf_task_wait_schedule(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
void kvm_disable_steal_time(void);
@@ -113,7 +113,7 @@ static inline void kvm_spinlock_init(void)
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
#else /* CONFIG_KVM_GUEST */
-#define kvm_async_pf_task_wait(T, I) do {} while(0)
+#define kvm_async_pf_task_wait_schedule(T) do {} while(0)
#define kvm_async_pf_task_wake(T) do {} while(0)
static inline bool kvm_para_available(void)
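
Aside (not part of the patch): the header change drops the interrupt_kernel argument because the schedule-vs-halt decision now lives entirely in arch/x86/kernel/kvm.c. A minimal sketch, with a hypothetical helper name, of how the empty stub macro keeps callers compiling when CONFIG_KVM_GUEST is disabled:

/*
 * Illustrative only, not part of the patch: with CONFIG_KVM_GUEST=n the
 * stub macro above expands to an empty statement, so a (hypothetical)
 * caller builds without any KVM guest support.
 */
static inline void example_handle_page_not_present(u32 token)
{
	kvm_async_pf_task_wait_schedule(token);	/* no-op when !CONFIG_KVM_GUEST */
}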
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5ad3fcca2309..c6a82f9f537f 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -75,7 +75,7 @@ struct kvm_task_sleep_node {
struct swait_queue_head wq;
u32 token;
int cpu;
- bool halted;
+ bool use_halt;
};
static struct kvm_task_sleep_head {
@@ -98,75 +98,145 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
return NULL;
}
-/*
- * @interrupt_kernel: Is this called from a routine which interrupts the kernel
- * (other than user space)?
- */
-void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
+static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
+ struct kvm_task_sleep_node *n)
{
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
- struct kvm_task_sleep_node n, *e;
- DECLARE_SWAITQUEUE(wait);
-
- rcu_irq_enter();
+ struct kvm_task_sleep_node *e;
raw_spin_lock(&b->lock);
e = _find_apf_task(b, token);
if (e) {
/* dummy entry exist -> wake up was delivered ahead of PF */
hlist_del(&e->link);
- kfree(e);
raw_spin_unlock(&b->lock);
+ kfree(e);
+ return false;
+ }
- rcu_irq_exit();
+ n->token = token;
+ n->cpu = smp_processor_id();
+ n->use_halt = use_halt;
+ init_swait_queue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
+ raw_spin_unlock(&b->lock);
+ return true;
+}
+
+/*
+ * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
+ * @token: Token to identify the sleep node entry
+ *
+ * Invoked from the async pagefault handling code or from the VM exit page
+ * fault handler. In both cases RCU is watching.
+ */
+void kvm_async_pf_task_wait_schedule(u32 token)
+{
+ struct kvm_task_sleep_node n;
+ DECLARE_SWAITQUEUE(wait);
+
+ lockdep_assert_irqs_disabled();
+
+ if (!kvm_async_pf_queue_task(token, false, &n))
return;
+
+ for (;;) {
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
}
+ finish_swait(&n.wq, &wait);
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
- n.token = token;
- n.cpu = smp_processor_id();
- n.halted = is_idle_task(current) ||
- (IS_ENABLED(CONFIG_PREEMPT_COUNT)
- ? preempt_count() > 1 || rcu_preempt_depth()
- : interrupt_kernel);
- init_swait_queue_head(&n.wq);
- hlist_add_head(&n.link, &b->list);
- raw_spin_unlock(&b->lock);
+/*
+ * Invoked from the async page fault handler.
+ */
+static void kvm_async_pf_task_wait_halt(u32 token)
+{
+ struct kvm_task_sleep_node n;
+
+ if (!kvm_async_pf_queue_task(token, true, &n))
+ return;
for (;;) {
- if (!n.halted)
- prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
+ /*
+ * No point in doing anything about RCU here. Any RCU read
+ * side critical section or RCU watching section can be
+ * interrupted by VMEXITs and the host is free to keep the
+ * vCPU scheduled out as long as it sees fit. This is not
+ * any different just because of the halt induced voluntary
+ * VMEXIT.
+ *
+ * Also the async page fault could have interrupted any RCU
+ * watching context, so invoking rcu_irq_exit()/enter()
+ * around this is not gaining anything.
+ */
+ native_safe_halt();
+ local_irq_disable();
+ }
+}
- rcu_irq_exit();
+/* Invoked from the async page fault handler */
+static void kvm_async_pf_task_wait(u32 token, bool usermode)
+{
+ bool can_schedule;
- if (!n.halted) {
- local_irq_enable();
- schedule();
- local_irq_disable();
- } else {
- /*
- * We cannot reschedule. So halt.
- */
- native_safe_halt();
- local_irq_disable();
- }
+ /*
+ * No need to check whether interrupts were disabled because the
+ * host will (hopefully) only inject an async page fault into
+ * interrupt enabled regions.
+ *
+ * If CONFIG_PREEMPTION is enabled then check whether the code
+ * which triggered the page fault is preemptible. This covers user
+ * mode as well because preempt_count() is obviously 0 there.
+ *
+ * The check for rcu_preempt_depth() is also required because
+ * voluntary scheduling inside a rcu read locked section is not
+ * allowed.
+ *
+ * The idle task is already covered by this because idle always
+ * has a preempt count > 0.
+ *
+ * If CONFIG_PREEMPTION is disabled only allow scheduling when
+ * coming from user mode as there is no indication whether the
+ * context which triggered the page fault could schedule or not.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ can_schedule = preempt_count() + rcu_preempt_depth() == 0;
+ else
+ can_schedule = usermode;
+ /*
+ * If the kernel context is allowed to schedule then RCU is
+ * watching because no preemptible code in the kernel is inside RCU
+ * idle state. So it can be treated like user mode. User mode is
+ * safe because the #PF entry invoked enter_from_user_mode().
+ *
+ * For the non schedulable case invoke rcu_irq_enter() for
+ * now. This will be moved out to the pagefault entry code later
+ * and only invoked when really needed.
+ */
+ if (can_schedule) {
+ kvm_async_pf_task_wait_schedule(token);
+ } else {
rcu_irq_enter();
+ kvm_async_pf_task_wait_halt(token);
+ rcu_irq_exit();
}
- if (!n.halted)
- finish_swait(&n.wq, &wait);
-
- rcu_irq_exit();
- return;
}
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
hlist_del_init(&n->link);
- if (n->halted)
+ if (n->use_halt)
smp_send_reschedule(n->cpu);
else if (swq_has_sleeper(&n->wq))
swake_up_one(&n->wq);
@@ -177,12 +247,13 @@ static void apf_task_wake_all(void)
int i;
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
- struct hlist_node *p, *next;
struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ struct kvm_task_sleep_node *n;
+ struct hlist_node *p, *next;
+
raw_spin_lock(&b->lock);
hlist_for_each_safe(p, next, &b->list) {
- struct kvm_task_sleep_node *n =
- hlist_entry(p, typeof(*n), link);
+ n = hlist_entry(p, typeof(*n), link);
if (n->cpu == smp_processor_id())
apf_task_wake_one(n);
}
@@ -223,8 +294,9 @@ again:
n->cpu = smp_processor_id();
init_swait_queue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
- } else
+ } else {
apf_task_wake_one(n);
+ }
raw_spin_unlock(&b->lock);
return;
}
@@ -246,23 +318,33 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
- /*
- * If we get a page fault right here, the pf_reason seems likely
- * to be clobbered. Bummer.
- */
- switch (kvm_read_and_reset_pf_reason()) {
+ u32 reason = kvm_read_and_reset_pf_reason();
+
+ switch (reason) {
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ case KVM_PV_REASON_PAGE_READY:
+ break;
default:
return false;
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ }
+
+ /*
+ * If the host managed to inject an async #PF into an interrupt
+ * disabled region, then die hard as this is not going to end well
+ * and the host side is seriously broken.
+ */
+ if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
+ panic("Host injected async #PF in interrupt disabled region\n");
+
+ if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
/* page is swapped out by the host. */
- kvm_async_pf_task_wait(token, !user_mode(regs));
- return true;
- case KVM_PV_REASON_PAGE_READY:
+ kvm_async_pf_task_wait(token, user_mode(regs));
+ } else {
rcu_irq_enter();
kvm_async_pf_task_wake(token);
rcu_irq_exit();
- return true;
}
+ return true;
}
NOKPROBE_SYMBOL(__kvm_handle_async_pf);
@@ -326,12 +408,12 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
- printk(KERN_INFO"KVM setup async PF for cpu %d\n",
- smp_processor_id());
+ pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
unsigned long pa;
+
/* Size alignment is implied but just to make it explicit. */
BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
__this_cpu_write(kvm_apic_eoi, 0);
@@ -352,8 +434,7 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0);
- printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
- smp_processor_id());
+ pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
}
static void kvm_pv_guest_cpu_reboot(void *unused)
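
Aside (not part of the patch): the behavioral core of the kvm.c changes is the schedule-vs-halt decision in kvm_async_pf_task_wait(). Restated as a standalone predicate, illustrative only and with a hypothetical name:

/*
 * Illustrative restatement of the policy in kvm_async_pf_task_wait() above,
 * not part of the patch: with CONFIG_PREEMPTION the faulting context may
 * schedule iff it is preemptible and not inside an RCU read-side critical
 * section; without CONFIG_PREEMPTION only a user-mode fault may schedule,
 * everything else halts until the PAGE_READY wakeup arrives.
 */
static bool example_apf_may_schedule(bool usermode)
{
	if (IS_ENABLED(CONFIG_PREEMPTION))
		return preempt_count() + rcu_preempt_depth() == 0;
	return usermode;
}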
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8071952e9cf2..dd900a648059 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4198,7 +4198,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
case KVM_PV_REASON_PAGE_NOT_PRESENT:
vcpu->arch.apf.host_apf_reason = 0;
local_irq_disable();
- kvm_async_pf_task_wait(fault_address, 0);
+ kvm_async_pf_task_wait_schedule(fault_address);
local_irq_enable();
break;
case KVM_PV_REASON_PAGE_READY:
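
Aside (not part of the patch): the mmu.c call site runs in vCPU-thread context where RCU is watching (per the comment on kvm_async_pf_task_wait_schedule() above), so the schedule variant is used directly. A sketch of the resulting call pattern, with a hypothetical wrapper name:

/*
 * Illustrative only, not part of the patch: the call pattern used by
 * kvm_handle_page_fault() above. kvm_async_pf_task_wait_schedule() expects
 * interrupts disabled on entry and re-enables them only around schedule()
 * while waiting for the PAGE_READY wakeup.
 */
static void example_host_wait_for_page(u32 fault_address)
{
	local_irq_disable();
	kvm_async_pf_task_wait_schedule(fault_address);
	local_irq_enable();
}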