-rw-r--r--  arch/x86/include/asm/kvm_para.h |   4
-rw-r--r--  arch/x86/kernel/kvm.c           | 201
-rw-r--r--  arch/x86/kvm/mmu/mmu.c          |   2
3 files changed, 144 insertions, 63 deletions
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 5261363adda3..118e5c2379f9 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
-void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
+void kvm_async_pf_task_wait_schedule(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
void kvm_disable_steal_time(void);
@@ -113,7 +113,7 @@ static inline void kvm_spinlock_init(void)
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
#else /* CONFIG_KVM_GUEST */
-#define kvm_async_pf_task_wait(T, I) do {} while(0)
+#define kvm_async_pf_task_wait_schedule(T) do {} while(0)
#define kvm_async_pf_task_wake(T) do {} while(0)
static inline bool kvm_para_available(void)
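
Aside (not part of the patch): the header change drops the interrupt_kernel argument because the schedule-vs-halt decision now lives entirely in arch/x86/kernel/kvm.c. A minimal sketch, with a hypothetical helper name, of how the empty stub macro keeps callers compiling when CONFIG_KVM_GUEST is disabled:

/*
 * Illustrative only, not part of the patch: with CONFIG_KVM_GUEST=n the
 * stub macro above expands to an empty statement, so a (hypothetical)
 * caller builds without any KVM guest support.
 */
static inline void example_handle_page_not_present(u32 token)
{
	kvm_async_pf_task_wait_schedule(token);	/* no-op when !CONFIG_KVM_GUEST */
}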
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5ad3fcca2309..c6a82f9f537f 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -75,7 +75,7 @@ struct kvm_task_sleep_node {
struct swait_queue_head wq;
u32 token;
int cpu;
- bool halted;
+ bool use_halt;
};
static struct kvm_task_sleep_head {
@@ -98,75 +98,145 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
return NULL;
}
-/*
- * @interrupt_kernel: Is this called from a routine which interrupts the kernel
- * (other than user space)?
- */
-void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
+static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
+ struct kvm_task_sleep_node *n)
{
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
- struct kvm_task_sleep_node n, *e;
- DECLARE_SWAITQUEUE(wait);
-
- rcu_irq_enter();
+ struct kvm_task_sleep_node *e;
raw_spin_lock(&b->lock);
e = _find_apf_task(b, token);
if (e) {
/* dummy entry exist -> wake up was delivered ahead of PF */
hlist_del(&e->link);
- kfree(e);
raw_spin_unlock(&b->lock);
+ kfree(e);
+ return false;
+ }
- rcu_irq_exit();
+ n->token = token;
+ n->cpu = smp_processor_id();
+ n->use_halt = use_halt;
+ init_swait_queue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
+ raw_spin_unlock(&b->lock);
+ return true;
+}
+
+/*
+ * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
+ * @token: Token to identify the sleep node entry
+ *
+ * Invoked from the async pagefault handling code or from the VM exit page
+ * fault handler. In both cases RCU is watching.
+ */
+void kvm_async_pf_task_wait_schedule(u32 token)
+{
+ struct kvm_task_sleep_node n;
+ DECLARE_SWAITQUEUE(wait);
+
+ lockdep_assert_irqs_disabled();
+
+ if (!kvm_async_pf_queue_task(token, false, &n))
return;
+
+ for (;;) {
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
}
+ finish_swait(&n.wq, &wait);
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
- n.token = token;
- n.cpu = smp_processor_id();
- n.halted = is_idle_task(current) ||
- (IS_ENABLED(CONFIG_PREEMPT_COUNT)
- ? preempt_count() > 1 || rcu_preempt_depth()
- : interrupt_kernel);
- init_swait_queue_head(&n.wq);
- hlist_add_head(&n.link, &b->list);
- raw_spin_unlock(&b->lock);
+/*
+ * Invoked from the async page fault handler.
+ */
+static void kvm_async_pf_task_wait_halt(u32 token)
+{
+ struct kvm_task_sleep_node n;
+
+ if (!kvm_async_pf_queue_task(token, true, &n))
+ return;
for (;;) {
- if (!n.halted)
- prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
+ /*
+ * No point in doing anything about RCU here. Any RCU read
+ * side critical section or RCU watching section can be
+ * interrupted by VMEXITs and the host is free to keep the
+ * vCPU scheduled out as long as it sees fit. This is not
+ * any different just because of the halt induced voluntary
+ * VMEXIT.
+ *
+ * Also the async page fault could have interrupted any RCU
+ * watching context, so invoking rcu_irq_exit()/enter()
+ * around this is not gaining anything.
+ */
+ native_safe_halt();
+ local_irq_disable();
+ }
+}
- rcu_irq_exit();
+/* Invoked from the async page fault handler */
+static void kvm_async_pf_task_wait(u32 token, bool usermode)
+{
+ bool can_schedule;
- if (!n.halted) {
- local_irq_enable();
- schedule();
- local_irq_disable();
- } else {
- /*
- * We cannot reschedule. So halt.
- */
- native_safe_halt();
- local_irq_disable();
- }
+ /*
+ * No need to check whether interrupts were disabled because the
+ * host will (hopefully) only inject an async page fault into
+ * interrupt enabled regions.
+ *
+ * If CONFIG_PREEMPTION is enabled then check whether the code
+ * which triggered the page fault is preemptible. This covers user
+ * mode as well because preempt_count() is obviously 0 there.
+ *
+ * The check for rcu_preempt_depth() is also required because
+ * voluntary scheduling inside a rcu read locked section is not
+ * allowed.
+ *
+ * The idle task is already covered by this because idle always
+ * has a preempt count > 0.
+ *
+ * If CONFIG_PREEMPTION is disabled only allow scheduling when
+ * coming from user mode as there is no indication whether the
+ * context which triggered the page fault could schedule or not.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ can_schedule = preempt_count() + rcu_preempt_depth() == 0;
+ else
+ can_schedule = usermode;
+ /*
+ * If the kernel context is allowed to schedule then RCU is
+ * watching because no preemptible code in the kernel is inside RCU
+ * idle state. So it can be treated like user mode. User mode is
+ * safe because the #PF entry invoked enter_from_user_mode().
+ *
+ * For the non schedulable case invoke rcu_irq_enter() for
+ * now. This will be moved out to the pagefault entry code later
+ * and only invoked when really needed.
+ */
+ if (can_schedule) {
+ kvm_async_pf_task_wait_schedule(token);
+ } else {
rcu_irq_enter();
+ kvm_async_pf_task_wait_halt(token);
+ rcu_irq_exit();
}
- if (!n.halted)
- finish_swait(&n.wq, &wait);
-
- rcu_irq_exit();
- return;
}
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
hlist_del_init(&n->link);
- if (n->halted)
+ if (n->use_halt)
smp_send_reschedule(n->cpu);
else if (swq_has_sleeper(&n->wq))
swake_up_one(&n->wq);
@@ -177,12 +247,13 @@ static void apf_task_wake_all(void)
int i;
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
- struct hlist_node *p, *next;
struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ struct kvm_task_sleep_node *n;
+ struct hlist_node *p, *next;
+
raw_spin_lock(&b->lock);
hlist_for_each_safe(p, next, &b->list) {
- struct kvm_task_sleep_node *n =
- hlist_entry(p, typeof(*n), link);
+ n = hlist_entry(p, typeof(*n), link);
if (n->cpu == smp_processor_id())
apf_task_wake_one(n);
}
@@ -223,8 +294,9 @@ again:
n->cpu = smp_processor_id();
init_swait_queue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
- } else
+ } else {
apf_task_wake_one(n);
+ }
raw_spin_unlock(&b->lock);
return;
}
@@ -246,23 +318,33 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
- /*
- * If we get a page fault right here, the pf_reason seems likely
- * to be clobbered. Bummer.
- */
- switch (kvm_read_and_reset_pf_reason()) {
+ u32 reason = kvm_read_and_reset_pf_reason();
+
+ switch (reason) {
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ case KVM_PV_REASON_PAGE_READY:
+ break;
default:
return false;
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ }
+
+ /*
+ * If the host managed to inject an async #PF into an interrupt
+ * disabled region, then die hard as this is not going to end well
+ * and the host side is seriously broken.
+ */
+ if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
+ panic("Host injected async #PF in interrupt disabled region\n");
+
+ if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
/* page is swapped out by the host. */
- kvm_async_pf_task_wait(token, !user_mode(regs));
- return true;
- case KVM_PV_REASON_PAGE_READY:
+ kvm_async_pf_task_wait(token, user_mode(regs));
+ } else {
rcu_irq_enter();
kvm_async_pf_task_wake(token);
rcu_irq_exit();
- return true;
}
+ return true;
}
NOKPROBE_SYMBOL(__kvm_handle_async_pf);
@@ -326,12 +408,12 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
- printk(KERN_INFO"KVM setup async PF for cpu %d\n",
- smp_processor_id());
+ pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
unsigned long pa;
+
/* Size alignment is implied but just to make it explicit. */
BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
__this_cpu_write(kvm_apic_eoi, 0);
@@ -352,8 +434,7 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0);
- printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
- smp_processor_id());
+ pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
}
static void kvm_pv_guest_cpu_reboot(void *unused)
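
Aside (not part of the patch): the behavioral core of the kvm.c changes is the schedule-vs-halt decision in kvm_async_pf_task_wait(). Restated as a standalone predicate, illustrative only and with a hypothetical name:

/*
 * Illustrative restatement of the policy in kvm_async_pf_task_wait() above,
 * not part of the patch: with CONFIG_PREEMPTION the faulting context may
 * schedule iff it is preemptible and not inside an RCU read-side critical
 * section; without CONFIG_PREEMPTION only a user-mode fault may schedule,
 * everything else halts until the PAGE_READY wakeup arrives.
 */
static bool example_apf_may_schedule(bool usermode)
{
	if (IS_ENABLED(CONFIG_PREEMPTION))
		return preempt_count() + rcu_preempt_depth() == 0;
	return usermode;
}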
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8071952e9cf2..dd900a648059 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4198,7 +4198,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
case KVM_PV_REASON_PAGE_NOT_PRESENT:
vcpu->arch.apf.host_apf_reason = 0;
local_irq_disable();
- kvm_async_pf_task_wait(fault_address, 0);
+ kvm_async_pf_task_wait_schedule(fault_address);
local_irq_enable();
break;
case KVM_PV_REASON_PAGE_READY:
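
Aside (not part of the patch): the mmu.c call site runs in vCPU-thread context where RCU is watching (per the comment on kvm_async_pf_task_wait_schedule() above), so the schedule variant is used directly. A sketch of the resulting call pattern, with a hypothetical wrapper name:

/*
 * Illustrative only, not part of the patch: the call pattern used by
 * kvm_handle_page_fault() above. kvm_async_pf_task_wait_schedule() expects
 * interrupts disabled on entry and re-enables them only around schedule()
 * while waiting for the PAGE_READY wakeup.
 */
static void example_host_wait_for_page(u32 fault_address)
{
	local_irq_disable();
	kvm_async_pf_task_wait_schedule(fault_address);
	local_irq_enable();
}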