From 05be18241e83d2ac6b656c8f924e74b3998c173f Mon Sep 17 00:00:00 2001
From: Jan Seiffert <kaffeemonster@googlemail.com>
Date: Sun, 29 Apr 2012 19:02:19 +0000
Subject: bpf jit: Let the powerpc jit handle negative offsets

Now the helper function from filter.c for negative offsets is exported,
it can be used it in the jit to handle negative offsets.

First modify the asm load helper functions to handle:
- know positive offsets
- know negative offsets
- any offset

then the compiler can be modified to explicitly use these helper
when appropriate.

This fixes the case of a negative X register and allows to lift
the restriction that bpf programs with negative offsets can't
be jited.

Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Jan Seiffert <kaffeemonster@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/powerpc/net/bpf_jit.h      |   8 ++-
 arch/powerpc/net/bpf_jit_64.S   | 108 +++++++++++++++++++++++++++++++++++-----
 arch/powerpc/net/bpf_jit_comp.c |  26 ++++------
 3 files changed, 111 insertions(+), 31 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index af1ab5e9a691..5c3cf2d04e41 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -48,7 +48,13 @@
 /*
  * Assembly helpers from arch/powerpc/net/bpf_jit.S:
  */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+#define DECLARE_LOAD_FUNC(func)	\
+	extern u8 func[], func##_negative_offset[], func##_positive_offset[]
+
+DECLARE_LOAD_FUNC(sk_load_word);
+DECLARE_LOAD_FUNC(sk_load_half);
+DECLARE_LOAD_FUNC(sk_load_byte);
+DECLARE_LOAD_FUNC(sk_load_byte_msh);
 
 #define FUNCTION_DESCR_SIZE	24
 
diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S
index ff4506e85cce..55ba3855a97f 100644
--- a/arch/powerpc/net/bpf_jit_64.S
+++ b/arch/powerpc/net/bpf_jit_64.S
@@ -31,14 +31,13 @@
  * then branch directly to slow_path_XXX if required.  (In fact, could
  * load a spare GPR with the address of slow_path_generic and pass size
  * as an argument, making the call site a mtlr, li and bllr.)
- *
- * Technically, the "is addr < 0" check is unnecessary & slowing down
- * the ABS path, as it's statically checked on generation.
  */
 	.globl	sk_load_word
 sk_load_word:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_word_neg
+	.globl	sk_load_word_positive_offset
+sk_load_word_positive_offset:
 	/* Are we accessing past headlen? */
 	subi	r_scratch1, r_HL, 4
 	cmpd	r_scratch1, r_addr
@@ -51,7 +50,9 @@ sk_load_word:
 	.globl	sk_load_half
 sk_load_half:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_half_neg
+	.globl	sk_load_half_positive_offset
+sk_load_half_positive_offset:
 	subi	r_scratch1, r_HL, 2
 	cmpd	r_scratch1, r_addr
 	blt	bpf_slow_path_half
@@ -61,7 +62,9 @@ sk_load_half:
 	.globl	sk_load_byte
 sk_load_byte:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_byte_neg
+	.globl	sk_load_byte_positive_offset
+sk_load_byte_positive_offset:
 	cmpd	r_HL, r_addr
 	ble	bpf_slow_path_byte
 	lbzx	r_A, r_D, r_addr
@@ -69,22 +72,20 @@ sk_load_byte:
 
 /*
  * BPF_S_LDX_B_MSH: ldxb  4*([offset]&0xf)
- * r_addr is the offset value, already known positive
+ * r_addr is the offset value
  */
 	.globl sk_load_byte_msh
 sk_load_byte_msh:
+	cmpdi	r_addr, 0
+	blt	bpf_slow_path_byte_msh_neg
+	.globl sk_load_byte_msh_positive_offset
+sk_load_byte_msh_positive_offset:
 	cmpd	r_HL, r_addr
 	ble	bpf_slow_path_byte_msh
 	lbzx	r_X, r_D, r_addr
 	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
 	blr
 
-bpf_error:
-	/* Entered with cr0 = lt */
-	li	r3, 0
-	/* Generated code will 'blt epilogue', returning 0. */
-	blr
-
 /* Call out to skb_copy_bits:
  * We'll need to back up our volatile regs first; we have
  * local variable space at r1+(BPF_PPC_STACK_BASIC).
@@ -136,3 +137,84 @@ bpf_slow_path_byte_msh:
 	lbz	r_X, BPF_PPC_STACK_BASIC+(2*8)(r1)
 	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
 	blr
+
+/* Call out to bpf_internal_load_pointer_neg_helper:
+ * We'll need to back up our volatile regs first; we have
+ * local variable space at r1+(BPF_PPC_STACK_BASIC).
+ * Allocate a new stack frame here to remain ABI-compliant in
+ * stashing LR.
+ */
+#define sk_negative_common(SIZE)				\
+	mflr	r0;						\
+	std	r0, 16(r1);					\
+	/* R3 goes in parameter space of caller's frame */	\
+	std	r_skb, (BPF_PPC_STACKFRAME+48)(r1);		\
+	std	r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1);		\
+	std	r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1);		\
+	stdu	r1, -BPF_PPC_SLOWPATH_FRAME(r1);		\
+	/* R3 = r_skb, as passed */				\
+	mr	r4, r_addr;					\
+	li	r5, SIZE;					\
+	bl	bpf_internal_load_pointer_neg_helper;		\
+	/* R3 != 0 on success */				\
+	addi	r1, r1, BPF_PPC_SLOWPATH_FRAME;			\
+	ld	r0, 16(r1);					\
+	ld	r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1);		\
+	ld	r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1);		\
+	mtlr	r0;						\
+	cmpldi	r3, 0;						\
+	beq	bpf_error_slow;	/* cr0 = EQ */			\
+	mr	r_addr, r3;					\
+	ld	r_skb, (BPF_PPC_STACKFRAME+48)(r1);		\
+	/* Great success! */
+
+bpf_slow_path_word_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_word_negative_offset
+sk_load_word_negative_offset:
+	sk_negative_common(4)
+	lwz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_half_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_half_negative_offset
+sk_load_half_negative_offset:
+	sk_negative_common(2)
+	lhz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_byte_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_byte_negative_offset
+sk_load_byte_negative_offset:
+	sk_negative_common(1)
+	lbz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_byte_msh_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_byte_msh_negative_offset
+sk_load_byte_msh_negative_offset:
+	sk_negative_common(1)
+	lbz	r_X, 0(r_addr)
+	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
+	blr
+
+bpf_error_slow:
+	/* fabricate a cr0 = lt */
+	li	r_scratch1, -1
+	cmpdi	r_scratch1, 0
+bpf_error:
+	/* Entered with cr0 = lt */
+	li	r3, 0
+	/* Generated code will 'blt epilogue', returning 0. */
+	blr
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 73619d3aeb6c..2dc8b1484845 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -127,6 +127,9 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
 	PPC_BLR();
 }
 
+#define CHOOSE_LOAD_FUNC(K, func) \
+	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
+
 /* Assemble the body code between the prologue & epilogue. */
 static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 			      struct codegen_context *ctx,
@@ -391,21 +394,16 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 
 			/*** Absolute loads from packet header/data ***/
 		case BPF_S_LD_W_ABS:
-			func = sk_load_word;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_word);
 			goto common_load;
 		case BPF_S_LD_H_ABS:
-			func = sk_load_half;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_half);
 			goto common_load;
 		case BPF_S_LD_B_ABS:
-			func = sk_load_byte;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_byte);
 		common_load:
-			/*
-			 * Load from [K].  Reference with the (negative)
-			 * SKF_NET_OFF/SKF_LL_OFF offsets is unsupported.
-			 */
+			/* Load from [K]. */
 			ctx->seen |= SEEN_DATAREF;
-			if ((int)K < 0)
-				return -ENOTSUPP;
 			PPC_LI64(r_scratch1, func);
 			PPC_MTLR(r_scratch1);
 			PPC_LI32(r_addr, K);
@@ -429,7 +427,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 		common_load_ind:
 			/*
 			 * Load from [X + K].  Negative offsets are tested for
-			 * in the helper functions, and result in a 'ret 0'.
+			 * in the helper functions.
 			 */
 			ctx->seen |= SEEN_DATAREF | SEEN_XREG;
 			PPC_LI64(r_scratch1, func);
@@ -443,13 +441,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 			break;
 
 		case BPF_S_LDX_B_MSH:
-			/*
-			 * x86 version drops packet (RET 0) when K<0, whereas
-			 * interpreter does allow K<0 (__load_pointer, special
-			 * ancillary data).  common_load returns ENOTSUPP if K<0,
-			 * so we fall back to interpreter & filter works.
-			 */
-			func = sk_load_byte_msh;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh);
 			goto common_load;
 			break;
 
-- 
cgit v1.2.3


From de6c0b02d4d7bdf2587e679a6ddbb71b7d68bb89 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 8 May 2012 20:24:08 +1000
Subject: KVM: PPC: Book3S HV: Fix refcounting of hugepages

The H_REGISTER_VPA hcall implementation in HV Power KVM needs to pin some
guest memory pages into host memory so that they can be safely accessed
from usermode.  It does this used get_user_pages_fast().  When the VPA is
unregistered, or the VCPUs are cleaned up, these pages are released using
put_page().

However, the get_user_pages() is invoked on the specific memory are of the
VPA which could lie within hugepages.  In case the pinned page is huge,
we explicitly find the head page of the compound page before calling
put_page() on it.

At least with the latest kernel, this is not correct.  put_page() already
handles finding the correct head page of a compound, and also deals with
various counts on the individual tail page which are important for
transparent huge pages.  We don't support transparent hugepages on Power,
but even so, bypassing this count maintenance can lead (when the VM ends)
to a hugepage being released back to the pool with a non-zero mapcount on
one of the tail pages.  This can then lead to a bad_page() when the page
is released from the hugepage pool.

This removes the explicit compound_head() call to correct this bug.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 22 +++++++++++++---------
 arch/powerpc/kvm/book3s_hv.c        |  2 --
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index ddc485a529f2..c3beaeef3f60 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -258,6 +258,8 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
 			    !(memslot->userspace_addr & (s - 1))) {
 				start &= ~(s - 1);
 				pgsize = s;
+				get_page(hpage);
+				put_page(page);
 				page = hpage;
 			}
 		}
@@ -281,11 +283,8 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
 	err = 0;
 
  out:
-	if (got) {
-		if (PageHuge(page))
-			page = compound_head(page);
+	if (got)
 		put_page(page);
-	}
 	return err;
 
  up_err:
@@ -678,8 +677,15 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		SetPageDirty(page);
 
  out_put:
-	if (page)
-		put_page(page);
+	if (page) {
+		/*
+		 * We drop pages[0] here, not page because page might
+		 * have been set to the head page of a compound, but
+		 * we have to drop the reference on the correct tail
+		 * page to match the get inside gup()
+		 */
+		put_page(pages[0]);
+	}
 	return ret;
 
  out_unlock:
@@ -979,6 +985,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 			pa = *physp;
 		}
 		page = pfn_to_page(pa >> PAGE_SHIFT);
+		get_page(page);
 	} else {
 		hva = gfn_to_hva_memslot(memslot, gfn);
 		npages = get_user_pages_fast(hva, 1, 1, pages);
@@ -991,8 +998,6 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 		page = compound_head(page);
 		psize <<= compound_order(page);
 	}
-	if (!kvm->arch.using_mmu_notifiers)
-		get_page(page);
 	offset = gpa & (psize - 1);
 	if (nb_ret)
 		*nb_ret = psize - offset;
@@ -1003,7 +1008,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
 {
 	struct page *page = virt_to_page(va);
 
-	page = compound_head(page);
 	put_page(page);
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 01294a5099dd..108d1f580177 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1192,8 +1192,6 @@ static void unpin_slot(struct kvm *kvm, int slot_id)
 				continue;
 			pfn = physp[j] >> PAGE_SHIFT;
 			page = pfn_to_page(pfn);
-			if (PageHuge(page))
-				page = compound_head(page);
 			SetPageDirty(page);
 			put_page(page);
 		}
-- 
cgit v1.2.3


From 7c0482e3d055e5de056d3c693b821e39205b99ae Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 10 May 2012 16:12:38 +0000
Subject: powerpc/irq: Fix another case of lazy IRQ state getting out of sync

So we have another case of paca->irq_happened getting out of
sync with the HW irq state. This can happen when a perfmon
interrupt occurs while soft disabled, as it will return to a
soft disabled but hard enabled context while leaving a stale
PACA_IRQ_HARD_DIS flag set.

This patch fixes it, and also adds a test for the condition
of those flags being out of sync in arch_local_irq_restore()
when CONFIG_TRACE_IRQFLAGS is enabled.

This helps catching those gremlins faster (and so far I
can't seem see any anymore, so that's good news).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/entry_64.S | 44 +++++++++++++++++++++++++++++-------------
 arch/powerpc/kernel/irq.c      | 13 +++++++++++++
 2 files changed, 44 insertions(+), 13 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index fc6015027a86..ef2074c3e906 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -588,23 +588,19 @@ _GLOBAL(ret_from_except_lite)
 fast_exc_return_irq:
 restore:
 	/*
-	 * This is the main kernel exit path, we first check if we
-	 * have to change our interrupt state.
+	 * This is the main kernel exit path. First we check if we
+	 * are about to re-enable interrupts
 	 */
 	ld	r5,SOFTE(r1)
 	lbz	r6,PACASOFTIRQEN(r13)
-	cmpwi	cr1,r5,0
-	cmpw	cr0,r5,r6
-	beq	cr0,4f
+	cmpwi	cr0,r5,0
+	beq	restore_irq_off
 
-	/* We do, handle disable first, which is easy */
-	bne	cr1,3f;
- 	li	r0,0
-	stb	r0,PACASOFTIRQEN(r13);
-	TRACE_DISABLE_INTS
-	b	4f
+	/* We are enabling, were we already enabled ? Yes, just return */
+	cmpwi	cr0,r6,1
+	beq	cr0,do_restore
 
-3:	/*
+	/*
 	 * We are about to soft-enable interrupts (we are hard disabled
 	 * at this point). We check if there's anything that needs to
 	 * be replayed first.
@@ -626,7 +622,7 @@ restore_no_replay:
 	/*
 	 * Final return path. BookE is handled in a different file
 	 */
-4:
+do_restore:
 #ifdef CONFIG_PPC_BOOK3E
 	b	.exception_return_book3e
 #else
@@ -699,6 +695,25 @@ fast_exception_return:
 
 #endif /* CONFIG_PPC_BOOK3E */
 
+	/*
+	 * We are returning to a context with interrupts soft disabled.
+	 *
+	 * However, we may also about to hard enable, so we need to
+	 * make sure that in this case, we also clear PACA_IRQ_HARD_DIS
+	 * or that bit can get out of sync and bad things will happen
+	 */
+restore_irq_off:
+	ld	r3,_MSR(r1)
+	lbz	r7,PACAIRQHAPPENED(r13)
+	andi.	r0,r3,MSR_EE
+	beq	1f
+	rlwinm	r7,r7,0,~PACA_IRQ_HARD_DIS
+	stb	r7,PACAIRQHAPPENED(r13)
+1:	li	r0,0
+	stb	r0,PACASOFTIRQEN(r13);
+	TRACE_DISABLE_INTS
+	b	do_restore
+
 	/*
 	 * Something did happen, check if a re-emit is needed
 	 * (this also clears paca->irq_happened)
@@ -748,6 +763,9 @@ restore_check_irq_replay:
 #endif /* CONFIG_PPC_BOOK3E */
 1:	b	.ret_from_except /* What else to do here ? */
  
+
+
+3:
 do_work:
 #ifdef CONFIG_PREEMPT
 	andi.	r0,r3,MSR_PR	/* Returning to user mode? */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index c6c6f3b7f8cd..641da9e868ce 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -229,6 +229,19 @@ notrace void arch_local_irq_restore(unsigned long en)
 	 */
 	if (unlikely(irq_happened != PACA_IRQ_HARD_DIS))
 		__hard_irq_disable();
+#ifdef CONFIG_TRACE_IRQFLAG
+	else {
+		/*
+		 * We should already be hard disabled here. We had bugs
+		 * where that wasn't the case so let's dbl check it and
+		 * warn if we are wrong. Only do that when IRQ tracing
+		 * is enabled as mfmsr() can be costly.
+		 */
+		if (WARN_ON(mfmsr() & MSR_EE))
+			__hard_irq_disable();
+	}
+#endif /* CONFIG_TRACE_IRQFLAG */
+
 	set_soft_enabled(0);
 
 	/*
-- 
cgit v1.2.3