From 05be18241e83d2ac6b656c8f924e74b3998c173f Mon Sep 17 00:00:00 2001 From: Jan Seiffert Date: Sun, 29 Apr 2012 19:02:19 +0000 Subject: bpf jit: Let the powerpc jit handle negative offsets Now that the helper function from filter.c for negative offsets is exported, it can be used in the jit to handle negative offsets. First modify the asm load helper functions to handle: - known positive offsets - known negative offsets - any offset then the compiler can be modified to explicitly use these helpers when appropriate. This fixes the case of a negative X register and allows lifting the restriction that bpf programs with negative offsets can't be jited. Tested-by: Benjamin Herrenschmidt Signed-off-by: Jan Seiffert Signed-off-by: David S. Miller --- arch/powerpc/net/bpf_jit.h | 8 ++- arch/powerpc/net/bpf_jit_64.S | 108 +++++++++++++++++++++++++++++++++++----- arch/powerpc/net/bpf_jit_comp.c | 26 ++++------ 3 files changed, 111 insertions(+), 31 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index af1ab5e9a691..5c3cf2d04e41 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -48,7 +48,13 @@ /* * Assembly helpers from arch/powerpc/net/bpf_jit.S: */ -extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[]; +#define DECLARE_LOAD_FUNC(func) \ + extern u8 func[], func##_negative_offset[], func##_positive_offset[] + +DECLARE_LOAD_FUNC(sk_load_word); +DECLARE_LOAD_FUNC(sk_load_half); +DECLARE_LOAD_FUNC(sk_load_byte); +DECLARE_LOAD_FUNC(sk_load_byte_msh); #define FUNCTION_DESCR_SIZE 24 diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S index ff4506e85cce..55ba3855a97f 100644 --- a/arch/powerpc/net/bpf_jit_64.S +++ b/arch/powerpc/net/bpf_jit_64.S @@ -31,14 +31,13 @@ * then branch directly to slow_path_XXX if required.
(In fact, could * load a spare GPR with the address of slow_path_generic and pass size * as an argument, making the call site a mtlr, li and bllr.) - * - * Technically, the "is addr < 0" check is unnecessary & slowing down - * the ABS path, as it's statically checked on generation. */ .globl sk_load_word sk_load_word: cmpdi r_addr, 0 - blt bpf_error + blt bpf_slow_path_word_neg + .globl sk_load_word_positive_offset +sk_load_word_positive_offset: /* Are we accessing past headlen? */ subi r_scratch1, r_HL, 4 cmpd r_scratch1, r_addr @@ -51,7 +50,9 @@ sk_load_word: .globl sk_load_half sk_load_half: cmpdi r_addr, 0 - blt bpf_error + blt bpf_slow_path_half_neg + .globl sk_load_half_positive_offset +sk_load_half_positive_offset: subi r_scratch1, r_HL, 2 cmpd r_scratch1, r_addr blt bpf_slow_path_half @@ -61,7 +62,9 @@ sk_load_half: .globl sk_load_byte sk_load_byte: cmpdi r_addr, 0 - blt bpf_error + blt bpf_slow_path_byte_neg + .globl sk_load_byte_positive_offset +sk_load_byte_positive_offset: cmpd r_HL, r_addr ble bpf_slow_path_byte lbzx r_A, r_D, r_addr @@ -69,22 +72,20 @@ sk_load_byte: /* * BPF_S_LDX_B_MSH: ldxb 4*([offset]&0xf) - * r_addr is the offset value, already known positive + * r_addr is the offset value */ .globl sk_load_byte_msh sk_load_byte_msh: + cmpdi r_addr, 0 + blt bpf_slow_path_byte_msh_neg + .globl sk_load_byte_msh_positive_offset +sk_load_byte_msh_positive_offset: cmpd r_HL, r_addr ble bpf_slow_path_byte_msh lbzx r_X, r_D, r_addr rlwinm r_X, r_X, 2, 32-4-2, 31-2 blr -bpf_error: - /* Entered with cr0 = lt */ - li r3, 0 - /* Generated code will 'blt epilogue', returning 0. */ - blr - /* Call out to skb_copy_bits: * We'll need to back up our volatile regs first; we have * local variable space at r1+(BPF_PPC_STACK_BASIC). 
@@ -136,3 +137,84 @@ bpf_slow_path_byte_msh: lbz r_X, BPF_PPC_STACK_BASIC+(2*8)(r1) rlwinm r_X, r_X, 2, 32-4-2, 31-2 blr + +/* Call out to bpf_internal_load_pointer_neg_helper: + * We'll need to back up our volatile regs first; we have + * local variable space at r1+(BPF_PPC_STACK_BASIC). + * Allocate a new stack frame here to remain ABI-compliant in + * stashing LR. + */ +#define sk_negative_common(SIZE) \ + mflr r0; \ + std r0, 16(r1); \ + /* R3 goes in parameter space of caller's frame */ \ + std r_skb, (BPF_PPC_STACKFRAME+48)(r1); \ + std r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1); \ + std r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1); \ + stdu r1, -BPF_PPC_SLOWPATH_FRAME(r1); \ + /* R3 = r_skb, as passed */ \ + mr r4, r_addr; \ + li r5, SIZE; \ + bl bpf_internal_load_pointer_neg_helper; \ + /* R3 != 0 on success */ \ + addi r1, r1, BPF_PPC_SLOWPATH_FRAME; \ + ld r0, 16(r1); \ + ld r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1); \ + ld r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1); \ + mtlr r0; \ + cmpldi r3, 0; \ + beq bpf_error_slow; /* cr0 = EQ */ \ + mr r_addr, r3; \ + ld r_skb, (BPF_PPC_STACKFRAME+48)(r1); \ + /* Great success! 
*/ + +bpf_slow_path_word_neg: + lis r_scratch1,-32 /* SKF_LL_OFF */ + cmpd r_addr, r_scratch1 /* addr < SKF_* */ + blt bpf_error /* cr0 = LT */ + .globl sk_load_word_negative_offset +sk_load_word_negative_offset: + sk_negative_common(4) + lwz r_A, 0(r_addr) + blr + +bpf_slow_path_half_neg: + lis r_scratch1,-32 /* SKF_LL_OFF */ + cmpd r_addr, r_scratch1 /* addr < SKF_* */ + blt bpf_error /* cr0 = LT */ + .globl sk_load_half_negative_offset +sk_load_half_negative_offset: + sk_negative_common(2) + lhz r_A, 0(r_addr) + blr + +bpf_slow_path_byte_neg: + lis r_scratch1,-32 /* SKF_LL_OFF */ + cmpd r_addr, r_scratch1 /* addr < SKF_* */ + blt bpf_error /* cr0 = LT */ + .globl sk_load_byte_negative_offset +sk_load_byte_negative_offset: + sk_negative_common(1) + lbz r_A, 0(r_addr) + blr + +bpf_slow_path_byte_msh_neg: + lis r_scratch1,-32 /* SKF_LL_OFF */ + cmpd r_addr, r_scratch1 /* addr < SKF_* */ + blt bpf_error /* cr0 = LT */ + .globl sk_load_byte_msh_negative_offset +sk_load_byte_msh_negative_offset: + sk_negative_common(1) + lbz r_X, 0(r_addr) + rlwinm r_X, r_X, 2, 32-4-2, 31-2 + blr + +bpf_error_slow: + /* fabricate a cr0 = lt */ + li r_scratch1, -1 + cmpdi r_scratch1, 0 +bpf_error: + /* Entered with cr0 = lt */ + li r3, 0 + /* Generated code will 'blt epilogue', returning 0. */ + blr diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 73619d3aeb6c..2dc8b1484845 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -127,6 +127,9 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) PPC_BLR(); } +#define CHOOSE_LOAD_FUNC(K, func) \ + ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) + /* Assemble the body code between the prologue & epilogue. 
*/ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, struct codegen_context *ctx, @@ -391,21 +394,16 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, /*** Absolute loads from packet header/data ***/ case BPF_S_LD_W_ABS: - func = sk_load_word; + func = CHOOSE_LOAD_FUNC(K, sk_load_word); goto common_load; case BPF_S_LD_H_ABS: - func = sk_load_half; + func = CHOOSE_LOAD_FUNC(K, sk_load_half); goto common_load; case BPF_S_LD_B_ABS: - func = sk_load_byte; + func = CHOOSE_LOAD_FUNC(K, sk_load_byte); common_load: - /* - * Load from [K]. Reference with the (negative) - * SKF_NET_OFF/SKF_LL_OFF offsets is unsupported. - */ + /* Load from [K]. */ ctx->seen |= SEEN_DATAREF; - if ((int)K < 0) - return -ENOTSUPP; PPC_LI64(r_scratch1, func); PPC_MTLR(r_scratch1); PPC_LI32(r_addr, K); @@ -429,7 +427,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, common_load_ind: /* * Load from [X + K]. Negative offsets are tested for - * in the helper functions, and result in a 'ret 0'. + * in the helper functions. */ ctx->seen |= SEEN_DATAREF | SEEN_XREG; PPC_LI64(r_scratch1, func); @@ -443,13 +441,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, break; case BPF_S_LDX_B_MSH: - /* - * x86 version drops packet (RET 0) when K<0, whereas - * interpreter does allow K<0 (__load_pointer, special - * ancillary data). common_load returns ENOTSUPP if K<0, - * so we fall back to interpreter & filter works. - */ - func = sk_load_byte_msh; + func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh); goto common_load; break; -- cgit v1.2.3 From de6c0b02d4d7bdf2587e679a6ddbb71b7d68bb89 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 8 May 2012 20:24:08 +1000 Subject: KVM: PPC: Book3S HV: Fix refcounting of hugepages The H_REGISTER_VPA hcall implementation in HV Power KVM needs to pin some guest memory pages into host memory so that they can be safely accessed from usermode. It does this using get_user_pages_fast().
When the VPA is unregistered, or the VCPUs are cleaned up, these pages are released using put_page(). However, the get_user_pages() is invoked on the specific memory area of the VPA which could lie within hugepages. In case the pinned page is huge, we explicitly find the head page of the compound page before calling put_page() on it. At least with the latest kernel, this is not correct. put_page() already handles finding the correct head page of a compound, and also deals with various counts on the individual tail page which are important for transparent huge pages. We don't support transparent hugepages on Power, but even so, bypassing this count maintenance can lead (when the VM ends) to a hugepage being released back to the pool with a non-zero mapcount on one of the tail pages. This can then lead to a bad_page() when the page is released from the hugepage pool. This removes the explicit compound_head() call to correct this bug. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras Acked-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 22 +++++++++++++--------- arch/powerpc/kvm/book3s_hv.c | 2 -- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index ddc485a529f2..c3beaeef3f60 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -258,6 +258,8 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, !(memslot->userspace_addr & (s - 1))) { start &= ~(s - 1); pgsize = s; + get_page(hpage); + put_page(page); page = hpage; } } @@ -281,11 +283,8 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, err = 0; out: - if (got) { - if (PageHuge(page)) - page = compound_head(page); + if (got) put_page(page); - } return err; up_err: @@ -678,8 +677,15 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
SetPageDirty(page); out_put: - if (page) - put_page(page); + if (page) { + /* + * We drop pages[0] here, not page because page might + * have been set to the head page of a compound, but + * we have to drop the reference on the correct tail + * page to match the get inside gup() + */ + put_page(pages[0]); + } return ret; out_unlock: @@ -979,6 +985,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, pa = *physp; } page = pfn_to_page(pa >> PAGE_SHIFT); + get_page(page); } else { hva = gfn_to_hva_memslot(memslot, gfn); npages = get_user_pages_fast(hva, 1, 1, pages); @@ -991,8 +998,6 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, page = compound_head(page); psize <<= compound_order(page); } - if (!kvm->arch.using_mmu_notifiers) - get_page(page); offset = gpa & (psize - 1); if (nb_ret) *nb_ret = psize - offset; @@ -1003,7 +1008,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) { struct page *page = virt_to_page(va); - page = compound_head(page); put_page(page); } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 01294a5099dd..108d1f580177 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1192,8 +1192,6 @@ static void unpin_slot(struct kvm *kvm, int slot_id) continue; pfn = physp[j] >> PAGE_SHIFT; page = pfn_to_page(pfn); - if (PageHuge(page)) - page = compound_head(page); SetPageDirty(page); put_page(page); } -- cgit v1.2.3 From 7c0482e3d055e5de056d3c693b821e39205b99ae Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 10 May 2012 16:12:38 +0000 Subject: powerpc/irq: Fix another case of lazy IRQ state getting out of sync So we have another case of paca->irq_happened getting out of sync with the HW irq state. This can happen when a perfmon interrupt occurs while soft disabled, as it will return to a soft disabled but hard enabled context while leaving a stale PACA_IRQ_HARD_DIS flag set. 
This patch fixes it, and also adds a test for the condition of those flags being out of sync in arch_local_irq_restore() when CONFIG_TRACE_IRQFLAGS is enabled. This helps catch those gremlins faster (and so far I can't seem to see any anymore, so that's good news). Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/entry_64.S | 44 +++++++++++++++++++++++++++++------------- arch/powerpc/kernel/irq.c | 13 +++++++++++++ 2 files changed, 44 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index fc6015027a86..ef2074c3e906 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -588,23 +588,19 @@ _GLOBAL(ret_from_except_lite) fast_exc_return_irq: restore: /* - * This is the main kernel exit path, we first check if we - * have to change our interrupt state. + * This is the main kernel exit path. First we check if we + * are about to re-enable interrupts */ ld r5,SOFTE(r1) lbz r6,PACASOFTIRQEN(r13) - cmpwi cr1,r5,0 - cmpw cr0,r5,r6 - beq cr0,4f + cmpwi cr0,r5,0 + beq restore_irq_off - /* We do, handle disable first, which is easy */ - bne cr1,3f; - li r0,0 - stb r0,PACASOFTIRQEN(r13); - TRACE_DISABLE_INTS - b 4f + /* We are enabling, were we already enabled ? Yes, just return */ + cmpwi cr0,r6,1 + beq cr0,do_restore -3: /* + /* * We are about to soft-enable interrupts (we are hard disabled * at this point). We check if there's anything that needs to * be replayed first. @@ -626,7 +622,7 @@ restore_no_replay: /* * Final return path. BookE is handled in a different file */ -4: +do_restore: #ifdef CONFIG_PPC_BOOK3E b .exception_return_book3e #else @@ -699,6 +695,25 @@ fast_exception_return: #endif /* CONFIG_PPC_BOOK3E */ + /* + * We are returning to a context with interrupts soft disabled.
+ * + * However, we may also be about to hard enable, so we need to + * make sure that in this case, we also clear PACA_IRQ_HARD_DIS + * or that bit can get out of sync and bad things will happen + */ +restore_irq_off: + ld r3,_MSR(r1) + lbz r7,PACAIRQHAPPENED(r13) + andi. r0,r3,MSR_EE + beq 1f + rlwinm r7,r7,0,~PACA_IRQ_HARD_DIS + stb r7,PACAIRQHAPPENED(r13) +1: li r0,0 + stb r0,PACASOFTIRQEN(r13); + TRACE_DISABLE_INTS + b do_restore + /* * Something did happen, check if a re-emit is needed * (this also clears paca->irq_happened) @@ -748,6 +763,9 @@ restore_check_irq_replay: #endif /* CONFIG_PPC_BOOK3E */ 1: b .ret_from_except /* What else to do here ? */ + + +3: do_work: #ifdef CONFIG_PREEMPT andi. r0,r3,MSR_PR /* Returning to user mode? */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index c6c6f3b7f8cd..641da9e868ce 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -229,6 +229,19 @@ notrace void arch_local_irq_restore(unsigned long en) */ if (unlikely(irq_happened != PACA_IRQ_HARD_DIS)) __hard_irq_disable(); +#ifdef CONFIG_TRACE_IRQFLAGS + else { + /* + * We should already be hard disabled here. We had bugs + * where that wasn't the case so let's dbl check it and + * warn if we are wrong. Only do that when IRQ tracing + * is enabled as mfmsr() can be costly. + */ + if (WARN_ON(mfmsr() & MSR_EE)) + __hard_irq_disable(); + } +#endif /* CONFIG_TRACE_IRQFLAGS */ + set_soft_enabled(0); /* -- cgit v1.2.3