diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-16 13:00:24 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-16 13:00:24 -0800 |
commit | 974aa5630b318938273d7efe7a2cf031c7b927db (patch) | |
tree | b79803c07b9c16d87058ce69f80ebe173cdfd838 /arch/powerpc | |
parent | 441692aafc1731087bbaf657a8b6059d95c2a6df (diff) | |
parent | a6014f1ab7088dc02b58991cfb6b32a34afdbf12 (diff) |
Merge tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Radim Krčmář:
"First batch of KVM changes for 4.15
Common:
- Python 3 support in kvm_stat
- Accounting of slabs to kmemcg
ARM:
- Optimized arch timer handling for KVM/ARM
- Improvements to the VGIC ITS code and introduction of an ITS reset
ioctl
- Unification of the 32-bit fault injection logic
- More exact external abort matching logic
PPC:
- Support for running hashed page table (HPT) MMU mode on a host that
is using the radix MMU mode; single threaded mode on POWER 9 is
added as a pre-requisite
- Resolution of merge conflicts with the last second 4.14 HPT fixes
- Fixes and cleanups
s390:
- Some initial preparation patches for exitless interrupts and crypto
- New capability for AIS migration
- Fixes
x86:
- Improved emulation of LAPIC timer mode changes, MCi_STATUS MSRs,
and after-reset state
- Refined dependencies for VMX features
- Fixes for nested SMI injection
- A lot of cleanups"
* tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (89 commits)
KVM: s390: provide a capability for AIS state migration
KVM: s390: clear_io_irq() requests are not expected for adapter interrupts
KVM: s390: abstract conversion between isc and enum irq_types
KVM: s390: vsie: use common code functions for pinning
KVM: s390: SIE considerations for AP Queue virtualization
KVM: s390: document memory ordering for kvm_s390_vcpu_wakeup
KVM: PPC: Book3S HV: Cosmetic post-merge cleanups
KVM: arm/arm64: fix the incompatible matching for external abort
KVM: arm/arm64: Unify 32bit fault injection
KVM: arm/arm64: vgic-its: Implement KVM_DEV_ARM_ITS_CTRL_RESET
KVM: arm/arm64: Document KVM_DEV_ARM_ITS_CTRL_RESET
KVM: arm/arm64: vgic-its: Free caches when GITS_BASER Valid bit is cleared
KVM: arm/arm64: vgic-its: New helper functions to free the caches
KVM: arm/arm64: vgic-its: Remove kvm_its_unmap_device
arm/arm64: KVM: Load the timer state when enabling the timer
KVM: arm/arm64: Rework kvm_timer_should_fire
KVM: arm/arm64: Get rid of kvm_timer_flush_hwstate
KVM: arm/arm64: Avoid phys timer emulation in vcpu entry/exit
KVM: arm/arm64: Move phys_timer_emulate function
KVM: arm/arm64: Use kvm_arm_timer_set/get_reg for guest register traps
...
Diffstat (limited to 'arch/powerpc')
-rw-r--r-- | arch/powerpc/include/asm/kvm_book3s.h | 3 | ||||
-rw-r--r-- | arch/powerpc/include/asm/kvm_book3s_64.h | 140 | ||||
-rw-r--r-- | arch/powerpc/include/asm/kvm_book3s_asm.h | 13 | ||||
-rw-r--r-- | arch/powerpc/include/asm/kvm_host.h | 6 | ||||
-rw-r--r-- | arch/powerpc/include/asm/kvm_ppc.h | 3 | ||||
-rw-r--r-- | arch/powerpc/kernel/asm-offsets.c | 3 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_64_mmu_hv.c | 128 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_64_mmu_radix.c | 51 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_64_slb.S | 2 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv.c | 347 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_builtin.c | 117 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rm_mmu.c | 65 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rmhandlers.S | 197 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_pr.c | 16 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_pr_papr.c | 2 | ||||
-rw-r--r-- | arch/powerpc/kvm/e500_mmu_host.c | 2 | ||||
-rw-r--r-- | arch/powerpc/kvm/powerpc.c | 3 |
17 files changed, 800 insertions, 298 deletions
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index b8d5b8e35244..9a667007bff8 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -216,7 +216,8 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, bool *writable); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); -extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize); +extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned long psize); extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index); void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep, diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index d55c7f881ce7..735cfa35298a 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -20,6 +20,8 @@ #ifndef __ASM_KVM_BOOK3S_64_H__ #define __ASM_KVM_BOOK3S_64_H__ +#include <linux/string.h> +#include <asm/bitops.h> #include <asm/book3s/64/mmu-hash.h> /* Power architecture requires HPT is at least 256kiB, at most 64TiB */ @@ -107,18 +109,96 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) hpte[0] = cpu_to_be64(hpte_v); } +/* + * These functions encode knowledge of the POWER7/8/9 hardware + * interpretations of the HPTE LP (large page size) field. + */ +static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) +{ + unsigned int lphi; + + if (!(h & HPTE_V_LARGE)) + return 12; /* 4kB */ + lphi = (l >> 16) & 0xf; + switch ((l >> 12) & 0xf) { + case 0: + return !lphi ? 24 : -1; /* 16MB */ + break; + case 1: + return 16; /* 64kB */ + break; + case 3: + return !lphi ? 34 : -1; /* 16GB */ + break; + case 7: + return (16 << 8) + 12; /* 64kB in 4kB */ + break; + case 8: + if (!lphi) + return (24 << 8) + 16; /* 16MB in 64kkB */ + if (lphi == 3) + return (24 << 8) + 12; /* 16MB in 4kB */ + break; + } + return -1; +} + +static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l) +{ + return kvmppc_hpte_page_shifts(h, l) & 0xff; +} + +static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l) +{ + int tmp = kvmppc_hpte_page_shifts(h, l); + + if (tmp >= 0x100) + tmp >>= 8; + return tmp; +} + +static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r) +{ + return 1ul << kvmppc_hpte_actual_page_shift(v, r); +} + +static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift) +{ + switch (base_shift) { + case 12: + switch (actual_shift) { + case 12: + return 0; + case 16: + return 7; + case 24: + return 0x38; + } + break; + case 16: + switch (actual_shift) { + case 16: + return 1; + case 24: + return 8; + } + break; + case 24: + return 0; + } + return -1; +} + static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, unsigned long pte_index) { - int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K; - unsigned int penc; + int a_pgshift, b_pgshift; unsigned long rb = 0, va_low, sllp; - unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1); - if (v & HPTE_V_LARGE) { - i = hpte_page_sizes[lp]; - b_psize = i & 0xf; - a_psize = i >> 4; + b_pgshift = a_pgshift = kvmppc_hpte_page_shifts(v, r); + if (a_pgshift >= 0x100) { + b_pgshift &= 0xff; + a_pgshift >>= 8; } /* @@ -152,37 +232,33 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, va_low ^= v >> (SID_SHIFT_1T - 16); va_low &= 0x7ff; - switch (b_psize) { - case MMU_PAGE_4K: - sllp = get_sllp_encoding(a_psize); - rb |= sllp << 5; /* AP field */ + if (b_pgshift == 12) { + if (a_pgshift > 12) { + sllp = (a_pgshift == 16) ? 5 : 4; + rb |= sllp << 5; /* AP field */ + } rb |= (va_low & 0x7ff) << 12; /* remaining 11 bits of AVA */ - break; - default: - { + } else { int aval_shift; /* * remaining bits of AVA/LP fields * Also contain the rr bits of LP */ - rb |= (va_low << mmu_psize_defs[b_psize].shift) & 0x7ff000; + rb |= (va_low << b_pgshift) & 0x7ff000; /* * Now clear not needed LP bits based on actual psize */ - rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1); + rb &= ~((1ul << a_pgshift) - 1); /* * AVAL field 58..77 - base_page_shift bits of va * we have space for 58..64 bits, Missing bits should * be zero filled. +1 is to take care of L bit shift */ - aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1; + aval_shift = 64 - (77 - b_pgshift) + 1; rb |= ((va_low << aval_shift) & 0xfe); rb |= 1; /* L field */ - penc = mmu_psize_defs[b_psize].penc[a_psize]; - rb |= penc << 12; /* LP field */ - break; - } + rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */ } rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */ return rb; @@ -370,6 +446,28 @@ static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt) return (1UL << (hpt->order - 7)) - 1; } +/* Set bits in a dirty bitmap, which is in LE format */ +static inline void set_dirty_bits(unsigned long *map, unsigned long i, + unsigned long npages) +{ + + if (npages >= 8) + memset((char *)map + i / 8, 0xff, npages / 8); + else + for (; npages; ++i, --npages) + __set_bit_le(i, map); +} + +static inline void set_dirty_bits_atomic(unsigned long *map, unsigned long i, + unsigned long npages) +{ + if (npages >= 8) + memset((char *)map + i / 8, 0xff, npages / 8); + else + for (; npages; ++i, --npages) + set_bit_le(i, map); +} + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 7cea76f11c26..ab386af2904f 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -82,6 +82,16 @@ struct kvm_split_mode { u8 do_nap; u8 napped[MAX_SMT_THREADS]; struct kvmppc_vcore *vc[MAX_SUBCORES]; + /* Bits for changing lpcr on P9 */ + unsigned long lpcr_req; + unsigned long lpidr_req; + unsigned long host_lpcr; + u32 do_set; + u32 do_restore; + union { + u32 allphases; + u8 phase[4]; + } lpcr_sync; }; /* @@ -107,7 +117,8 @@ struct kvmppc_host_state { u8 hwthread_req; u8 hwthread_state; u8 host_ipi; - u8 ptid; + u8 ptid; /* thread number within subcore when split */ + u8 tid; /* thread number within whole core */ struct kvm_vcpu *kvm_vcpu; struct kvmppc_vcore *kvm_vcore; void __iomem *xics_phys; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e372ed871c51..3aa5b577cd60 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -235,10 +235,7 @@ struct revmap_entry { */ #define KVMPPC_RMAP_LOCK_BIT 63 #define KVMPPC_RMAP_RC_SHIFT 32 -#define KVMPPC_RMAP_CHG_SHIFT 48 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) -#define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT) -#define KVMPPC_RMAP_CHG_ORDER (0x3ful << KVMPPC_RMAP_CHG_SHIFT) #define KVMPPC_RMAP_PRESENT 0x100000000ul #define KVMPPC_RMAP_INDEX 0xfffffffful @@ -276,7 +273,7 @@ struct kvm_arch { int tlbie_lock; unsigned long lpcr; unsigned long vrma_slb_v; - int hpte_setup_done; + int mmu_ready; atomic_t vcpus_running; u32 online_vcores; atomic_t hpte_mod_interest; @@ -284,6 +281,7 @@ struct kvm_arch { cpumask_t cpu_in_guest; u8 radix; u8 fwnmi_enabled; + bool threads_indep; pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ba5fadd6f3c9..96753f3aac6d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -168,6 +168,7 @@ extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); extern void kvmppc_free_hpt(struct kvm_hpt_info *info); +extern void kvmppc_rmap_reset(struct kvm *kvm); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, @@ -177,6 +178,8 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, struct iommu_group *grp); extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, struct iommu_group *grp); +extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm); +extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 9aace433491a..6b958414b4e0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -642,6 +642,7 @@ int main(void) HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); + HSTATE_FIELD(HSTATE_TID, tid); HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]); HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]); HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]); @@ -667,6 +668,8 @@ int main(void) OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar); OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap); OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped); + OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set); + OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 59247af5fd45..235319c2574e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -73,8 +73,6 @@ struct kvm_resize_hpt { struct kvm_hpt_info hpt; }; -static void kvmppc_rmap_reset(struct kvm *kvm); - int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) { unsigned long hpt = 0; @@ -106,7 +104,6 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) /* Allocate reverse map array */ rev = vmalloc(sizeof(struct revmap_entry) * npte); if (!rev) { - pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); if (cma) kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else @@ -137,19 +134,22 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) long err = -EBUSY; struct kvm_hpt_info info; - if (kvm_is_radix(kvm)) - return -EINVAL; - mutex_lock(&kvm->lock); - if (kvm->arch.hpte_setup_done) { - kvm->arch.hpte_setup_done = 0; - /* order hpte_setup_done vs. vcpus_running */ + if (kvm->arch.mmu_ready) { + kvm->arch.mmu_ready = 0; + /* order mmu_ready vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; goto out; } } + if (kvm_is_radix(kvm)) { + err = kvmppc_switch_mmu_to_hpt(kvm); + if (err) + goto out; + } + if (kvm->arch.hpt.order == order) { /* We already have a suitable HPT */ @@ -183,6 +183,7 @@ out: void kvmppc_free_hpt(struct kvm_hpt_info *info) { vfree(info->rev); + info->rev = NULL; if (info->cma) kvm_free_hpt_cma(virt_to_page(info->virt), 1 << (info->order - PAGE_SHIFT)); @@ -334,7 +335,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, { unsigned long ra_mask; - ra_mask = hpte_page_size(v, r) - 1; + ra_mask = kvmppc_actual_pgsz(v, r) - 1; return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); } @@ -350,6 +351,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, int index; int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); + if (kvm_is_radix(vcpu->kvm)) + return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite); + /* Get SLB entry */ if (virtmode) { slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); @@ -505,7 +509,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, mmio_update = atomic64_read(&kvm->arch.mmio_update); if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { r = vcpu->arch.pgfault_cache->rpte; - psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); + psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0], + r); gpa_base = r & HPTE_R_RPN & ~(psize - 1); gfn_base = gpa_base >> PAGE_SHIFT; gpa = gpa_base | (ea & (psize - 1)); @@ -534,7 +539,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; /* Translate the logical address and get the page */ - psize = hpte_page_size(hpte[0], r); + psize = kvmppc_actual_pgsz(hpte[0], r); gpa_base = r & HPTE_R_RPN & ~(psize - 1); gfn_base = gpa_base >> PAGE_SHIFT; gpa = gpa_base | (ea & (psize - 1)); @@ -650,10 +655,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* * If the HPT is being resized, don't update the HPTE, * instead let the guest retry after the resize operation is complete. - * The synchronization for hpte_setup_done test vs. set is provided + * The synchronization for mmu_ready test vs. set is provided * by the HPTE lock. */ - if (!kvm->arch.hpte_setup_done) + if (!kvm->arch.mmu_ready) goto out_unlock; if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || @@ -720,7 +725,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; } -static void kvmppc_rmap_reset(struct kvm *kvm) +void kvmppc_rmap_reset(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -786,6 +791,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, /* Must be called with both HPTE and rmap locked */ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, + struct kvm_memory_slot *memslot, unsigned long *rmapp, unsigned long gfn) { __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); @@ -808,7 +814,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, /* Now check and modify the HPTE */ ptel = rev[i].guest_rpte; - psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel); if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); @@ -817,8 +823,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, /* Harvest R and C */ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmapp, psize); + if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, psize); if (rcbits & ~rev[i].guest_rpte) { rev[i].guest_rpte = ptel | rcbits; note_hpte_modification(kvm, &rev[i]); @@ -856,7 +862,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, continue; } - kvmppc_unmap_hpte(kvm, i, rmapp, gfn); + kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn); unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } @@ -1039,14 +1045,6 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) retry: lock_rmap(rmapp); - if (*rmapp & KVMPPC_RMAP_CHANGED) { - long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER) - >> KVMPPC_RMAP_CHG_SHIFT; - *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER); - npages_dirty = 1; - if (change_order > PAGE_SHIFT) - npages_dirty = 1ul << (change_order - PAGE_SHIFT); - } if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { unlock_rmap(rmapp); return npages_dirty; @@ -1102,7 +1100,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) rev[i].guest_rpte |= HPTE_R_C; note_hpte_modification(kvm, &rev[i]); } - n = hpte_page_size(v, r); + n = kvmppc_actual_pgsz(v, r); n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; if (n > npages_dirty) npages_dirty = n; @@ -1138,7 +1136,7 @@ void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map) { - unsigned long i, j; + unsigned long i; unsigned long *rmapp; preempt_disable(); @@ -1150,9 +1148,8 @@ long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, * since we always put huge-page HPTEs in the rmap chain * corresponding to their page base address. */ - if (npages && map) - for (j = i; npages; ++j, --npages) - __set_bit_le(j, map); + if (npages) + set_dirty_bits(map, i, npages); ++rmapp; } preempt_enable(); @@ -1196,7 +1193,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, struct page *page = virt_to_page(va); struct kvm_memory_slot *memslot; unsigned long gfn; - unsigned long *rmap; int srcu_idx; put_page(page); @@ -1204,20 +1200,12 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, if (!dirty) return; - /* We need to mark this page dirty in the rmap chain */ + /* We need to mark this page dirty in the memslot dirty_bitmap, if any */ gfn = gpa >> PAGE_SHIFT; srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); - if (memslot) { - if (!kvm_is_radix(kvm)) { - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - unlock_rmap(rmap); - } else if (memslot->dirty_bitmap) { - mark_page_dirty(kvm, gfn); - } - } + if (memslot && memslot->dirty_bitmap) + set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap); srcu_read_unlock(&kvm->srcu, srcu_idx); } @@ -1277,7 +1265,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, guest_rpte = rev->guest_rpte; ret = -EIO; - apsize = hpte_page_size(vpte, guest_rpte); + apsize = kvmppc_actual_pgsz(vpte, guest_rpte); if (!apsize) goto out; @@ -1292,7 +1280,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; lock_rmap(rmapp); - kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); + kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn); unlock_rmap(rmapp); } @@ -1465,7 +1453,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, struct kvm_resize_hpt *resize; int ret; - if (flags != 0) + if (flags != 0 || kvm_is_radix(kvm)) return -EINVAL; if (shift && ((shift < 18) || (shift > 46))) @@ -1531,7 +1519,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, struct kvm_resize_hpt *resize; long ret; - if (flags != 0) + if (flags != 0 || kvm_is_radix(kvm)) return -EINVAL; if (shift && ((shift < 18) || (shift > 46))) @@ -1543,15 +1531,15 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, /* This shouldn't be possible */ ret = -EIO; - if (WARN_ON(!kvm->arch.hpte_setup_done)) + if (WARN_ON(!kvm->arch.mmu_ready)) goto out_no_hpt; /* Stop VCPUs from running while we mess with the HPT */ - kvm->arch.hpte_setup_done = 0; + kvm->arch.mmu_ready = 0; smp_mb(); /* Boot all CPUs out of the guest so they re-read - * hpte_setup_done */ + * mmu_ready */ on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); ret = -ENXIO; @@ -1574,7 +1562,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, out: /* Let VCPUs run again */ - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; smp_mb(); out_no_hpt: resize_hpt_release(kvm, resize); @@ -1717,6 +1705,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, if (!access_ok(VERIFY_WRITE, buf, count)) return -EFAULT; + if (kvm_is_radix(kvm)) + return 0; first_pass = ctx->first_pass; flags = ctx->flags; @@ -1810,20 +1800,22 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, unsigned long tmp[2]; ssize_t nb; long int err, ret; - int hpte_setup; + int mmu_ready; if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; + if (kvm_is_radix(kvm)) + return -EINVAL; /* lock out vcpus from running while we're doing this */ mutex_lock(&kvm->lock); - hpte_setup = kvm->arch.hpte_setup_done; - if (hpte_setup) { - kvm->arch.hpte_setup_done = 0; /* temporarily */ - /* order hpte_setup_done vs. vcpus_running */ + mmu_ready = kvm->arch.mmu_ready; + if (mmu_ready) { + kvm->arch.mmu_ready = 0; /* temporarily */ + /* order mmu_ready vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; mutex_unlock(&kvm->lock); return -EBUSY; } @@ -1876,7 +1868,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, "r=%lx\n", ret, i, v, r); goto out; } - if (!hpte_setup && is_vrma_hpte(v)) { + if (!mmu_ready && is_vrma_hpte(v)) { unsigned long psize = hpte_base_page_size(v, r); unsigned long senc = slb_pgsize_encoding(psize); unsigned long lpcr; @@ -1885,7 +1877,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, (VRMA_VSID << SLB_VSID_SHIFT_1T); lpcr = senc << (LPCR_VRMASD_SH - 4); kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - hpte_setup = 1; + mmu_ready = 1; } ++i; hptp += 2; @@ -1901,9 +1893,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, } out: - /* Order HPTE updates vs. hpte_setup_done */ + /* Order HPTE updates vs. mmu_ready */ smp_wmb(); - kvm->arch.hpte_setup_done = hpte_setup; + kvm->arch.mmu_ready = mmu_ready; mutex_unlock(&kvm->lock); if (err) @@ -2012,6 +2004,10 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, struct kvm *kvm; __be64 *hptp; + kvm = p->kvm; + if (kvm_is_radix(kvm)) + return 0; + ret = mutex_lock_interruptible(&p->mutex); if (ret) return ret; @@ -2034,7 +2030,6 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, } } - kvm = p->kvm; i = p->hpt_index; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); @@ -2109,10 +2104,7 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ - if (kvm_is_radix(vcpu->kvm)) - mmu->xlate = kvmppc_mmu_radix_xlate; - else - mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; + mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index c5d7435455f1..58618f644c56 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -474,26 +474,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; } -static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn, unsigned int order) -{ - unsigned long i, limit; - unsigned long *dp; - - if (!memslot->dirty_bitmap) - return; - limit = 1ul << order; - if (limit < BITS_PER_LONG) { - for (i = 0; i < limit; ++i) - mark_page_dirty(kvm, gfn + i); - return; - } - dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn); - limit /= BITS_PER_LONG; - for (i = 0; i < limit; ++i) - *dp++ = ~0ul; -} - /* Called with kvm->lock held */ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) @@ -508,12 +488,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift); - if (old & _PAGE_DIRTY) { - if (!shift) - mark_page_dirty(kvm, gfn); - else - mark_pages_dirty(kvm, memslot, - gfn, shift - PAGE_SHIFT); + if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { + unsigned long npages = 1; + if (shift) + npages = 1ul << (shift - PAGE_SHIFT); + kvmppc_update_dirty_map(memslot, gfn, npages); } } return 0; @@ -579,20 +558,8 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map) { unsigned long i, j; - unsigned long n, *p; int npages; - /* - * Radix accumulates dirty bits in the first half of the - * memslot's dirty_bitmap area, for when pages are paged - * out or modified by the host directly. Pick up these - * bits and add them to the map. - */ - n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long); - p = memslot->dirty_bitmap; - for (i = 0; i < n; ++i) - map[i] |= xchg(&p[i], 0); - for (i = 0; i < memslot->npages; i = j) { npages = kvm_radix_test_clear_dirty(kvm, memslot, i); @@ -604,9 +571,10 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, * real address, if npages > 1 we can skip to i + npages. */ j = i + 1; - if (npages) - for (j = i; npages; ++j, --npages) - __set_bit_le(j, map); + if (npages) { + set_dirty_bits(map, i, npages); + i = j + npages; + } } return 0; } @@ -694,6 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm) pgd_clear(pgd); } pgd_free(kvm->mm, kvm->arch.pgtable); + kvm->arch.pgtable = NULL; } static void pte_ctor(void *addr) diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 3589c4e3d49b..688722acd692 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -113,7 +113,7 @@ slb_do_enter: /* Remove all SLB entries that are in use. */ - li r0, r0 + li r0, 0 slbmte r0, r0 slbia diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 40e5857c4b1c..79ea3d9269db 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -19,6 +19,7 @@ */ #include <linux/kvm_host.h> +#include <linux/kernel.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/preempt.h> @@ -98,6 +99,10 @@ static int target_smt_mode; module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); +static bool indep_threads_mode = true; +module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); + #ifdef CONFIG_KVM_XICS static struct kernel_param_ops module_param_ops = { .set = param_set_int, @@ -115,6 +120,7 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +static void kvmppc_setup_partition_table(struct kvm *kvm); static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, int *ip) @@ -1734,9 +1740,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, * MMU mode (radix or HPT), unfortunately, but since we only support * HPT guests on a HPT host so far, that isn't an impediment yet. */ -static int threads_per_vcore(void) +static int threads_per_vcore(struct kvm *kvm) { - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (kvm->arch.threads_indep) return 1; return threads_per_subcore; } @@ -1774,7 +1780,7 @@ static struct debugfs_timings_element { {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, }; -#define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) +#define N_TIMINGS (ARRAY_SIZE(timings)) struct debugfs_timings_state { struct kvm_vcpu *vcpu; @@ -2228,11 +2234,10 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) kvmppc_ipi_thread(cpu); } -static void kvmppc_wait_for_nap(void) +static void kvmppc_wait_for_nap(int n_threads) { int cpu = smp_processor_id(); int i, loops; - int n_threads = threads_per_vcore(); if (n_threads <= 1) return; @@ -2319,7 +2324,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_PREEMPT; vc->pcpu = smp_processor_id(); - if (vc->num_threads < threads_per_vcore()) { + if (vc->num_threads < threads_per_vcore(vc->kvm)) { spin_lock(&lp->lock); list_add_tail(&vc->preempt_list, &lp->list); spin_unlock(&lp->lock); @@ -2357,7 +2362,7 @@ struct core_info { /* * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 - * respectively in 2-way micro-threading (split-core) mode. + * respectively in 2-way micro-threading (split-core) mode on POWER8. */ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; @@ -2373,7 +2378,14 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) static bool subcore_config_ok(int n_subcores, int n_threads) { - /* Can only dynamically split if unsplit to begin with */ + /* + * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core + * mode, with one thread per subcore. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return n_subcores <= 4 && n_threads == 1; + + /* On POWER8, can only dynamically split if unsplit to begin with */ if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) return false; if (n_subcores > MAX_SUBCORES) @@ -2404,6 +2416,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; + /* POWER9 currently requires all threads to be in the same MMU mode */ + if (cpu_has_feature(CPU_FTR_ARCH_300) && + kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) + return false; + if (n_threads < cip->max_subcore_threads) n_threads = cip->max_subcore_threads; if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) @@ -2632,6 +2649,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int target_threads; int controlled_threads; int trap; + bool is_power8; + bool hpt_on_radix; /* * Remove from the list any threads that have a signal pending @@ -2654,15 +2673,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * the number of threads per subcore, except on POWER9, * where it's 1 because the threads are (mostly) independent. */ - controlled_threads = threads_per_vcore(); + controlled_threads = threads_per_vcore(vc->kvm); /* * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. + * On POWER9, we need to be not in independent-threads mode if + * this is a HPT guest on a radix host. */ - if ((controlled_threads > 1) && - ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { + hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); + if (((controlled_threads > 1) && + ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || + (hpt_on_radix && vc->kvm->arch.threads_indep)) { for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; kvmppc_remove_runnable(vc, vcpu); @@ -2699,14 +2722,13 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * Hard-disable interrupts, and check resched flag and signals. * If we need to reschedule or deliver a signal, clean up * and return without going into the guest(s). - * If the hpte_setup_done flag has been cleared, don't go into the + * If the mmu_ready flag has been cleared, don't go into the * guest because that means a HPT resize operation is in progress. */ local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info) || - (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) { + recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ @@ -2728,32 +2750,51 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cmd_bit = stat_bit = 0; split = core_info.n_subcores; sip = NULL; - if (split > 1) { - /* threads_per_subcore must be MAX_SMT_THREADS (8) here */ - if (split == 2 && (dynamic_mt_modes & 2)) { - cmd_bit = HID0_POWER8_1TO2LPAR; - stat_bit = HID0_POWER8_2LPARMODE; - } else { - split = 4; - cmd_bit = HID0_POWER8_1TO4LPAR; - stat_bit = HID0_POWER8_4LPARMODE; - } - subcore_size = MAX_SMT_THREADS / split; + is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S) + && !cpu_has_feature(CPU_FTR_ARCH_300); + + if (split > 1 || hpt_on_radix) { sip = &split_info; memset(&split_info, 0, sizeof(split_info)); - split_info.rpr = mfspr(SPRN_RPR); - split_info.pmmar = mfspr(SPRN_PMMAR); - split_info.ldbar = mfspr(SPRN_LDBAR); - split_info.subcore_size = subcore_size; for (sub = 0; sub < core_info.n_subcores; ++sub) split_info.vc[sub] = core_info.vc[sub]; + + if (is_power8) { + if (split == 2 && (dynamic_mt_modes & 2)) { + cmd_bit = HID0_POWER8_1TO2LPAR; + stat_bit = HID0_POWER8_2LPARMODE; + } else { + split = 4; + cmd_bit = HID0_POWER8_1TO4LPAR; + stat_bit = HID0_POWER8_4LPARMODE; + } + subcore_size = MAX_SMT_THREADS / split; + split_info.rpr = mfspr(SPRN_RPR); + split_info.pmmar = mfspr(SPRN_PMMAR); + split_info.ldbar = mfspr(SPRN_LDBAR); + split_info.subcore_size = subcore_size; + } else { + split_info.subcore_size = 1; + if (hpt_on_radix) { + /* Use the split_info for LPCR/LPIDR changes */ + split_info.lpcr_req = vc->lpcr; + split_info.lpidr_req = vc->kvm->arch.lpid; + split_info.host_lpcr = vc->kvm->arch.host_lpcr; + split_info.do_set = 1; + } + } + /* order writes to split_info before kvm_split_mode pointer */ smp_wmb(); } - for (thr = 0; thr < controlled_threads; ++thr) + + for (thr = 0; thr < controlled_threads; ++thr) { + paca[pcpu + thr].kvm_hstate.tid = thr; + paca[pcpu + thr].kvm_hstate.napping = 0; paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; + } - /* Initiate micro-threading (split-core) if required */ + /* Initiate micro-threading (split-core) on POWER8 if required */ if (cmd_bit) { unsigned long hid0 = mfspr(SPRN_HID0); @@ -2772,7 +2813,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { - thr = subcore_thread_map[sub]; + thr = is_power8 ? subcore_thread_map[sub] : sub; thr0_done = false; active |= 1 << thr; pvc = core_info.vc[sub]; @@ -2799,18 +2840,20 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * the vcore pointer in the PACA of the secondaries. */ smp_mb(); - if (cmd_bit) - split_info.do_nap = 1; /* ask secondaries to nap when done */ /* * When doing micro-threading, poke the inactive threads as well. * This gets them to the nap instruction after kvm_do_nap, * which reduces the time taken to unsplit later. + * For POWER9 HPT guest on radix host, we need all the secondary + * threads woken up so they can do the LPCR/LPIDR change. */ - if (split > 1) + if (cmd_bit || hpt_on_radix) { + split_info.do_nap = 1; /* ask secondaries to nap when done */ for (thr = 1; thr < threads_per_subcore; ++thr) if (!(active & (1 << thr))) kvmppc_ipi_thread(pcpu + thr); + } vc->vcore_state = VCORE_RUNNING; preempt_disable(); @@ -2844,10 +2887,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_EXITING; /* wait for secondary threads to finish writing their state to memory */ - kvmppc_wait_for_nap(); + kvmppc_wait_for_nap(controlled_threads); /* Return to whole-core mode if we split the core earlier */ - if (split > 1) { + if (cmd_bit) { unsigned long hid0 = mfspr(SPRN_HID0); unsigned long loops = 0; @@ -2863,8 +2906,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cpu_relax(); ++loops; } - split_info.do_nap = 0; + } else if (hpt_on_radix) { + /* Wait for all threads to have seen final sync */ + for (thr = 1; thr < controlled_threads; ++thr) { + while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) { + HMT_low(); + barrier(); + } + HMT_medium(); + } } + split_info.do_nap = 0; kvmppc_set_host_core(pcpu); @@ -3073,6 +3125,25 @@ out: trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } +static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) +{ + int r = 0; + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->lock); + if (!kvm->arch.mmu_ready) { + if (!kvm_is_radix(kvm)) + r = kvmppc_hv_setup_htab_rma(vcpu); + if (!r) { + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvmppc_setup_partition_table(kvm); + kvm->arch.mmu_ready = 1; + } + } + mutex_unlock(&kvm->lock); + return r; +} + static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int n_ceded, i, r; @@ -3129,15 +3200,15 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { - /* See if the HPT and VRMA are ready to go */ - if (!kvm_is_radix(vcpu->kvm) && - !vcpu->kvm->arch.hpte_setup_done) { + /* See if the MMU is ready to go */ + if (!vcpu->kvm->arch.mmu_ready) { spin_unlock(&vc->lock); - r = kvmppc_hv_setup_htab_rma(vcpu); + r = kvmhv_setup_mmu(vcpu); spin_lock(&vc->lock); if (r) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason = 0; + kvm_run->fail_entry. + hardware_entry_failure_reason = 0; vcpu->arch.ret = r; break; } @@ -3219,6 +3290,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) unsigned long ebb_regs[3] = {}; /* shut up GCC */ unsigned long user_tar = 0; unsigned int user_vrsave; + struct kvm *kvm; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -3256,8 +3328,9 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINTR; } - atomic_inc(&vcpu->kvm->arch.vcpus_running); - /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ + kvm = vcpu->kvm; + atomic_inc(&kvm->arch.vcpus_running); + /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */ smp_mb(); flush_all_to_thread(current); @@ -3285,10 +3358,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) trace_kvm_hcall_exit(vcpu, r); kvmppc_core_prepare_to_enter(vcpu); } else if (r == RESUME_PAGE_FAULT) { - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + srcu_idx = srcu_read_lock(&kvm->srcu); r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + srcu_read_unlock(&kvm->srcu, srcu_idx); } else if (r == RESUME_PASSTHROUGH) { if (WARN_ON(xive_enabled())) r = H_SUCCESS; @@ -3308,27 +3381,26 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) mtspr(SPRN_VRSAVE, user_vrsave); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; - atomic_dec(&vcpu->kvm->arch.vcpus_running); + atomic_dec(&kvm->arch.vcpus_running); return r; } static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, - int linux_psize) + int shift, int sllp) { - struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; - - if (!def->shift) - return; - (*sps)->page_shift = def->shift; - (*sps)->slb_enc = def->sllp; - (*sps)->enc[0].page_shift = def->shift; - (*sps)->enc[0].pte_enc = def->penc[linux_psize]; + (*sps)->page_shift = shift; + (*sps)->slb_enc = sllp; + (*sps)->enc[0].page_shift = shift; + (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift); /* - * Add 16MB MPSS support if host supports it + * Add 16MB MPSS support (may get filtered out by userspace) */ - if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { - (*sps)->enc[1].page_shift = 24; - (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; + if (shift != 24) { + int penc = kvmppc_pgsize_lp_encoding(shift, 24); + if (penc != -1) { + (*sps)->enc[1].page_shift = 24; + (*sps)->enc[1].pte_enc = penc; + } } (*sps)++; } @@ -3339,13 +3411,6 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, struct kvm_ppc_one_seg_page_size *sps; /* - * Since we don't yet support HPT guests on a radix host, - * return an error if the host uses radix. - */ - if (radix_enabled()) - return -EINVAL; - - /* * POWER7, POWER8 and POWER9 all support 32 storage keys for data. * POWER7 doesn't support keys for instruction accesses, * POWER8 and POWER9 do. @@ -3353,16 +3418,15 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, info->data_keys = 32; info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0; - info->flags = KVM_PPC_PAGE_SIZES_REAL; - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) - info->flags |= KVM_PPC_1T_SEGMENTS; - info->slb_size = mmu_slb_size; + /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */ + info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS; + info->slb_size = 32; /* We only support these sizes for now, and no muti-size segments */ sps = &info->sps[0]; - kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K); - kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K); - kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M); + kvmppc_add_seg_page_size(&sps, 12, 0); + kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01); + kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L); return 0; } @@ -3377,7 +3441,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, struct kvm_memory_slot *memslot; int i, r; unsigned long n; - unsigned long *buf; + unsigned long *buf, *p; struct kvm_vcpu *vcpu; mutex_lock(&kvm->slots_lock); @@ -3393,8 +3457,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, goto out; /* - * Use second half of bitmap area because radix accumulates - * bits in the first half. + * Use second half of bitmap area because both HPT and radix + * accumulate bits in the first half. */ n = kvm_dirty_bitmap_bytes(memslot); buf = memslot->dirty_bitmap + n / sizeof(long); @@ -3407,6 +3471,16 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (r) goto out; + /* + * We accumulate dirty bits in the first half of the + * memslot's dirty_bitmap area, for when pages are paged + * out or modified by the host directly. Pick up these + * bits and add them to the map. + */ + p = memslot->dirty_bitmap; + for (i = 0; i < n / sizeof(long); ++i) + buf[i] |= xchg(&p[i], 0); + /* Harvest dirty bits from VPA and DTL updates */ /* Note: we never modify the SLB shadow buffer areas */ kvm_for_each_vcpu(i, vcpu, kvm) { @@ -3438,15 +3512,6 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, unsigned long npages) { - /* - * For now, if radix_enabled() then we only support radix guests, - * and in that case we don't need the rmap array. - */ - if (radix_enabled()) { - slot->arch.rmap = NULL; - return 0; - } - slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) return -ENOMEM; @@ -3467,8 +3532,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, const struct kvm_memory_slot *new) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; /* * If we are making a new memslot, it might make @@ -3478,18 +3541,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, */ if (npages) atomic64_inc(&kvm->arch.mmio_update); - - if (npages && old->npages && !kvm_is_radix(kvm)) { - /* - * If modifying a memslot, reset all the rmap dirty bits. - * If this is a new memslot, we don't need to do anything - * since the rmap array starts out as all zeroes, - * i.e. no pages are dirty. - */ - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, mem->slot); - kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL); - } } /* @@ -3545,6 +3596,10 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } +/* + * Set up HPT (hashed page table) and RMA (real-mode area). + * Must be called with kvm->lock held. + */ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; @@ -3556,10 +3611,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) unsigned long psize, porder; int srcu_idx; - mutex_lock(&kvm->lock); - if (kvm->arch.hpte_setup_done) - goto out; /* another vcpu beat us to it */ - /* Allocate hashed page table (if not done already) and reset it */ if (!kvm->arch.hpt.virt) { int order = KVM_DEFAULT_HPT_ORDER; @@ -3618,18 +3669,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* the -4 is to account for senc values starting at 0x10 */ lpcr = senc << (LPCR_VRMASD_SH - 4); kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - } else { - kvmppc_setup_partition_table(kvm); } - /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */ + /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */ smp_wmb(); - kvm->arch.hpte_setup_done = 1; err = 0; out_srcu: srcu_read_unlock(&kvm->srcu, srcu_idx); out: - mutex_unlock(&kvm->lock); return err; up_out: @@ -3637,6 +3684,34 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } +/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) +{ + kvmppc_free_radix(kvm); + kvmppc_update_lpcr(kvm, LPCR_VPM1, + LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvmppc_rmap_reset(kvm); + kvm->arch.radix = 0; + kvm->arch.process_table = 0; + return 0; +} + +/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +int kvmppc_switch_mmu_to_radix(struct kvm *kvm) +{ + int err; + + err = kvmppc_init_vm_radix(kvm); + if (err) + return err; + + kvmppc_free_hpt(&kvm->arch.hpt); + kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, + LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvm->arch.radix = 1; + return 0; +} + #ifdef CONFIG_KVM_XICS /* * Allocate a per-core structure for managing state about which cores are @@ -3780,10 +3855,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) } /* - * For now, if the host uses radix, the guest must be radix. + * If the host uses radix, the guest starts out as radix. */ if (radix_enabled()) { kvm->arch.radix = 1; + kvm->arch.mmu_ready = 1; lpcr &= ~LPCR_VPM1; lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; ret = kvmppc_init_vm_radix(kvm); @@ -3803,7 +3879,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. */ - if (kvm_is_radix(kvm)) + if (radix_enabled()) kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ else if (cpu_has_feature(CPU_FTR_ARCH_300)) kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ @@ -3815,10 +3891,12 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. - * On POWER9, we only need to do this for HPT guests on a radix - * host, which is not yet supported. + * On POWER9, we only need to do this if the "indep_threads_mode" + * module parameter has been set to N. */ - if (!cpu_has_feature(CPU_FTR_ARCH_300)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.threads_indep = indep_threads_mode; + if (!kvm->arch.threads_indep) kvm_hv_vm_activated(); /* @@ -3858,7 +3936,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { debugfs_remove_recursive(kvm->arch.debugfs_dir); - if (!cpu_has_feature(CPU_FTR_ARCH_300)) + if (!kvm->arch.threads_indep) kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); @@ -4193,6 +4271,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) { unsigned long lpcr; int radix; + int err; /* If not on a POWER9, reject it */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) @@ -4202,12 +4281,8 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) return -EINVAL; - /* We can't change a guest to/from radix yet */ - radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); - if (radix != kvm_is_radix(kvm)) - return -EINVAL; - /* GR (guest radix) bit in process_table field must match */ + radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); if (!!(cfg->process_table & PATB_GR) != radix) return -EINVAL; @@ -4215,15 +4290,40 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if ((cfg->process_table & PRTS_MASK) > 24) return -EINVAL; + /* We can change a guest to/from radix now, if the host is radix */ + if (radix && !radix_enabled()) + return -EINVAL; + mutex_lock(&kvm->lock); + if (radix != kvm_is_radix(kvm)) { + if (kvm->arch.mmu_ready) { + kvm->arch.mmu_ready = 0; + /* order mmu_ready vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.mmu_ready = 1; + err = -EBUSY; + goto out_unlock; + } + } + if (radix) + err = kvmppc_switch_mmu_to_radix(kvm); + else + err = kvmppc_switch_mmu_to_hpt(kvm); + if (err) + goto out_unlock; + } + kvm->arch.process_table = cfg->process_table; kvmppc_setup_partition_table(kvm); lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); - mutex_unlock(&kvm->lock); + err = 0; - return 0; + out_unlock: + mutex_unlock(&kvm->lock); + return err; } static struct kvmppc_ops kvm_ops_hv = { @@ -4365,4 +4465,3 @@ module_exit(kvmppc_book3s_exit_hv); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(KVM_MINOR); MODULE_ALIAS("devname:kvm"); - diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 90644db9d38e..49a2c7825e04 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap) struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; int ptid = local_paca->kvm_hstate.ptid; struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode; - int me, ee, i; + int me, ee, i, t; + int cpu0; /* Set our bit in the threads-exiting-guest map in the 0xff00 bits of vcore->entry_exit_map */ @@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap) if ((ee >> 8) == 0) kvmhv_interrupt_vcore(vc, ee); } + + /* + * On POWER9 when running a HPT guest on a radix host (sip != NULL), + * we have to interrupt inactive CPU threads to get them to + * restore the host LPCR value. + */ + if (sip->lpcr_req) { + if (cmpxchg(&sip->do_restore, 0, 1) == 0) { + vc = local_paca->kvm_hstate.kvm_vcore; + cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid; + for (t = 1; t < threads_per_core; ++t) { + if (sip->napped[t]) + kvmhv_rm_send_ipi(cpu0 + t); + } + } + } } struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; @@ -529,6 +546,8 @@ static inline bool is_rm(void) unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_xirr(vcpu); @@ -541,6 +560,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; vcpu->arch.gpr[5] = get_tb(); if (xive_enabled()) { if (is_rm()) @@ -554,6 +575,8 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_ipoll(vcpu, server); @@ -567,6 +590,8 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_ipi(vcpu, server, mfrr); @@ -579,6 +604,8 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_cppr(vcpu, cppr); @@ -591,6 +618,8 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_eoi(vcpu, xirr); @@ -601,3 +630,89 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return xics_rm_h_eoi(vcpu, xirr); } #endif /* CONFIG_KVM_XICS */ + +void kvmppc_bad_interrupt(struct pt_regs *regs) +{ + die("Bad interrupt in KVM entry/exit code", regs, SIGABRT); + panic("Bad KVM trap"); +} + +/* + * Functions used to switch LPCR HR and UPRT bits on all threads + * when entering and exiting HPT guests on a radix host. + */ + +#define PHASE_REALMODE 1 /* in real mode */ +#define PHASE_SET_LPCR 2 /* have set LPCR */ +#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */ +#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */ + +#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p)) + +static void wait_for_sync(struct kvm_split_mode *sip, int phase) +{ + int thr = local_paca->kvm_hstate.tid; + + sip->lpcr_sync.phase[thr] |= phase; + phase = ALL(phase); + while ((sip->lpcr_sync.allphases & phase) != phase) { + HMT_low(); + barrier(); + } + HMT_medium(); +} + +void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip) +{ + unsigned long rb, set; + + /* wait for every other thread to get to real mode */ + wait_for_sync(sip, PHASE_REALMODE); + + /* Set LPCR and LPIDR */ + mtspr(SPRN_LPCR, sip->lpcr_req); + mtspr(SPRN_LPID, sip->lpidr_req); + isync(); + + /* Invalidate the TLB on thread 0 */ + if (local_paca->kvm_hstate.tid == 0) { + sip->do_set = 0; + asm volatile("ptesync" : : : "memory"); + for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) { + rb = TLBIEL_INVAL_SET_LPID + + (set << TLBIEL_INVAL_SET_SHIFT); + asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : : + "r" (rb), "r" (0)); + } + asm volatile("ptesync" : : : "memory"); + } + + /* indicate that we have done so and wait for others */ + wait_for_sync(sip, PHASE_SET_LPCR); + /* order read of sip->lpcr_sync.allphases vs. sip->do_set */ + smp_rmb(); +} + +/* + * Called when a thread that has been in the guest needs + * to reload the host LPCR value - but only on POWER9 when + * running a HPT guest on a radix host. + */ +void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) +{ + /* we're out of the guest... */ + wait_for_sync(sip, PHASE_OUT_OF_GUEST); + + mtspr(SPRN_LPID, 0); + mtspr(SPRN_LPCR, sip->host_lpcr); + isync(); + + if (local_paca->kvm_hstate.tid == 0) { + sip->do_restore = 0; + smp_wmb(); /* order store of do_restore vs. phase */ + } + + wait_for_sync(sip, PHASE_RESET_LPCR); + smp_mb(); + local_paca->kvm_hstate.kvm_split_mode = NULL; +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 4efe364f1188..26c11f678fbf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -107,30 +107,50 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -/* Update the changed page order field of an rmap entry */ -void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize) +/* Update the dirty bitmap of a memslot */ +void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned long psize) { - unsigned long order; + unsigned long npages; - if (!psize) + if (!psize || !memslot->dirty_bitmap) return; - order = ilog2(psize); - order <<= KVMPPC_RMAP_CHG_SHIFT; - if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER)) - *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order; + npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE; + gfn -= memslot->base_gfn; + set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages); +} +EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map); + +static void kvmppc_set_dirty_from_hpte(struct kvm *kvm, + unsigned long hpte_v, unsigned long hpte_gr) +{ + struct kvm_memory_slot *memslot; + unsigned long gfn; + unsigned long psize; + + psize = kvmppc_actual_pgsz(hpte_v, hpte_gr); + gfn = hpte_rpn(hpte_gr, psize); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (memslot && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, psize); } -EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change); /* Returns a pointer to the revmap entry for the page mapped by a HPTE */ static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v, - unsigned long hpte_gr) + unsigned long hpte_gr, + struct kvm_memory_slot **memslotp, + unsigned long *gfnp) { struct kvm_memory_slot *memslot; unsigned long *rmap; unsigned long gfn; - gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr)); + gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr)); memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (memslotp) + *memslotp = memslot; + if (gfnp) + *gfnp = gfn; if (!memslot) return NULL; @@ -147,10 +167,12 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unsigned long ptel, head; unsigned long *rmap; unsigned long rcbits; + struct kvm_memory_slot *memslot; + unsigned long gfn; rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); ptel = rev->guest_rpte |= rcbits; - rmap = revmap_for_hpte(kvm, hpte_v, ptel); + rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn); if (!rmap) return; lock_rmap(rmap); @@ -169,7 +191,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, } *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r)); + kvmppc_update_dirty_map(memslot, gfn, + kvmppc_actual_pgsz(hpte_v, hpte_r)); unlock_rmap(rmap); } @@ -193,7 +216,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - psize = hpte_page_size(pteh, ptel); + psize = kvmppc_actual_pgsz(pteh, ptel); if (!psize) return H_PARAMETER; writing = hpte_is_writable(ptel); @@ -797,7 +820,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, gr |= r & (HPTE_R_R | HPTE_R_C); if (r & HPTE_R_R) { kvmppc_clear_ref_hpte(kvm, hpte, pte_index); - rmap = revmap_for_hpte(kvm, v, gr); + rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL); if (rmap) { lock_rmap(rmap); *rmap |= KVMPPC_RMAP_REFERENCED; @@ -819,7 +842,6 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, __be64 *hpte; unsigned long v, r, gr; struct revmap_entry *rev; - unsigned long *rmap; long ret = H_NOT_FOUND; if (kvm_is_radix(kvm)) @@ -848,16 +870,9 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, r = be64_to_cpu(hpte[1]); gr |= r & (HPTE_R_R | HPTE_R_C); if (r & HPTE_R_C) { - unsigned long psize = hpte_page_size(v, r); hpte[1] = cpu_to_be64(r & ~HPTE_R_C); eieio(); - rmap = revmap_for_hpte(kvm, v, gr); - if (rmap) { - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - kvmppc_update_rmap_change(rmap, psize); - unlock_rmap(rmap); - } + kvmppc_set_dirty_from_hpte(kvm, v, gr); } } vcpu->arch.gpr[4] = gr; @@ -1014,7 +1029,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, * Check the HPTE again, including base page size */ if ((v & valid) && (v & mask) == val && - hpte_base_page_size(v, r) == (1ul << pshift)) + kvmppc_hpte_base_page_shift(v, r) == pshift) /* Return with the HPTE still locked */ return (hash << 3) + (i >> 1); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 68bf0f14a962..2659844784b8 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -31,6 +31,7 @@ #include <asm/tm.h> #include <asm/opal.h> #include <asm/xive-regs.h> +#include <asm/thread_info.h> /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ @@ -81,6 +82,19 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) RFI kvmppc_call_hv_entry: +BEGIN_FTR_SECTION + /* On P9, do LPCR setting, if necessary */ + ld r3, HSTATE_SPLIT_MODE(r13) + cmpdi r3, 0 + beq 46f + lwz r4, KVM_SPLIT_DO_SET(r3) + cmpwi r4, 0 + beq 46f + bl kvmhv_p9_set_lpcr + nop +46: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r4, HSTATE_KVM_VCPU(r13) bl kvmppc_hv_entry @@ -387,6 +401,7 @@ kvm_secondary_got_guest: ld r6, HSTATE_SPLIT_MODE(r13) cmpdi r6, 0 beq 63f +BEGIN_FTR_SECTION ld r0, KVM_SPLIT_RPR(r6) mtspr SPRN_RPR, r0 ld r0, KVM_SPLIT_PMMAR(r6) @@ -394,6 +409,15 @@ kvm_secondary_got_guest: ld r0, KVM_SPLIT_LDBAR(r6) mtspr SPRN_LDBAR, r0 isync +FTR_SECTION_ELSE + /* On P9 we use the split_info for coordinating LPCR changes */ + lwz r4, KVM_SPLIT_DO_SET(r6) + cmpwi r4, 0 + beq 63f + mr r3, r6 + bl kvmhv_p9_set_lpcr + nop +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 63: /* Order load of vcpu after load of vcore */ lwsync @@ -464,6 +488,12 @@ kvm_no_guest: ld r3, HSTATE_SPLIT_MODE(r13) cmpdi r3, 0 beq kvm_no_guest + lwz r0, KVM_SPLIT_DO_SET(r3) + cmpwi r0, 0 + bne kvmhv_do_set + lwz r0, KVM_SPLIT_DO_RESTORE(r3) + cmpwi r0, 0 + bne kvmhv_do_restore lbz r0, KVM_SPLIT_DO_NAP(r3) cmpwi r0, 0 beq kvm_no_guest @@ -476,6 +506,19 @@ kvm_no_guest: stb r0, HSTATE_HWTHREAD_STATE(r13) b kvm_no_guest +kvmhv_do_set: + /* Set LPCR, LPIDR etc. on P9 */ + HMT_MEDIUM + bl kvmhv_p9_set_lpcr + nop + b kvm_no_guest + +kvmhv_do_restore: + HMT_MEDIUM + bl kvmhv_p9_restore_lpcr + nop + b kvm_no_guest + /* * Here the primary thread is trying to return the core to * whole-core mode, so we need to nap. @@ -513,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Set kvm_split_mode.napped[tid] = 1 */ ld r3, HSTATE_SPLIT_MODE(r13) li r0, 1 - lhz r4, PACAPACAINDEX(r13) - clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */ + lbz r4, HSTATE_TID(r13) addi r4, r4, KVM_SPLIT_NAPPED stbx r0, r3, r4 /* Check the do_nap flag again after setting napped[] */ @@ -1911,10 +1953,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 19: lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 -16: ld r8,KVM_HOST_LPCR(r4) +16: +BEGIN_FTR_SECTION + /* On POWER9 with HPT-on-radix we need to wait for all other threads */ + ld r3, HSTATE_SPLIT_MODE(r13) + cmpdi r3, 0 + beq 47f + lwz r8, KVM_SPLIT_DO_RESTORE(r3) + cmpwi r8, 0 + beq 47f + stw r12, STACK_SLOT_TRAP(r1) + bl kvmhv_p9_restore_lpcr + nop + lwz r12, STACK_SLOT_TRAP(r1) + b 48f +47: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r8,KVM_HOST_LPCR(r4) mtspr SPRN_LPCR,r8 isync - +48: /* load host SLB entries */ BEGIN_MMU_FTR_SECTION b 0f @@ -3133,10 +3191,139 @@ kvmppc_restore_tm: /* * We come here if we get any exception or interrupt while we are * executing host real mode code while in guest MMU context. - * For now just spin, but we should do something better. + * r12 is (CR << 32) | vector + * r13 points to our PACA + * r12 is saved in HSTATE_SCRATCH0(r13) + * ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE + * r9 is saved in HSTATE_SCRATCH2(r13) + * r13 is saved in HSPRG1 + * cfar is saved in HSTATE_CFAR(r13) + * ppr is saved in HSTATE_PPR(r13) */ kvmppc_bad_host_intr: + /* + * Switch to the emergency stack, but start half-way down in + * case we were already on it. + */ + mr r9, r1 + std r1, PACAR1(r13) + ld r1, PACAEMERGSP(r13) + subi r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE + std r9, 0(r1) + std r0, GPR0(r1) + std r9, GPR1(r1) + std r2, GPR2(r1) + SAVE_4GPRS(3, r1) + SAVE_2GPRS(7, r1) + srdi r0, r12, 32 + clrldi r12, r12, 32 + std r0, _CCR(r1) + std r12, _TRAP(r1) + andi. r0, r12, 2 + beq 1f + mfspr r3, SPRN_HSRR0 + mfspr r4, SPRN_HSRR1 + mfspr r5, SPRN_HDAR + mfspr r6, SPRN_HDSISR + b 2f +1: mfspr r3, SPRN_SRR0 + mfspr r4, SPRN_SRR1 + mfspr r5, SPRN_DAR + mfspr r6, SPRN_DSISR +2: std r3, _NIP(r1) + std r4, _MSR(r1) + std r5, _DAR(r1) + std r6, _DSISR(r1) + ld r9, HSTATE_SCRATCH2(r13) + ld r12, HSTATE_SCRATCH0(r13) + GET_SCRATCH0(r0) + SAVE_4GPRS(9, r1) + std r0, GPR13(r1) + SAVE_NVGPRS(r1) + ld r5, HSTATE_CFAR(r13) + std r5, ORIG_GPR3(r1) + mflr r3 +#ifdef CONFIG_RELOCATABLE + ld r4, HSTATE_SCRATCH1(r13) +#else + mfctr r4 +#endif + mfxer r5 + lbz r6, PACASOFTIRQEN(r13) + std r3, _LINK(r1) + std r4, _CTR(r1) + std r5, _XER(r1) + std r6, SOFTE(r1) + ld r2, PACATOC(r13) + LOAD_REG_IMMEDIATE(3, 0x7265677368657265) + std r3, STACK_FRAME_OVERHEAD-16(r1) + + /* + * On POWER9 do a minimal restore of the MMU and call C code, + * which will print a message and panic. + * XXX On POWER7 and POWER8, we just spin here since we don't + * know what the other threads are doing (and we don't want to + * coordinate with them) - but at least we now have register state + * in memory that we might be able to look at from another CPU. + */ +BEGIN_FTR_SECTION b . +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_KVM(r9) + + li r0, 0 + mtspr SPRN_AMR, r0 + mtspr SPRN_IAMR, r0 + mtspr SPRN_CIABR, r0 + mtspr SPRN_DAWRX, r0 + + /* Flush the ERAT on radix P9 DD1 guest exit */ +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) + +BEGIN_MMU_FTR_SECTION + b 4f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) + + slbmte r0, r0 + slbia + ptesync + ld r8, PACA_SLBSHADOWPTR(r13) + .rept SLB_NUM_BOLTED + li r3, SLBSHADOW_SAVEAREA + LDX_BE r5, r8, r3 + addi r3, r3, 8 + LDX_BE r6, r8, r3 + andis. r7, r5, SLB_ESID_V@h + beq 3f + slbmte r6, r5 +3: addi r8, r8, 16 + .endr + +4: lwz r7, KVM_HOST_LPID(r10) + mtspr SPRN_LPID, r7 + mtspr SPRN_PID, r0 + ld r8, KVM_HOST_LPCR(r10) + mtspr SPRN_LPCR, r8 + isync + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + /* + * Turn on the MMU and jump to C code + */ + bcl 20, 31, .+4 +5: mflr r3 + addi r3, r3, 9f - 5b + ld r4, PACAKMSR(r13) + mtspr SPRN_SRR0, r3 + mtspr SPRN_SRR1, r4 + rfid +9: addi r3, r1, STACK_FRAME_OVERHEAD + bl kvmppc_bad_interrupt + b 9b /* * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 69a09444d46e..d0dc8624198f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1326,12 +1326,22 @@ static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, kvmppc_set_pvr_pr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; +#ifdef CONFIG_PPC_BOOK3S_64 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { + /* Flush all SLB entries */ + vcpu->arch.mmu.slbmte(vcpu, 0, 0); + vcpu->arch.mmu.slbia(vcpu); + for (i = 0; i < 64; i++) { - vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, - sregs->u.s.ppc64.slb[i].slbe); + u64 rb = sregs->u.s.ppc64.slb[i].slbe; + u64 rs = sregs->u.s.ppc64.slb[i].slbv; + + if (rb & SLB_ESID_V) + vcpu->arch.mmu.slbmte(vcpu, rs, rb); } - } else { + } else +#endif + { for (i = 0; i < 16; i++) { vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); } diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index 8a4205fa774f..dae3be5ff42b 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd) case H_PROTECT: case H_BULK_REMOVE: case H_PUT_TCE: + case H_PUT_TCE_INDIRECT: + case H_STUFF_TCE: case H_CEDE: case H_LOGICAL_CI_LOAD: case H_LOGICAL_CI_STORE: diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c6c734424c70..423b21393bc9 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -377,7 +377,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, start = vma->vm_pgoff; end = start + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + vma_pages(vma); pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1abe6eb51335..6b6c53c42ac9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -590,8 +590,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(hv_enabled && !radix_enabled() && - cpu_has_feature(CPU_FTR_ARCH_300)); + r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300)); break; #endif case KVM_CAP_SYNC_MMU: |