diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-16 13:06:27 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-16 13:06:27 -0800 |
commit | 051089a2eed9a9977080774f3793ff2688cd3878 (patch) | |
tree | 29e23a60ea7e98633a3eef8dd436b57d7d41986d /arch/x86 | |
parent | 974aa5630b318938273d7efe7a2cf031c7b927db (diff) | |
parent | 646d944c2ef5a3b298c4e150494c71b9272d8b47 (diff) |
Merge tag 'for-linus-4.15-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull xen updates from Juergen Gross:
"Xen features and fixes for v4.15-rc1
Apart from several small fixes it contains the following features:
- a series by Joao Martins to add vdso support of the pv clock
interface
- a series by Juergen Gross to add support for Xen pv guests to be
able to run on 5 level paging hosts
- a series by Stefano Stabellini adding the Xen pvcalls frontend
driver using a paravirtualized socket interface"
* tag 'for-linus-4.15-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (34 commits)
xen/pvcalls: fix potential endless loop in pvcalls-front.c
xen/pvcalls: Add MODULE_LICENSE()
MAINTAINERS: xen, kvm: track pvclock-abi.h changes
x86/xen/time: setup vcpu 0 time info page
x86/xen/time: set pvclock flags on xen_time_init()
x86/pvclock: add setter for pvclock_pvti_cpu0_va
ptp_kvm: probe for kvm guest availability
xen/privcmd: remove unused variable pageidx
xen: select grant interface version
xen: update arch/x86/include/asm/xen/cpuid.h
xen: add grant interface version dependent constants to gnttab_ops
xen: limit grant v2 interface to the v1 functionality
xen: re-introduce support for grant v2 interface
xen: support priv-mapping in an HVM tools domain
xen/pvcalls: remove redundant check for irq >= 0
xen/pvcalls: fix unsigned less than zero error check
xen/time: Return -ENODEV from xen_get_wallclock()
xen/pvcalls-front: mark expected switch fall-through
xen: xenbus_probe_frontend: mark expected switch fall-throughs
xen/time: do not decrease steal time after live migration on xen
...
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/entry/vdso/vma.c | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/pvclock.h | 19 | ||||
-rw-r--r-- | arch/x86/include/asm/xen/cpuid.h | 42 | ||||
-rw-r--r-- | arch/x86/include/asm/xen/page.h | 11 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 7 | ||||
-rw-r--r-- | arch/x86/kernel/pvclock.c | 14 | ||||
-rw-r--r-- | arch/x86/xen/grant-table.c | 60 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 14 | ||||
-rw-r--r-- | arch/x86/xen/mmu_pv.c | 4 | ||||
-rw-r--r-- | arch/x86/xen/suspend.c | 4 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 99 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 2 |
12 files changed, 241 insertions, 37 deletions
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index d63053142b16..5b8b556dbb12 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -112,7 +112,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = - pvclock_pvti_cpu0_va(); + pvclock_get_pvti_cpu0_va(); if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn_prot( vma, diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 3e4ed8fb5f91..a7471dcd2205 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -5,15 +5,6 @@ #include <linux/clocksource.h> #include <asm/pvclock-abi.h> -#ifdef CONFIG_KVM_GUEST -extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); -#else -static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -{ - return NULL; -} -#endif - /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); @@ -102,4 +93,14 @@ struct pvclock_vsyscall_time_info { #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) +#ifdef CONFIG_PARAVIRT_CLOCK +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti); +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void); +#else +static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return NULL; +} +#endif + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h index 3bdd10d71223..a9630104f1c4 100644 --- a/arch/x86/include/asm/xen/cpuid.h +++ b/arch/x86/include/asm/xen/cpuid.h @@ -74,21 +74,43 @@ #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0) /* + * Leaf 4 (0x40000x03) + * Sub-leaf 0: EAX: bit 0: emulated tsc + * bit 1: host tsc is known to be reliable + * bit 2: RDTSCP instruction available + * EBX: tsc_mode: 0=default (emulate if necessary), 1=emulate, + * 2=no emulation, 3=no emulation + TSC_AUX support + * ECX: guest tsc frequency in kHz + * EDX: guest tsc incarnation (migration count) + * Sub-leaf 1: EAX: tsc offset low part + * EBX: tsc offset high part + * ECX: multiplicator for tsc->ns conversion + * EDX: shift amount for tsc->ns conversion + * Sub-leaf 2: EAX: host tsc frequency in kHz + */ + +/* * Leaf 5 (0x40000x04) * HVM-specific features - * EAX: Features - * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) + * Sub-leaf 0: EAX: Features + * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) */ - -/* Virtualized APIC registers */ -#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) -/* Virtualized x2APIC accesses */ -#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) +#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */ +#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */ /* Memory mapped from other domains has valid IOMMU entries */ #define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) -/* vcpu id is present in EBX */ -#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) +#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */ + +/* + * Leaf 6 (0x40000x05) + * PV-specific parameters + * Sub-leaf 0: EAX: max available sub-leaf + * Sub-leaf 0: EBX: bits 0-7: max machine address width + */ + +/* Max. address width in bits taking memory hotplug into account. */ +#define XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK (0xffu << 0) -#define XEN_CPUID_MAX_NUM_LEAVES 4 +#define XEN_CPUID_MAX_NUM_LEAVES 5 #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c6b84245e5ab..123e669bf363 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -27,6 +27,15 @@ typedef struct xpaddr { phys_addr_t paddr; } xpaddr_t; +#ifdef CONFIG_X86_64 +#define XEN_PHYSICAL_MASK __sme_clr((1UL << 52) - 1) +#else +#define XEN_PHYSICAL_MASK __PHYSICAL_MASK +#endif + +#define XEN_PTE_MFN_MASK ((pteval_t)(((signed long)PAGE_MASK) & \ + XEN_PHYSICAL_MASK)) + #define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) #define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) @@ -278,7 +287,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) static inline unsigned long pte_mfn(pte_t pte) { - return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pte.pte & XEN_PTE_MFN_MASK) >> PAGE_SHIFT; } static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 77b492c2d658..8b26c9e01cc4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -48,12 +48,6 @@ early_param("no-kvmclock", parse_no_kvmclock); static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock *wall_clock; -struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -{ - return hv_clock; -} -EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); - /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for @@ -377,6 +371,7 @@ int __init kvm_setup_vsyscall_timeinfo(void) return 1; } + pvclock_set_pvti_cpu0_va(hv_clock); put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 5c3f6d6a5078..761f6af6efa5 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -25,8 +25,10 @@ #include <asm/fixmap.h> #include <asm/pvclock.h> +#include <asm/vgtod.h> static u8 valid_flags __read_mostly = 0; +static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly; void pvclock_set_flags(u8 flags) { @@ -144,3 +146,15 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } + +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) +{ + WARN_ON(vclock_was_used(VCLOCK_PVCLOCK)); + pvti_cpu0_va = pvti; +} + +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return pvti_cpu0_va; +} +EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va); diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 809b6c812654..92ccc718152d 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -49,7 +49,7 @@ static struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; -} gnttab_shared_vm_area; +} gnttab_shared_vm_area, gnttab_status_vm_area; int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, @@ -73,16 +73,43 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return 0; } +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared) +{ + grant_status_t *shared = *__shared; + unsigned long addr; + unsigned long i; + + if (shared == NULL) + *__shared = shared = gnttab_status_vm_area.area->addr; + + addr = (unsigned long)shared; + + for (i = 0; i < nr_gframes; i++) { + set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i], + mfn_pte(frames[i], PAGE_KERNEL)); + addr += PAGE_SIZE; + } + + return 0; +} + void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) { + pte_t **ptes; unsigned long addr; unsigned long i; + if (shared == gnttab_status_vm_area.area->addr) + ptes = gnttab_status_vm_area.ptes; + else + ptes = gnttab_shared_vm_area.ptes; + addr = (unsigned long)shared; for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i], - __pte(0)); + set_pte_at(&init_mm, addr, ptes[i], __pte(0)); addr += PAGE_SIZE; } } @@ -102,12 +129,35 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) return 0; } -int arch_gnttab_init(unsigned long nr_shared) +static void arch_gnttab_vfree(struct gnttab_vm_area *area) { + free_vm_area(area->area); + kfree(area->ptes); +} + +int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) +{ + int ret; + if (!xen_pv_domain()) return 0; - return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); + ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); + if (ret < 0) + return ret; + + /* + * Always allocate the space for the status frames in case + * we're migrated to a host with V2 support. + */ + ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status); + if (ret < 0) + goto err; + + return 0; +err: + arch_gnttab_vfree(&gnttab_shared_vm_area); + return -ENOMEM; } #ifdef CONFIG_XEN_PVH diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3e15345abfe7..d33e7dbe3129 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -172,6 +172,9 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma, pgprot_t prot, unsigned domid, struct page **pages) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -EOPNOTSUPP; + return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages); } EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); @@ -182,6 +185,10 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, + prot, domid, pages); + /* We BUG_ON because it's a programmer error to pass a NULL err_ptr, * and the consequences later is quite hard to detect what the actual * cause of "wrong memory was mapped in". @@ -193,9 +200,12 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); /* Returns: 0 success */ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, - int numpgs, struct page **pages) + int nr, struct page **pages) { - if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap)) + return xen_xlate_unmap_gfn_range(vma, nr, pages); + + if (!pages) return 0; return -EINVAL; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2ccdaba31a07..fc048ec686e7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -315,7 +315,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT; unsigned long pfn = mfn_to_pfn(mfn); pteval_t flags = val & PTE_FLAGS_MASK; @@ -1721,7 +1721,7 @@ static unsigned long __init m2p(phys_addr_t maddr) { phys_addr_t paddr; - maddr &= PTE_PFN_MASK; + maddr &= XEN_PTE_MFN_MASK; paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; return paddr; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 92bf5ecb6baf..d9f96cc5d743 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -17,6 +17,8 @@ void xen_arch_pre_suspend(void) { + xen_save_time_memory_area(); + if (xen_pv_domain()) xen_pv_pre_suspend(); } @@ -27,6 +29,8 @@ void xen_arch_post_suspend(int cancelled) xen_pv_post_suspend(cancelled); else xen_hvm_post_suspend(cancelled); + + xen_restore_time_memory_area(); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 80c2a4bdf230..29163c43ebbd 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -75,7 +75,7 @@ static void xen_get_wallclock(struct timespec *now) static int xen_set_wallclock(const struct timespec *now) { - return -1; + return -ENODEV; } static int xen_pvclock_gtod_notify(struct notifier_block *nb, @@ -371,8 +371,95 @@ static const struct pv_time_ops xen_time_ops __initconst = { .steal_clock = xen_steal_clock, }; +static struct pvclock_vsyscall_time_info *xen_clock __read_mostly; + +void xen_save_time_memory_area(void) +{ + struct vcpu_register_time_memory_area t; + int ret; + + if (!xen_clock) + return; + + t.addr.v = NULL; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + if (ret != 0) + pr_notice("Cannot save secondary vcpu_time_info (err %d)", + ret); + else + clear_page(xen_clock); +} + +void xen_restore_time_memory_area(void) +{ + struct vcpu_register_time_memory_area t; + int ret; + + if (!xen_clock) + return; + + t.addr.v = &xen_clock->pvti; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + + /* + * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the + * secondary time info with Xen or if we migrated to a host without the + * necessary flags. On both of these cases what happens is either + * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT + * bit set. Userspace checks the latter and if 0, it discards the data + * in pvti and fallbacks to a system call for a reliable timestamp. + */ + if (ret != 0) + pr_notice("Cannot restore secondary vcpu_time_info (err %d)", + ret); +} + +static void xen_setup_vsyscall_time_info(void) +{ + struct vcpu_register_time_memory_area t; + struct pvclock_vsyscall_time_info *ti; + int ret; + + ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL); + if (!ti) + return; + + t.addr.v = &ti->pvti; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + if (ret) { + pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret); + free_page((unsigned long)ti); + return; + } + + /* + * If primary time info had this bit set, secondary should too since + * it's the same data on both just different memory regions. But we + * still check it in case hypervisor is buggy. + */ + if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) { + t.addr.v = NULL; + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, + 0, &t); + if (!ret) + free_page((unsigned long)ti); + + pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n"); + return; + } + + xen_clock = ti; + pvclock_set_pvti_cpu0_va(xen_clock); + + xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK; +} + static void __init xen_time_init(void) { + struct pvclock_vcpu_time_info *pvti; int cpu = smp_processor_id(); struct timespec tp; @@ -396,6 +483,16 @@ static void __init xen_time_init(void) setup_force_cpu_cap(X86_FEATURE_TSC); + /* + * We check ahead on the primary time info if this + * bit is supported hence speeding up Xen clocksource. + */ + pvti = &__this_cpu_read(xen_vcpu)->time; + if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) { + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + xen_setup_vsyscall_time_info(); + } + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f377e1820c6c..75011b80660f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -70,6 +70,8 @@ void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); +void xen_save_time_memory_area(void); +void xen_restore_time_memory_area(void); void __init xen_init_time_ops(void); void __init xen_hvm_init_time_ops(void); |