Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  50
-rw-r--r--  mm/Makefile          |   2
-rw-r--r--  mm/gup.c             |   7
-rw-r--r--  mm/hmm.c             | 587
-rw-r--r--  mm/madvise.c         |   2
-rw-r--r--  mm/memcontrol.c      |  13
-rw-r--r--  mm/memory-failure.c  |   6
-rw-r--r--  mm/memory.c          |  49
-rw-r--r--  mm/memory_hotplug.c  |   6
-rw-r--r--  mm/mempolicy.c       |   1
-rw-r--r--  mm/migrate.c         |  28
-rw-r--r--  mm/page_alloc.c      |  13
-rw-r--r--  mm/swap.c            |  13
13 files changed, 171 insertions(+), 606 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index 0b4352557dd5..495d7368ced8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -670,47 +670,17 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config ARCH_HAS_HMM_MIRROR - bool - default y - depends on (X86_64 || PPC64) - depends on MMU && 64BIT - -config ARCH_HAS_HMM_DEVICE - bool - default y - depends on (X86_64 || PPC64) - depends on MEMORY_HOTPLUG - depends on MEMORY_HOTREMOVE - depends on SPARSEMEM_VMEMMAP - depends on ARCH_HAS_ZONE_DEVICE - select XARRAY_MULTI - -config ARCH_HAS_HMM - bool - default y - depends on (X86_64 || PPC64) - depends on ZONE_DEVICE - depends on MMU && 64BIT - depends on MEMORY_HOTPLUG - depends on MEMORY_HOTREMOVE - depends on SPARSEMEM_VMEMMAP - config MIGRATE_VMA_HELPER bool config DEV_PAGEMAP_OPS bool -config HMM - bool - select MMU_NOTIFIER - select MIGRATE_VMA_HELPER - config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" - depends on ARCH_HAS_HMM - select HMM + depends on (X86_64 || PPC64) + depends on MMU && 64BIT + select MMU_NOTIFIER help Select HMM_MIRROR if you want to mirror range of the CPU page table of a process into a device page table. Here, mirror means "keep synchronized". @@ -720,8 +690,7 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" - depends on ARCH_HAS_HMM - select HMM + depends on ZONE_DEVICE select DEV_PAGEMAP_OPS help @@ -729,17 +698,6 @@ config DEVICE_PRIVATE memory; i.e., memory that is only accessible from the device (or group of devices). You likely also want to select HMM_MIRROR. -config DEVICE_PUBLIC - bool "Addressable device memory (like GPU memory)" - depends on ARCH_HAS_HMM - select HMM - select DEV_PAGEMAP_OPS - - help - Allows creation of struct pages to represent addressable device - memory; i.e., memory that is accessible from both the device and - the CPU - config FRAME_VECTOR bool diff --git a/mm/Makefile b/mm/Makefile index dc0746ca1109..338e528ad436 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o -obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o @@ -609,13 +609,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); - - /* - * This should never happen (a device public page in the gate - * area). 
- */ - if (is_device_public_page(*page)) - goto unmap; } if (unlikely(!try_get_page(*page))) { ret = -ENOMEM; @@ -20,26 +20,14 @@ #include <linux/swapops.h> #include <linux/hugetlb.h> #include <linux/memremap.h> +#include <linux/sched/mm.h> #include <linux/jump_label.h> #include <linux/dma-mapping.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) - -#if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -static inline struct hmm *mm_get_hmm(struct mm_struct *mm) -{ - struct hmm *hmm = READ_ONCE(mm->hmm); - - if (hmm && kref_get_unless_zero(&hmm->kref)) - return hmm; - - return NULL; -} - /** * hmm_get_or_create - register HMM against an mm (HMM internal) * @@ -54,11 +42,16 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm) */ static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = mm_get_hmm(mm); - bool cleanup = false; + struct hmm *hmm; - if (hmm) - return hmm; + lockdep_assert_held_write(&mm->mmap_sem); + + /* Abuse the page_table_lock to also protect mm->hmm. */ + spin_lock(&mm->page_table_lock); + hmm = mm->hmm; + if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref)) + goto out_unlock; + spin_unlock(&mm->page_table_lock); hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) @@ -68,55 +61,50 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) init_rwsem(&hmm->mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); - mutex_init(&hmm->lock); + spin_lock_init(&hmm->ranges_lock); kref_init(&hmm->kref); hmm->notifiers = 0; - hmm->dead = false; hmm->mm = mm; - spin_lock(&mm->page_table_lock); - if (!mm->hmm) - mm->hmm = hmm; - else - cleanup = true; - spin_unlock(&mm->page_table_lock); + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { + kfree(hmm); + return NULL; + } - if (cleanup) - goto error; + mmgrab(hmm->mm); /* - * We should only get here if hold the mmap_sem in write mode ie on - * registration of first mirror through hmm_mirror_register() + * We hold the exclusive mmap_sem here so we know that mm->hmm is + * still NULL or 0 kref, and is safe to update. 
*/ - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) - goto error_mm; + spin_lock(&mm->page_table_lock); + mm->hmm = hmm; +out_unlock: + spin_unlock(&mm->page_table_lock); return hmm; +} -error_mm: - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); -error: +static void hmm_free_rcu(struct rcu_head *rcu) +{ + struct hmm *hmm = container_of(rcu, struct hmm, rcu); + + mmdrop(hmm->mm); kfree(hmm); - return NULL; } static void hmm_free(struct kref *kref) { struct hmm *hmm = container_of(kref, struct hmm, kref); - struct mm_struct *mm = hmm->mm; - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + spin_lock(&hmm->mm->page_table_lock); + if (hmm->mm->hmm == hmm) + hmm->mm->hmm = NULL; + spin_unlock(&hmm->mm->page_table_lock); - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); - - kfree(hmm); + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm); + mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); } static inline void hmm_put(struct hmm *hmm) @@ -124,86 +112,73 @@ static inline void hmm_put(struct hmm *hmm) kref_put(&hmm->kref, hmm_free); } -void hmm_mm_destroy(struct mm_struct *mm) +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct hmm *hmm; + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); + struct hmm_mirror *mirror; - spin_lock(&mm->page_table_lock); - hmm = mm_get_hmm(mm); - mm->hmm = NULL; - if (hmm) { - hmm->mm = NULL; - hmm->dead = true; - spin_unlock(&mm->page_table_lock); - hmm_put(hmm); + /* Bail out if hmm is in the process of being freed */ + if (!kref_get_unless_zero(&hmm->kref)) return; + + /* + * Since hmm_range_register() holds the mmget() lock hmm_release() is + * prevented as long as a range exists. + */ + WARN_ON(!list_empty_careful(&hmm->ranges)); + + down_read(&hmm->mirrors_sem); + list_for_each_entry(mirror, &hmm->mirrors, list) { + /* + * Note: The driver is not allowed to trigger + * hmm_mirror_unregister() from this thread. + */ + if (mirror->ops->release) + mirror->ops->release(mirror); } + up_read(&hmm->mirrors_sem); - spin_unlock(&mm->page_table_lock); + hmm_put(hmm); } -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) +static void notifiers_decrement(struct hmm *hmm) { - struct hmm *hmm = mm_get_hmm(mm); - struct hmm_mirror *mirror; - struct hmm_range *range; - - /* Report this HMM as dying. */ - hmm->dead = true; + unsigned long flags; - /* Wake-up everyone waiting on any range. */ - mutex_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) { - range->valid = false; - } - wake_up_all(&hmm->wq); - mutex_unlock(&hmm->lock); + spin_lock_irqsave(&hmm->ranges_lock, flags); + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; - down_write(&hmm->mirrors_sem); - mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, - list); - while (mirror) { - list_del_init(&mirror->list); - if (mirror->ops->release) { - /* - * Drop mirrors_sem so callback can wait on any pending - * work that might itself trigger mmu_notifier callback - * and thus would deadlock with us. 
- */ - up_write(&hmm->mirrors_sem); - mirror->ops->release(mirror); - down_write(&hmm->mirrors_sem); + list_for_each_entry(range, &hmm->ranges, list) { + if (range->valid) + continue; + range->valid = true; } - mirror = list_first_entry_or_null(&hmm->mirrors, - struct hmm_mirror, list); + wake_up_all(&hmm->wq); } - up_write(&hmm->mirrors_sem); - - hmm_put(hmm); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); } static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_update update; struct hmm_range *range; + unsigned long flags; int ret = 0; - VM_BUG_ON(!hmm); + if (!kref_get_unless_zero(&hmm->kref)) + return 0; update.start = nrange->start; update.end = nrange->end; update.event = HMM_UPDATE_INVALIDATE; update.blockable = mmu_notifier_range_blockable(nrange); - if (mmu_notifier_range_blockable(nrange)) - mutex_lock(&hmm->lock); - else if (!mutex_trylock(&hmm->lock)) { - ret = -EAGAIN; - goto out; - } + spin_lock_irqsave(&hmm->ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, &hmm->ranges, list) { if (update.end < range->start || update.start >= range->end) @@ -211,7 +186,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, range->valid = false; } - mutex_unlock(&hmm->lock); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); if (mmu_notifier_range_blockable(nrange)) down_read(&hmm->mirrors_sem); @@ -219,19 +194,23 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, ret = -EAGAIN; goto out; } + list_for_each_entry(mirror, &hmm->mirrors, list) { - int ret; + int rc; - ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); - if (!update.blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); + rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + if (rc) { + if (WARN_ON(update.blockable || rc != -EAGAIN)) + continue; ret = -EAGAIN; - goto out; + break; } } up_read(&hmm->mirrors_sem); out: + if (ret) + notifiers_decrement(hmm); hmm_put(hmm); return ret; } @@ -239,24 +218,12 @@ out: static void hmm_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); - - VM_BUG_ON(!hmm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - mutex_lock(&hmm->lock); - hmm->notifiers--; - if (!hmm->notifiers) { - struct hmm_range *range; - - list_for_each_entry(range, &hmm->ranges, list) { - if (range->valid) - continue; - range->valid = true; - } - wake_up_all(&hmm->wq); - } - mutex_unlock(&hmm->lock); + if (!kref_get_unless_zero(&hmm->kref)) + return; + notifiers_decrement(hmm); hmm_put(hmm); } @@ -271,14 +238,15 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { * * @mirror: new mirror struct to register * @mm: mm to register against + * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments * * To start mirroring a process address space, the device driver must register * an HMM mirror struct. - * - * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! 
*/ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) { + lockdep_assert_held_write(&mm->mmap_sem); + /* Sanity check */ if (!mm || !mirror || !mirror->ops) return -EINVAL; @@ -298,23 +266,17 @@ EXPORT_SYMBOL(hmm_mirror_register); /* * hmm_mirror_unregister() - unregister a mirror * - * @mirror: new mirror struct to register + * @mirror: mirror struct to unregister * * Stop mirroring a process address space, and cleanup. */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = READ_ONCE(mirror->hmm); - - if (hmm == NULL) - return; + struct hmm *hmm = mirror->hmm; down_write(&hmm->mirrors_sem); - list_del_init(&mirror->list); - /* To protect us against double unregister ... */ - mirror->hmm = NULL; + list_del(&mirror->list); up_write(&hmm->mirrors_sem); - hmm_put(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -330,7 +292,7 @@ struct hmm_vma_walk { static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, bool write_fault, uint64_t *pfn) { - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; + unsigned int flags = FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; @@ -372,7 +334,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EBUSY after page fault, or page fault error + * Return: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -550,7 +512,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { - if (pte_none(pte) || !pte_present(pte)) + if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) return 0; return pte_write(pte) ? range->flags[HMM_PFN_VALID] | range->flags[HMM_PFN_WRITE] : @@ -788,7 +750,6 @@ again: return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); -#ifdef CONFIG_HUGETLB_PAGE pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); for (i = 0; i < npages; ++i, ++pfn) { hmm_vma_walk->pgmap = get_dev_pagemap(pfn, @@ -804,9 +765,6 @@ again: } hmm_vma_walk->last = end; return 0; -#else - return -EINVAL; -#endif } split_huge_pud(walk->vma, pudp, addr); @@ -909,12 +867,14 @@ static void hmm_pfns_clear(struct hmm_range *range, * Track updates to the CPU page table see include/linux/hmm.h */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); + struct hmm *hmm = mirror->hmm; + unsigned long flags; range->valid = false; range->hmm = NULL; @@ -928,28 +888,24 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - range->hmm = hmm_get_or_create(mm); - if (!range->hmm) - return -EFAULT; - - /* Check if hmm_mm_destroy() was call. */ - if (range->hmm->mm == NULL || range->hmm->dead) { - hmm_put(range->hmm); + /* Prevent hmm_release() from running while the range is valid */ + if (!mmget_not_zero(hmm->mm)) return -EFAULT; - } - /* Initialize range to track CPU page table update */ - mutex_lock(&range->hmm->lock); + /* Initialize range to track CPU page table updates. 
*/ + spin_lock_irqsave(&hmm->ranges_lock, flags); - list_add_rcu(&range->list, &range->hmm->ranges); + range->hmm = hmm; + kref_get(&hmm->kref); + list_add(&range->list, &hmm->ranges); /* * If there are any concurrent notifiers we have to wait for them for * the range to be valid (see hmm_range_wait_until_valid()). */ - if (!range->hmm->notifiers) + if (!hmm->notifiers) range->valid = true; - mutex_unlock(&range->hmm->lock); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); return 0; } @@ -964,25 +920,31 @@ EXPORT_SYMBOL(hmm_range_register); */ void hmm_range_unregister(struct hmm_range *range) { - /* Sanity check this really should not happen. */ - if (range->hmm == NULL || range->end <= range->start) - return; + struct hmm *hmm = range->hmm; + unsigned long flags; - mutex_lock(&range->hmm->lock); - list_del_rcu(&range->list); - mutex_unlock(&range->hmm->lock); + spin_lock_irqsave(&hmm->ranges_lock, flags); + list_del_init(&range->list); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); /* Drop reference taken by hmm_range_register() */ + mmput(hmm->mm); + hmm_put(hmm); + + /* + * The range is now invalid and the ref on the hmm is dropped, so + * poison the pointer. Leave other fields in place, for the caller's + * use. + */ range->valid = false; - hmm_put(range->hmm); - range->hmm = NULL; + memset(&range->hmm, POISON_INUSE, sizeof(range->hmm)); } EXPORT_SYMBOL(hmm_range_unregister); /* * hmm_range_snapshot() - snapshot CPU page table for a range * @range: range - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid * permission (for instance asking for write and range is read only), * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid * vma or it is illegal to access that range), number of valid pages @@ -1001,10 +963,7 @@ long hmm_range_snapshot(struct hmm_range *range) struct vm_area_struct *vma; struct mm_walk mm_walk; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; - + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. */ if (!range->valid) @@ -1015,9 +974,8 @@ long hmm_range_snapshot(struct hmm_range *range) return -EFAULT; if (is_vm_hugetlb_page(vma)) { - struct hstate *h = hstate_vma(vma); - - if (huge_page_shift(h) != range->page_shift && + if (huge_page_shift(hstate_vma(vma)) != + range->page_shift && range->page_shift != PAGE_SHIFT) return -EINVAL; } else { @@ -1066,7 +1024,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of valid pages in range->pfns[] (from range start + * Return: number of valid pages in range->pfns[] (from range start * address). This may be zero. If the return value is negative, * then one of the following values may be returned: * @@ -1100,9 +1058,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) struct mm_walk mm_walk; int ret; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. 
*/ @@ -1184,7 +1140,7 @@ EXPORT_SYMBOL(hmm_range_fault); * @device: device against to dma map page to * @daddrs: dma address of mapped pages * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been * drop and you need to try again, some other error value otherwise * * Note same usage pattern as hmm_range_fault(). @@ -1272,7 +1228,7 @@ EXPORT_SYMBOL(hmm_range_dma_map); * @device: device against which dma map was done * @daddrs: dma address of mapped pages * @dirty: dirty page if it had the write flag set - * Returns: number of page unmapped on success, -EINVAL otherwise + * Return: number of page unmapped on success, -EINVAL otherwise * * Note that caller MUST abide by mmu notifier or use HMM mirror and abide * to the sync_cpu_device_pagetables() callback so that it is safe here to @@ -1328,284 +1284,3 @@ long hmm_range_dma_unmap(struct hmm_range *range, return cpages; } EXPORT_SYMBOL(hmm_range_dma_unmap); -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - - -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - - page = alloc_page_vma(GFP_HIGHUSER, vma, addr); - if (!page) - return NULL; - lock_page(page); - return page; -} -EXPORT_SYMBOL(hmm_vma_alloc_locked_page); - - -static void hmm_devmem_ref_release(struct percpu_ref *ref) -{ - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); - complete(&devmem->completion); -} - -static void hmm_devmem_ref_exit(struct percpu_ref *ref) -{ - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); - wait_for_completion(&devmem->completion); - percpu_ref_exit(ref); -} - -static void hmm_devmem_ref_kill(struct percpu_ref *ref) -{ - percpu_ref_kill(ref); -} - -static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp) -{ - struct hmm_devmem *devmem = page->pgmap->data; - - return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); -} - -static void hmm_devmem_free(struct page *page, void *data) -{ - struct hmm_devmem *devmem = data; - - page->mapping = NULL; - - devmem->ops->free(devmem, page); -} - -/* - * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory - * - * @ops: memory event device driver callback (see struct hmm_devmem_ops) - * @device: device struct to bind the resource too - * @size: size in bytes of the device memory to add - * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise - * - * This function first finds an empty range of physical address big enough to - * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which - * in turn allocates struct pages. It does not do anything beyond that; all - * events affecting the memory will go through the various callbacks provided - * by hmm_devmem_ops struct. - * - * Device driver should call this function during device initialization and - * is then responsible of memory management. HMM only provides helpers. 
- */ -struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size) -{ - struct hmm_devmem *devmem; - resource_size_t addr; - void *result; - int ret; - - dev_pagemap_get_ops(); - - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); - if (!devmem) - return ERR_PTR(-ENOMEM); - - init_completion(&devmem->completion); - devmem->pfn_first = -1UL; - devmem->pfn_last = -1UL; - devmem->resource = NULL; - devmem->device = device; - devmem->ops = ops; - - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, - 0, GFP_KERNEL); - if (ret) - return ERR_PTR(ret); - - size = ALIGN(size, PA_SECTION_SIZE); - addr = min((unsigned long)iomem_resource.end, - (1UL << MAX_PHYSMEM_BITS) - 1); - addr = addr - size + 1UL; - - /* - * FIXME add a new helper to quickly walk resource tree and find free - * range - * - * FIXME what about ioport_resource resource ? - */ - for (; addr > size && addr >= iomem_resource.start; addr -= size) { - ret = region_intersects(addr, size, 0, IORES_DESC_NONE); - if (ret != REGION_DISJOINT) - continue; - - devmem->resource = devm_request_mem_region(device, addr, size, - dev_name(device)); - if (!devmem->resource) - return ERR_PTR(-ENOMEM); - break; - } - if (!devmem->resource) - return ERR_PTR(-ERANGE); - - devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; - devmem->pfn_last = devmem->pfn_first + - (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; - - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.altmap_valid = false; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; - - result = devm_memremap_pages(devmem->device, &devmem->pagemap); - if (IS_ERR(result)) - return result; - return devmem; -} -EXPORT_SYMBOL_GPL(hmm_devmem_add); - -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, - struct device *device, - struct resource *res) -{ - struct hmm_devmem *devmem; - void *result; - int ret; - - if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) - return ERR_PTR(-EINVAL); - - dev_pagemap_get_ops(); - - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); - if (!devmem) - return ERR_PTR(-ENOMEM); - - init_completion(&devmem->completion); - devmem->pfn_first = -1UL; - devmem->pfn_last = -1UL; - devmem->resource = res; - devmem->device = device; - devmem->ops = ops; - - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, - 0, GFP_KERNEL); - if (ret) - return ERR_PTR(ret); - - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; - devmem->pfn_last = devmem->pfn_first + - (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; - - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.altmap_valid = false; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; - - result = devm_memremap_pages(devmem->device, &devmem->pagemap); - if (IS_ERR(result)) - return result; - return devmem; -} -EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); - -/* - * A device driver that wants to handle multiple devices memory through a - * 
single fake device can use hmm_device to do so. This is purely a helper - * and it is not needed to make use of any HMM functionality. - */ -#define HMM_DEVICE_MAX 256 - -static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); -static DEFINE_SPINLOCK(hmm_device_lock); -static struct class *hmm_device_class; -static dev_t hmm_device_devt; - -static void hmm_device_release(struct device *device) -{ - struct hmm_device *hmm_device; - - hmm_device = container_of(device, struct hmm_device, device); - spin_lock(&hmm_device_lock); - clear_bit(hmm_device->minor, hmm_device_mask); - spin_unlock(&hmm_device_lock); - - kfree(hmm_device); -} - -struct hmm_device *hmm_device_new(void *drvdata) -{ - struct hmm_device *hmm_device; - - hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); - if (!hmm_device) - return ERR_PTR(-ENOMEM); - - spin_lock(&hmm_device_lock); - hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); - if (hmm_device->minor >= HMM_DEVICE_MAX) { - spin_unlock(&hmm_device_lock); - kfree(hmm_device); - return ERR_PTR(-EBUSY); - } - set_bit(hmm_device->minor, hmm_device_mask); - spin_unlock(&hmm_device_lock); - - dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); - hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), - hmm_device->minor); - hmm_device->device.release = hmm_device_release; - dev_set_drvdata(&hmm_device->device, drvdata); - hmm_device->device.class = hmm_device_class; - device_initialize(&hmm_device->device); - - return hmm_device; -} -EXPORT_SYMBOL(hmm_device_new); - -void hmm_device_put(struct hmm_device *hmm_device) -{ - put_device(&hmm_device->device); -} -EXPORT_SYMBOL(hmm_device_put); - -static int __init hmm_init(void) -{ - int ret; - - ret = alloc_chrdev_region(&hmm_device_devt, 0, - HMM_DEVICE_MAX, - "hmm_device"); - if (ret) - return ret; - - hmm_device_class = class_create(THIS_MODULE, "hmm_device"); - if (IS_ERR(hmm_device_class)) { - unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); - return PTR_ERR(hmm_device_class); - } - return 0; -} - -device_initcall(hmm_init); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/madvise.c b/mm/madvise.c index 628022e674a7..968df3aa069f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = _vm_normal_page(vma, addr, ptent, true); + page = vm_normal_page(vma, addr, ptent); if (!page) continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f05735b02d3..249671873aa9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4908,7 +4908,7 @@ enum mc_target_type { static struct page *mc_handle_present_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { - struct page *page = _vm_normal_page(vma, addr, ptent, true); + struct page *page = vm_normal_page(vma, addr, ptent); if (!page || !page_mapped(page)) return NULL; @@ -5109,8 +5109,8 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC - * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE + * (so ZONE_DEVICE page and thus not on the lru). * For now we such page is charge like a regular page would be as for all * intent and purposes it is just special memory taking the place of a * regular page. 
@@ -5144,8 +5144,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page) || - is_device_public_page(page)) + if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; @@ -5216,8 +5215,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, if (ptl) { /* * Note their can not be MC_TARGET_DEVICE for now as we do not - * support transparent huge page with MEMORY_DEVICE_PUBLIC or - * MEMORY_DEVICE_PRIVATE but this might change. + * support transparent huge page with MEMORY_DEVICE_PRIVATE but + * this might change. */ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7e08cbf3ba49..7ef849da8278 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto unlock; } - switch (pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_PUBLIC: + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { /* * TODO: Handle HMM pages which may need coordination * with device-side memory. */ goto unlock; - default: - break; } /* diff --git a/mm/memory.c b/mm/memory.c index 53bd59579861..89325f9c6173 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, bool with_public_device) +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { unsigned long pfn = pte_pfn(pte); @@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; if (is_zero_pfn(pfn)) return NULL; - - /* - * Device public pages are special pages (they are ZONE_DEVICE - * pages but different from persistent memory). They behave - * allmost like normal pages. The difference is that they are - * not on the lru and thus should never be involve with any- - * thing that involve lru manipulation (mlock, numa balancing, - * ...). - * - * This is why we still want to return NULL for such page from - * vm_normal_page() so that we do not have to special case all - * call site of vm_normal_page(). - */ - if (likely(pfn <= highest_memmap_pfn)) { - struct page *page = pfn_to_page(pfn); - - if (is_device_public_page(page)) { - if (with_public_device) - return page; - return NULL; - } - } - if (pte_devmap(pte)) return NULL; @@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, rss[mm_counter(page)]++; } else if (pte_devmap(pte)) { page = pte_page(pte); - - /* - * Cache coherent device memory behave like regular page and - * not like persistent memory page. 
For more informations see - * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h - */ - if (is_device_public_page(page)) { - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; - } } out_set_pte: @@ -1063,7 +1029,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = _vm_normal_page(vma, addr, ptent, true); + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -2777,13 +2743,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_private_entry(entry)) { - /* - * For un-addressable device memory we call the pgmap - * fault handler callback. The callback must migrate - * the page back to some CPU accessible page. - */ - ret = device_private_entry_fault(vma, vmf->address, entry, - vmf->flags, vmf->pmd); + vmf->page = device_private_entry_to_page(entry); + ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e096c987d261..6166ba5a15f3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, int sections_to_remove; /* In the ZONE_DEVICE case device driver owns the memory region */ - if (is_dev_zone(zone)) { - if (altmap) - map_offset = vmem_altmap_offset(altmap); - } + if (is_dev_zone(zone)) + map_offset = vmem_altmap_offset(altmap); clear_zone_contiguous(zone); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fdcb73536319..f48693f75b37 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, out: return page; } +EXPORT_SYMBOL(alloc_pages_vma); /** * alloc_pages_current - Allocate pages. diff --git a/mm/migrate.c b/mm/migrate.c index e9594bc0d406..3445747e229d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_device_private_page(new)) { entry = make_device_private_entry(new, pte_write(pte)); pte = swp_entry_to_pte(entry); - } else if (is_device_public_page(new)) { - pte = pte_mkdevmap(pte); } } @@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) * ZONE_DEVICE pages. */ expected_count += is_device_private_page(page); - expected_count += is_device_public_page(page); if (mapping) expected_count += hpage_nr_pages(page) + page_has_private(page); @@ -994,10 +991,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (!PageMappingFlags(page)) page->mapping = NULL; - if (unlikely(is_zone_device_page(newpage))) { - if (is_device_public_page(newpage)) - flush_dcache_page(newpage); - } else + if (likely(!is_zone_device_page(newpage))) flush_dcache_page(newpage); } @@ -2265,7 +2259,7 @@ again: pfn = 0; goto next; } - page = _vm_normal_page(migrate->vma, addr, pte, true); + page = vm_normal_page(migrate->vma, addr, pte); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -2406,16 +2400,7 @@ static bool migrate_vma_check_page(struct page *page) * FIXME proper solution is to rework migration_entry_wait() so * it does not need to take a reference on page. 
*/ - if (is_device_private_page(page)) - return true; - - /* - * Only allow device public page to be migrated and account for - * the extra reference count imply by ZONE_DEVICE pages. - */ - if (!is_device_public_page(page)) - return false; - extra++; + return is_device_private_page(page); } /* For file back page */ @@ -2665,11 +2650,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); entry = swp_entry_to_pte(swp_entry); - } else if (is_device_public_page(page)) { - entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); - entry = pte_mkdevmap(entry); } } else { entry = mk_pte(page, vma->vm_page_prot); @@ -2789,7 +2769,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - } else if (!is_device_public_page(newpage)) { + } else { /* * Other types of ZONE_DEVICE page are not * supported. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dbd0d5cbbcbb..8fd7f45a04eb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5925,6 +5925,7 @@ void __ref memmap_init_zone_device(struct zone *zone, { unsigned long pfn, end_pfn = start_pfn + size; struct pglist_data *pgdat = zone->zone_pgdat; + struct vmem_altmap *altmap = pgmap_altmap(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; @@ -5937,9 +5938,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * of the pages reserved for the memmap, so we can just jump to * the end of that region and start processing the device pages. */ - if (pgmap->altmap_valid) { - struct vmem_altmap *altmap = &pgmap->altmap; - + if (altmap) { start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); size = end_pfn - start_pfn; } @@ -5959,12 +5958,12 @@ void __ref memmap_init_zone_device(struct zone *zone, __SetPageReserved(page); /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer and hmm_data. It is a bug if a ZONE_DEVICE - * page is ever freed or placed on a driver-private list. + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. */ page->pgmap = pgmap; - page->hmm_data = 0; + page->zone_device_data = NULL; /* * Mark the block movable so that blocks are reserved for diff --git a/mm/swap.c b/mm/swap.c index 7ede3eddc12a..607c48229a1d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -740,15 +740,20 @@ void release_pages(struct page **pages, int nr) if (is_huge_zero_page(page)) continue; - /* Device public page can not be huge page */ - if (is_device_public_page(page)) { + if (is_zone_device_page(page)) { if (locked_pgdat) { spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); locked_pgdat = NULL; } - put_devmap_managed_page(page); - continue; + /* + * ZONE_DEVICE pages that return 'false' from + * put_devmap_managed_page() do not require special + * processing, and instead, expect a call to + * put_page_testzero(). + */ + if (put_devmap_managed_page(page)) + continue; } page = compound_head(page); |
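
The hunks above change the driver-facing contract in three places: hmm_mirror_register() now asserts that mmap_sem is held for write, hmm_range_register() is keyed to an hmm_mirror rather than an mm_struct, and CPU faults on MEMORY_DEVICE_PRIVATE pages are dispatched through pgmap->ops->migrate_to_ram() instead of the removed device_private_entry_fault(). The sketch below is a minimal illustration of the resulting driver-side flow, assembled only from the signatures visible in this diff; it is not code from the patch. The dummy_* names are hypothetical, the struct hmm_mirror_ops type name is assumed from include/linux/hmm.h of this series rather than from these hunks, and range->pfns setup plus the invalidation retry/wait handling are elided.

/*
 * Illustrative sketch only, not part of this patch.  dummy_* symbols are
 * hypothetical driver code; error handling and range->pfns/flags/values
 * setup are elided for brevity.
 */
#include <linux/hmm.h>
#include <linux/sched/mm.h>

static int dummy_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					    const struct hmm_update *update)
{
	/* Invalidate device mappings for [update->start, update->end). */
	return 0;
}

static const struct hmm_mirror_ops dummy_mirror_ops = {
	.sync_cpu_device_pagetables = dummy_sync_cpu_device_pagetables,
};

static int dummy_mirror_range(struct mm_struct *mm, struct hmm_mirror *mirror,
			      unsigned long start, unsigned long end)
{
	struct hmm_range range = {};
	long npages;
	int ret;

	mirror->ops = &dummy_mirror_ops;

	/* hmm_mirror_register() now requires mmap_sem held for write. */
	down_write(&mm->mmap_sem);
	ret = hmm_mirror_register(mirror, mm);
	up_write(&mm->mmap_sem);
	if (ret)
		return ret;

	/* Ranges are now registered against the mirror, not the mm. */
	ret = hmm_range_register(&range, mirror, start, end, PAGE_SHIFT);
	if (ret)
		return ret;

	/*
	 * A real driver waits for range.valid (or retries when the fault
	 * path reports the range was invalidated) before trusting the pfns.
	 */
	down_read(&mm->mmap_sem);
	npages = hmm_range_fault(&range, true);
	up_read(&mm->mmap_sem);

	/* ... program the device page table from range.pfns[] ... */

	hmm_range_unregister(&range);
	return npages < 0 ? npages : 0;
}

Likewise, with the hmm_devmem_* helpers and DEVICE_PUBLIC gone, a driver that owns unaddressable device memory maps it with devm_memremap_pages() directly and supplies the CPU-fault path shown in the do_swap_page() hunk. A hedged skeleton, assuming the struct dev_pagemap_ops layout introduced alongside this series:

/* Illustrative skeleton for the new CPU-fault path on device-private pages. */
#include <linux/memremap.h>
#include <linux/mm.h>

static vm_fault_t dummy_migrate_to_ram(struct vm_fault *vmf)
{
	/*
	 * Migrate vmf->page back to system memory (for example with the
	 * migrate_vma helpers) and return 0, or VM_FAULT_SIGBUS if the
	 * migration failed.
	 */
	return 0;
}

static const struct dev_pagemap_ops dummy_pagemap_ops = {
	/* .page_free = ..., to reclaim the device page on its last put */
	.migrate_to_ram = dummy_migrate_to_ram,
};

Both fragments are sketches under the stated assumptions; the authoritative definitions are the include/linux/hmm.h and include/linux/memremap.h changes made elsewhere in this series.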