summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/debug.c6
-rw-r--r--mm/early_ioremap.c28
-rw-r--r--mm/filemap.c22
-rw-r--r--mm/huge_memory.c45
-rw-r--r--mm/hugetlb.c11
-rw-r--r--mm/internal.h5
-rw-r--r--mm/kasan/kasan.c4
-rw-r--r--mm/kasan/report.c1
-rw-r--r--mm/ksm.c3
-rw-r--r--mm/madvise.c9
-rw-r--r--mm/memblock.c38
-rw-r--r--mm/memcontrol.c43
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c105
-rw-r--r--mm/mempolicy.c5
-rw-r--r--mm/migrate.c17
-rw-r--r--mm/mmu_notifier.c14
-rw-r--r--mm/mprotect.c5
-rw-r--r--mm/mremap.c8
-rw-r--r--mm/nobootmem.c16
-rw-r--r--mm/page-writeback.c15
-rw-r--r--mm/page_alloc.c95
-rw-r--r--mm/page_io.c7
-rw-r--r--mm/rmap.c71
-rw-r--r--mm/shmem.c16
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slob.c6
-rw-r--r--mm/slub.c3
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c13
-rw-r--r--mm/vmscan.c13
-rw-r--r--mm/zsmalloc.c1
34 files changed, 444 insertions, 195 deletions
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 9075aa54e955..b06d9fe23a28 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
{
unsigned long flags;
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO);
+ __GFP_NOMEMALLOC | __GFP_NORETRY);
if (!page)
return NULL;
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 595b757bef72..c03ccbc405a0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
char name[16];
int u32s;
- sprintf(name, "cma-%s", cma->name);
+ scnprintf(name, sizeof(name), "cma-%s", cma->name);
tmp = debugfs_create_dir(name, cma_debugfs_root);
diff --git a/mm/debug.c b/mm/debug.c
index db1cd26d8752..5715448ab0b5 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -124,9 +124,7 @@ void dump_mm(const struct mm_struct *mm)
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
#endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
-#endif
"def_flags: %#lx(%pGv)\n",
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
@@ -158,9 +156,7 @@ void dump_mm(const struct mm_struct *mm)
#ifdef CONFIG_NUMA_BALANCING
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
#endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
- mm->tlb_flush_pending,
-#endif
+ atomic_read(&mm->tlb_flush_pending),
mm->def_flags, &mm->def_flags
);
}
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 6d5717bd7197..b1dd4a948fc0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup);
static int after_paging_init __initdata;
+pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
+ unsigned long size,
+ pgprot_t prot)
+{
+ return prot;
+}
+
void __init __weak early_ioremap_shutdown(void)
{
}
@@ -215,14 +222,29 @@ early_ioremap(resource_size_t phys_addr, unsigned long size)
void __init *
early_memremap(resource_size_t phys_addr, unsigned long size)
{
- return (__force void *)__early_ioremap(phys_addr, size,
- FIXMAP_PAGE_NORMAL);
+ pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+ FIXMAP_PAGE_NORMAL);
+
+ return (__force void *)__early_ioremap(phys_addr, size, prot);
}
#ifdef FIXMAP_PAGE_RO
void __init *
early_memremap_ro(resource_size_t phys_addr, unsigned long size)
{
- return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
+ pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+ FIXMAP_PAGE_RO);
+
+ return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#endif
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+void __init *
+early_memremap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long prot_val)
+{
+ return (__force void *)__early_ioremap(phys_addr, size,
+ __pgprot(prot_val));
}
#endif
diff --git a/mm/filemap.c b/mm/filemap.c
index 85dfe3bee324..1e01cb6e5173 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -883,6 +883,7 @@ void __init pagecache_init(void)
page_writeback_init();
}
+/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
struct wait_page_key {
struct page *page;
int bit_nr;
@@ -907,8 +908,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
if (wait_page->bit_nr != key->bit_nr)
return 0;
+
+ /* Stop walking if it's locked */
if (test_bit(key->bit_nr, &key->page->flags))
- return 0;
+ return -1;
return autoremove_wake_function(wait, mode, sync, key);
}
@@ -962,6 +965,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
int ret = 0;
init_wait(wait);
+ wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
@@ -970,10 +974,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
spin_lock_irq(&q->lock);
if (likely(list_empty(&wait->entry))) {
- if (lock)
- __add_wait_queue_entry_tail_exclusive(q, wait);
- else
- __add_wait_queue(q, wait);
+ __add_wait_queue_entry_tail(q, wait);
SetPageWaiters(page);
}
@@ -983,10 +984,6 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
if (likely(test_bit(bit_nr, &page->flags))) {
io_schedule();
- if (unlikely(signal_pending_state(state, current))) {
- ret = -EINTR;
- break;
- }
}
if (lock) {
@@ -996,6 +993,11 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
if (!test_bit(bit_nr, &page->flags))
break;
}
+
+ if (unlikely(signal_pending_state(state, current))) {
+ ret = -EINTR;
+ break;
+ }
}
finish_wait(q, wait);
@@ -1037,7 +1039,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, waiter);
+ __add_wait_queue_entry_tail(q, waiter);
SetPageWaiters(page);
spin_unlock_irqrestore(&q->lock, flags);
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86975dec0ba1..3644ff918434 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -32,6 +32,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
+#include <linux/oom.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ int ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable)) {
- mem_cgroup_cancel_charge(page, memcg, true);
- put_page(page);
- return VM_FAULT_OOM;
+ ret = VM_FAULT_OOM;
+ goto release;
}
clear_huge_page(page, haddr, HPAGE_PMD_NR);
@@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
- spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, true);
- put_page(page);
- pte_free(vma->vm_mm, pgtable);
+ goto unlock_release;
} else {
pmd_t entry;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock_release;
+
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
int ret;
@@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
}
return 0;
+unlock_release:
+ spin_unlock(vmf->ptl);
+release:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ mem_cgroup_cancel_charge(page, memcg, true);
+ put_page(page);
+ return ret;
+
}
/*
@@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
ret = 0;
set = false;
if (pmd_none(*vmf->pmd)) {
- if (userfaultfd_missing(vma)) {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret) {
+ spin_unlock(vmf->ptl);
+ } else if (userfaultfd_missing(vma)) {
spin_unlock(vmf->ptl);
ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
@@ -1496,10 +1510,25 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
}
/*
+ * Since we took the NUMA fault, we must have observed the !accessible
+ * bit. Make sure all other CPUs agree with that, to avoid them
+ * modifying the page we're about to migrate.
+ *
+ * Must be done under PTL such that we'll observe the relevant
+ * inc_tlb_flush_pending().
+ *
+ * We are not sure a pending tlb flush here is for a huge page
+ * mapping or not. Hence use the tlb range variant
+ */
+ if (mm_tlb_flush_pending(vma->vm_mm))
+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
+ /*
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
spin_unlock(vmf->ptl);
+
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc48ee783dd9..31e207cb399b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4062,9 +4062,9 @@ out:
return ret;
out_release_unlock:
spin_unlock(ptl);
-out_release_nounlock:
if (vm_shared)
unlock_page(page);
+out_release_nounlock:
put_page(page);
goto out;
}
@@ -4078,6 +4078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
+ int err = -EFAULT;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
@@ -4154,11 +4155,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
- int err = vm_fault_to_errno(ret, flags);
-
- if (err)
- return err;
-
+ err = vm_fault_to_errno(ret, flags);
remainder = 0;
break;
}
@@ -4213,7 +4210,7 @@ same_page:
*/
*position = vaddr;
- return i ? i : -EFAULT;
+ return i ? i : err;
}
#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
diff --git a/mm/internal.h b/mm/internal.h
index 24d88f084705..4ef49fc55e58 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -498,6 +498,7 @@ extern struct workqueue_struct *mm_percpu_wq;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
#else
static inline void try_to_unmap_flush(void)
{
@@ -505,7 +506,9 @@ static inline void try_to_unmap_flush(void)
static inline void try_to_unmap_flush_dirty(void)
{
}
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
extern const struct trace_print_flags pageflag_names[];
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index ca11bc4ce205..6f319fb81718 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr,
check_memory_region_inline(addr, size, write, ret_ip);
}
-void kasan_check_read(const void *p, unsigned int size)
+void kasan_check_read(const volatile void *p, unsigned int size)
{
check_memory_region((unsigned long)p, size, false, _RET_IP_);
}
EXPORT_SYMBOL(kasan_check_read);
-void kasan_check_write(const void *p, unsigned int size)
+void kasan_check_write(const volatile void *p, unsigned int size)
{
check_memory_region((unsigned long)p, size, true, _RET_IP_);
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 04bb1d3eb9ec..6bcfb01ba038 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -401,6 +401,7 @@ void kasan_report(unsigned long addr, size_t size,
disable_trace_on_warning();
info.access_addr = (void *)addr;
+ info.first_bad_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
diff --git a/mm/ksm.c b/mm/ksm.c
index 4dc92f138786..db20f8436bc3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1038,7 +1038,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
- (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) {
+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
+ mm_tlb_flush_pending(mm)) {
pte_t entry;
swapped = PageSwapCache(page);
diff --git a/mm/madvise.c b/mm/madvise.c
index 9976852f1e1c..4d7d1e5ddba9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -320,6 +320,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
@@ -367,8 +368,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
pte_offset_map_lock(mm, pmd, addr, &ptl);
goto out;
}
- put_page(page);
unlock_page(page);
+ put_page(page);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
@@ -612,6 +613,7 @@ static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
struct page *page;
+ struct zone *zone;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -645,6 +647,11 @@ static int madvise_inject_error(int behavior,
if (ret)
return ret;
}
+
+ /* Ensure that all poisoned pages are removed from per-cpu lists */
+ for_each_populated_zone(zone)
+ drain_all_pages(zone);
+
return 0;
}
#endif
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..91205780e6b1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
- phys_addr_t *addr)
-{
- if (memblock.reserved.regions == memblock_reserved_init_regions)
- return 0;
-
- *addr = __pa(memblock.reserved.regions);
-
- return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
- phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
{
- if (memblock.memory.regions == memblock_memory_init_regions)
- return 0;
+ phys_addr_t addr, size;
- *addr = __pa(memblock.memory.regions);
+ if (memblock.reserved.regions != memblock_reserved_init_regions) {
+ addr = __pa(memblock.reserved.regions);
+ size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+ __memblock_free_late(addr, size);
+ }
- return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.memory.max);
+ if (memblock.memory.regions != memblock_memory_init_regions) {
+ addr = __pa(memblock.memory.regions);
+ size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+ __memblock_free_late(addr, size);
+ }
}
-
#endif
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3df3c04d73ab..e09741af816f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1611,9 +1611,13 @@ cleanup:
* @page: the page
*
* This function protects unlocked LRU pages from being moved to
- * another cgroup and stabilizes their page->mem_cgroup binding.
+ * another cgroup.
+ *
+ * It ensures lifetime of the returned memcg. Caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
*/
-void lock_page_memcg(struct page *page)
+struct mem_cgroup *lock_page_memcg(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- */
+ *
+ * The RCU lock also protects the memcg from being freed when
+ * the page state that is going to change is the only thing
+ * preventing the page itself from being freed. E.g. writeback
+ * doesn't hold a page reference and relies on PG_writeback to
+ * keep off truncation, migration and so forth.
+ */
rcu_read_lock();
if (mem_cgroup_disabled())
- return;
+ return NULL;
again:
memcg = page->mem_cgroup;
if (unlikely(!memcg))
- return;
+ return NULL;
if (atomic_read(&memcg->moving_account) <= 0)
- return;
+ return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
@@ -1649,18 +1659,18 @@ again:
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
- return;
+ return memcg;
}
EXPORT_SYMBOL(lock_page_memcg);
/**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
*/
-void unlock_page_memcg(struct page *page)
+void __unlock_page_memcg(struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
-
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page)
rcu_read_unlock();
}
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+ __unlock_page_memcg(page->mem_cgroup);
+}
EXPORT_SYMBOL(unlock_page_memcg);
/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1cd3b3569af8..88366626c0b7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
+ arch_unmap_kpfn(pfn);
+
orig_head = hpage = compound_head(p);
num_poisoned_pages_inc();
diff --git a/mm/memory.c b/mm/memory.c
index 0e517be91a89..56e48e4593cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
+#include <linux/oom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -215,12 +216,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}
-/* tlb_gather_mmu
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @fullmm argument is used when @mm is without
- * users and we're going to destroy the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
@@ -275,10 +272,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
* Called at the end of the shootdown operation to free up any resources
* that were required.
*/
-void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+void arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
struct mmu_gather_batch *batch, *next;
+ if (force)
+ __tlb_adjust_range(tlb, start, end - start);
+
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -398,6 +399,34 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+/* tlb_gather_mmu
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @fullmm argument is used when @mm is without
+ * users and we're going to destroy the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ arch_tlb_gather_mmu(tlb, mm, start, end);
+ inc_tlb_flush_pending(tlb->mm);
+}
+
+void tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * If there are parallel threads are doing PTE changes on same range
+ * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
+ * flush by batching, a thread has stable TLB entry can fail to flush
+ * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
+ * forcefully if we detect parallel PTE batching threads.
+ */
+ bool force = mm_tlb_flush_nested(tlb->mm);
+
+ arch_tlb_finish_mmu(tlb, start, end, force);
+ dec_tlb_flush_pending(tlb->mm);
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -1197,6 +1226,7 @@ again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte = start_pte;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
@@ -2864,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
+ int ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
@@ -2896,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (!pte_none(*vmf->pte))
goto unlock;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2930,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (!pte_none(*vmf->pte))
goto release;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2949,7 +2987,7 @@ setpte:
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
+ return ret;
release:
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
@@ -3223,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
int finish_fault(struct vm_fault *vmf)
{
struct page *page;
- int ret;
+ int ret = 0;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3231,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf)
page = vmf->cow_page;
else
page = vmf->page;
- ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+ /*
+ * check even for read faults because we might have lost our CoWed
+ * page
+ */
+ if (!(vmf->vma->vm_flags & VM_SHARED))
+ ret = check_stable_address_space(vmf->vma->vm_mm);
+ if (!ret)
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
@@ -3871,19 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
- /*
- * This mm has been already reaped by the oom reaper and so the
- * refault cannot be trusted in general. Anonymous refaults would
- * lose data and give a zero page instead e.g. This is especially
- * problem for use_mm() because regular tasks will just die and
- * the corrupted data will not be visible anywhere while kthread
- * will outlive the oom victim and potentially propagate the date
- * further.
- */
- if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
- && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
- ret = VM_FAULT_SIGBUS;
-
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -3975,7 +4008,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
#endif /* __PAGETABLE_PMD_FOLDED */
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ unsigned long *start, unsigned long *end,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4002,17 +4036,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (!pmdpp)
goto out;
+ if (start && end) {
+ *start = address & PMD_MASK;
+ *end = *start + PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, *start, *end);
+ }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
+ if (start && end)
+ mmu_notifier_invalidate_range_end(mm, *start, *end);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
+ if (start && end) {
+ *start = address & PAGE_MASK;
+ *end = *start + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, *start, *end);
+ }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -4020,6 +4066,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
+ if (start && end)
+ mmu_notifier_invalidate_range_end(mm, *start, *end);
out:
return -EINVAL;
}
@@ -4031,20 +4079,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
- ptlp)));
+ !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+ ptepp, NULL, ptlp)));
return res;
}
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ unsigned long *start, unsigned long *end,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
- ptlp)));
+ !(res = __follow_pte_pmd(mm, address, start, end,
+ ptepp, pmdpp, ptlp)));
return res;
}
EXPORT_SYMBOL(follow_pte_pmd);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d911fa5cb2a7..618ab125228b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
- if (vma) {
- up_read(&current->mm->mmap_sem);
- vma = NULL;
- }
-
err = 0;
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 627671551873..e84eeb4e4356 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -41,6 +41,7 @@
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
+#include <linux/ptrace.h>
#include <asm/tlbflush.h>
@@ -1652,7 +1653,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
const int __user *, nodes,
int __user *, status, int, flags)
{
- const struct cred *cred = current_cred(), *tcred;
struct task_struct *task;
struct mm_struct *mm;
int err;
@@ -1676,14 +1676,9 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
/*
* Check if this process has the right to modify the specified
- * process. The right exists if the process has administrative
- * capabilities, superuser privileges or the same
- * userid as the target process.
+ * process. Use the regular "ptrace_may_access()" checks.
*/
- tcred = __task_cred(task);
- if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
- !capable(CAP_SYS_NICE)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out;
@@ -1937,12 +1932,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
put_page(new_page);
goto out_fail;
}
- /*
- * We are not sure a pending tlb flush here is for a huge page
- * mapping or not. Hence use the tlb range variant
- */
- if (mm_tlb_flush_pending(mm))
- flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
__SetPageLocked(new_page);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 54ca54562928..314285284e6e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -174,20 +174,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}
-void __mmu_notifier_invalidate_page(struct mm_struct *mm,
- unsigned long address)
-{
- struct mmu_notifier *mn;
- int id;
-
- id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->invalidate_page)
- mn->ops->invalidate_page(mn, mm, address);
- }
- srcu_read_unlock(&srcu, id);
-}
-
void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 1a8c9ca83e48..bd0f409922cb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
atomic_read(&vma->vm_mm->mm_users) == 1)
target_node = numa_node_id();
+ flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@@ -243,7 +244,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- set_tlb_flush_pending(mm);
+ inc_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -255,7 +256,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
- clear_tlb_flush_pending(mm);
+ dec_tlb_flush_pending(mm);
return pages;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index cd8a1b199ef9..3f23715d3c69 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
@@ -428,6 +429,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
struct vm_userfaultfd_ctx *uf,
+ struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
struct mm_struct *mm = current->mm;
@@ -446,7 +448,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
- ret = do_munmap(mm, new_addr, new_len, NULL);
+ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
goto out;
@@ -514,6 +516,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long charged = 0;
bool locked = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+ LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
@@ -541,7 +544,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap);
+ &locked, &uf, &uf_unmap_early, &uf_unmap);
goto out;
}
@@ -621,6 +624,7 @@ out:
up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
+ userfaultfd_unmap_complete(mm, &uf_unmap_early);
mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
userfaultfd_unmap_complete(mm, &uf_unmap);
return ret;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 36454d0f96ee..3637809a18d0 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void)
NULL)
count += __free_memory_core(start, end);
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
- {
- phys_addr_t size;
-
- /* Free memblock.reserved array if it was allocated */
- size = get_allocated_memblock_reserved_regions_info(&start);
- if (size)
- count += __free_memory_core(start, start + size);
-
- /* Free memblock.memory array if it was allocated */
- size = get_allocated_memblock_memory_regions_info(&start);
- if (size)
- count += __free_memory_core(start, start + size);
- }
-#endif
-
return count;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 96e93b214d31..bf050ab025b7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
int ret;
- lock_page_memcg(page);
+ memcg = lock_page_memcg(page);
+ lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
+ /*
+ * NOTE: Page might be free now! Writeback doesn't hold a page
+ * reference on its own, it relies on truncation to wait for
+ * the clearing of PG_writeback. The below can only access
+ * page state that is static across allocation cycles.
+ */
if (ret) {
- dec_lruvec_page_state(page, NR_WRITEBACK);
+ dec_lruvec_state(lruvec, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
- unlock_page_memcg(page);
+ __unlock_page_memcg(memcg);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d30e914afb6..9327a940e373 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,8 @@
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/lockdep.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -1584,6 +1586,10 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+ /* Discard memblock private memory */
+ memblock_discard();
+#endif
for_each_populated_zone(zone)
set_zone_contiguous(zone);
@@ -2531,9 +2537,14 @@ void drain_all_pages(struct zone *zone)
#ifdef CONFIG_HIBERNATION
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
void mark_free_pages(struct zone *zone)
{
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
unsigned long flags;
unsigned int order, t;
struct page *page;
@@ -2548,6 +2559,11 @@ void mark_free_pages(struct zone *zone)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
if (page_zone(page) != zone)
continue;
@@ -2561,8 +2577,13 @@ void mark_free_pages(struct zone *zone)
unsigned long i;
pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++)
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
}
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -3271,10 +3292,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
- * we're still under heavy pressure.
+ * we're still under heavy pressure. But make sure that this reclaim
+ * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+ * allocation which will never fail due to oom_lock already held.
*/
- page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+ page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
+ ~__GFP_DIRECT_RECLAIM, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
@@ -3490,6 +3514,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
}
#endif /* CONFIG_COMPACTION */
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+ STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* this guy won't enter reclaim */
+ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
+
+ /* We're only interested __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return false;
+
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return false;
+
+ return true;
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_acquire(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_release(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3504,7 +3569,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
noreclaim_flag = memalloc_noreclaim_save();
- lockdep_set_current_reclaim_state(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3512,7 +3577,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
ac->nodemask);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
+ fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
cond_resched();
@@ -4041,7 +4106,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags |= ALLOC_CPUSET;
}
- lockdep_trace_alloc(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
+ fs_reclaim_release(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
@@ -4458,8 +4524,9 @@ long si_mem_available(void)
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+ available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+ wmark_low);
if (available < 0)
available = 0;
@@ -4602,8 +4669,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_SLAB_RECLAIMABLE),
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state(NR_SLAB_RECLAIMABLE),
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
@@ -4891,9 +4958,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
+ mem_hotplug_begin();
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
+ mem_hotplug_done();
}
}
out:
@@ -7666,7 +7735,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_info("%s: [%lx, %lx) PFNs busy\n",
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
goto done;
diff --git a/mm/page_io.c b/mm/page_io.c
index b6c4ac388209..5f61b54ee1f3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -22,6 +22,7 @@
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
+#include <linux/sched/task.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -136,6 +137,7 @@ out:
WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
wake_up_process(waiter);
+ put_task_struct(waiter);
}
int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -378,6 +380,11 @@ int swap_readpage(struct page *page, bool do_poll)
goto out;
}
bdev = bio->bi_bdev;
+ /*
+ * Keep this task valid during swap readpage because the oom killer may
+ * attempt to access it in the page fault retry time check.
+ */
+ get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
diff --git a/mm/rmap.c b/mm/rmap.c
index ced14f1af6dc..c570f82e6827 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -605,6 +605,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
tlb_ubc->flush_required = true;
/*
+ * Ensure compiler does not re-order the setting of tlb_flush_batched
+ * before the PTE is cleared.
+ */
+ barrier();
+ mm->tlb_flush_batched = true;
+
+ /*
* If the PTE was dirty then it's best to assume it's writable. The
* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
* before the page is queued for IO.
@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
return should_defer;
}
+
+/*
+ * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and mumap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ if (mm->tlb_flush_batched) {
+ flush_tlb_mm(mm);
+
+ /*
+ * Do not allow the compiler to re-order the clearing of
+ * tlb_flush_batched before the tlb is flushed.
+ */
+ barrier();
+ mm->tlb_flush_batched = false;
+ }
+}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
@@ -851,11 +887,21 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
.address = address,
.flags = PVMW_SYNC,
};
+ unsigned long start = address, end;
int *cleaned = arg;
+ /*
+ * We have to assume the worse case ie pmd for invalidation. Note that
+ * the page can not be free from this function.
+ */
+ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+
while (page_vma_mapped_walk(&pvmw)) {
+ unsigned long cstart, cend;
int ret = 0;
- address = pvmw.address;
+
+ cstart = address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -868,6 +914,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
+ cend = cstart + PAGE_SIZE;
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -882,6 +929,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
+ cstart &= PMD_MASK;
+ cend = cstart + PMD_SIZE;
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -890,11 +939,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
if (ret) {
- mmu_notifier_invalidate_page(vma->vm_mm, address);
+ mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
(*cleaned)++;
}
}
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+
return true;
}
@@ -1288,6 +1339,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
struct page *subpage;
bool ret = true;
+ unsigned long start = address, end;
enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
@@ -1299,6 +1351,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
flags & TTU_MIGRATION, page);
}
+ /*
+ * We have to assume the worse case ie pmd for invalidation. Note that
+ * the page can not be free in this function as call of try_to_unmap()
+ * must hold a reference on the page.
+ */
+ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+
while (page_vma_mapped_walk(&pvmw)) {
/*
* If the page is mlock()d, we cannot swap it out.
@@ -1409,6 +1469,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
WARN_ON_ONCE(1);
ret = false;
+ /* We have to invalidate as we cleared the pte */
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1454,8 +1515,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
discard:
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
- mmu_notifier_invalidate_page(mm, address);
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
}
+
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+
return ret;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index b0aa6075d164..fbcb3c96a186 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1022,7 +1022,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
*/
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
spin_lock(&sbinfo->shrinklist_lock);
- if (list_empty(&info->shrinklist)) {
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
list_add_tail(&info->shrinklist,
&sbinfo->shrinklist);
sbinfo->shrinklist_len++;
@@ -1817,7 +1821,11 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
* to shrink under memory pressure.
*/
spin_lock(&sbinfo->shrinklist_lock);
- if (list_empty(&info->shrinklist)) {
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
list_add_tail(&info->shrinklist,
&sbinfo->shrinklist);
sbinfo->shrinklist_len++;
@@ -3959,7 +3967,7 @@ int __init shmem_init(void)
}
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
- if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
+ if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
else
shmem_huge = 0; /* just in case it was patched */
@@ -4020,7 +4028,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
return -EINVAL;
shmem_huge = huge;
- if (shmem_huge < SHMEM_HUGE_DENY)
+ if (shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
return count;
}
diff --git a/mm/slab.h b/mm/slab.h
index 6885e1192ec5..073362816acc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -43,6 +43,7 @@ struct kmem_cache {
#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/random.h>
+#include <linux/sched/mm.h>
/*
* State of the slab allocator.
@@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
gfp_t flags)
{
flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
+
+ fs_reclaim_acquire(flags);
+ fs_reclaim_release(flags);
+
might_sleep_if(gfpflags_allow_blocking(flags));
if (should_failslab(s, flags))
diff --git a/mm/slob.c b/mm/slob.c
index 1bae78d71096..a8bd6fa11a66 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
gfp &= gfp_allowed_mask;
- lockdep_trace_alloc(gfp);
+ fs_reclaim_acquire(gfp);
+ fs_reclaim_release(gfp);
if (size < PAGE_SIZE - align) {
if (!size)
@@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
+ fs_reclaim_acquire(flags);
+ fs_reclaim_release(flags);
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node);
diff --git a/mm/slub.c b/mm/slub.c
index 1d3f9835f4ea..e8b4e31162ca 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5642,13 +5642,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
* A cache is never shut down before deactivation is
* complete, so no need to worry about synchronization.
*/
- return;
+ goto out;
#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
+out:
kobject_put(&s->kobj);
}
diff --git a/mm/util.c b/mm/util.c
index 7b07ec852e01..9ecddf568fe3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -633,7 +633,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
- free += global_page_state(NR_SLAB_RECLAIMABLE);
+ free += global_node_page_state(NR_SLAB_RECLAIMABLE);
/*
* Leave reserved pages. The pages are not for anonymous pages.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8698c1c86c4d..a47e3894c775 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1671,7 +1671,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
+ const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+ const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
+ 0 :
+ __GFP_HIGHMEM;
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
@@ -1679,7 +1682,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
+ pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
PAGE_KERNEL, node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
@@ -1700,9 +1703,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask);
+ page = alloc_page(alloc_mask|highmem_mask);
else
- page = alloc_pages_node(node, alloc_mask, 0);
+ page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -1710,7 +1713,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
area->pages[i] = page;
- if (gfpflags_allow_blocking(gfp_mask))
+ if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a1af041930a6..f957afe900ec 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3525,8 +3525,6 @@ static int kswapd(void *p)
};
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- lockdep_set_current_reclaim_state(GFP_KERNEL);
-
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
@@ -3585,14 +3583,15 @@ kswapd_try_sleep:
*/
trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
alloc_order);
+ fs_reclaim_acquire(GFP_KERNEL);
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+ fs_reclaim_release(GFP_KERNEL);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
return 0;
}
@@ -3655,14 +3654,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
unsigned int noreclaim_flag;
noreclaim_flag = memalloc_noreclaim_save();
- lockdep_set_current_reclaim_state(sc.gfp_mask);
+ fs_reclaim_acquire(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
p->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
+ fs_reclaim_release(sc.gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
return nr_reclaimed;
@@ -3847,7 +3846,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- lockdep_set_current_reclaim_state(sc.gfp_mask);
+ fs_reclaim_acquire(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3862,9 +3861,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
}
p->reclaim_state = NULL;
+ fs_reclaim_release(gfp_mask);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
- lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 013eea76685e..308acb9d814b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2453,7 +2453,6 @@ void zs_destroy_pool(struct zs_pool *pool)
}
destroy_cache(pool);
- kfree(pool->size_class);
kfree(pool->name);
kfree(pool);
}