diff options
author | Mauro Carvalho Chehab <mchehab@s-opensource.com> | 2016-10-05 16:42:36 -0300 |
---|---|---|
committer | Mauro Carvalho Chehab <mchehab@s-opensource.com> | 2016-10-05 16:43:53 -0300 |
commit | 9fce0c226536fc36c7fb0a80000ca38a995be43e (patch) | |
tree | d1d03d4a831bb143ffd401cefa0f44f636ac0084 /mm | |
parent | 3cc2691227203c00cac1d82d6b0772224d5c87b2 (diff) | |
parent | c8d2bc9bc39ebea8437fd974fdbc21847bb897a3 (diff) |
Merge tag 'v4.8' into patchwork
Linux 4.8
* tag 'v4.8': (1761 commits)
Linux 4.8
ARM: 8618/1: decompressor: reset ttbcr fields to use TTBR0 on ARMv7
MIPS: CM: Fix mips_cm_max_vp_width for non-MT kernels on MT systems
include/linux/property.h: fix typo/compile error
ocfs2: fix deadlock on mmapped page in ocfs2_write_begin_nolock()
mm: workingset: fix crash in shadow node shrinker caused by replace_page_cache_page()
MAINTAINERS: Switch to kernel.org email address for Javi Merino
x86/entry/64: Fix context tracking state warning when load_gs_index fails
x86/boot: Initialize FPU and X86_FEATURE_ALWAYS even if we don't have CPUID
x86/vdso: Fix building on big endian host
x86/boot: Fix another __read_cr4() case on 486
sctp: fix the issue sctp_diag uses lock_sock in rcu_read_lock
sctp: change to check peer prsctp_capable when using prsctp polices
sctp: remove prsctp_param from sctp_chunk
sctp: move sent_count to the memory hole in sctp_chunk
tg3: Avoid NULL pointer dereference in tg3_io_error_detected()
x86/init: Fix cr4_init_shadow() on CR4-less machines
MIPS: Fix detection of unsupported highmem with cache aliases
MIPS: Malta: Fix IOCU disable switch read for MIPS64
MIPS: Fix BUILD_ROLLBACK_PROLOGUE for microMIPS
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 9 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/debug.c | 6 | ||||
-rw-r--r-- | mm/filemap.c | 114 | ||||
-rw-r--r-- | mm/huge_memory.c | 14 | ||||
-rw-r--r-- | mm/hugetlb.c | 1 | ||||
-rw-r--r-- | mm/kasan/quarantine.c | 7 | ||||
-rw-r--r-- | mm/khugepaged.c | 25 | ||||
-rw-r--r-- | mm/ksm.c | 3 | ||||
-rw-r--r-- | mm/memcontrol.c | 113 | ||||
-rw-r--r-- | mm/memory.c | 12 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 10 | ||||
-rw-r--r-- | mm/mempolicy.c | 17 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 121 | ||||
-rw-r--r-- | mm/page_io.c | 3 | ||||
-rw-r--r-- | mm/readahead.c | 9 | ||||
-rw-r--r-- | mm/rmap.c | 7 | ||||
-rw-r--r-- | mm/shmem.c | 9 | ||||
-rw-r--r-- | mm/slab.c | 30 | ||||
-rw-r--r-- | mm/slub.c | 46 | ||||
-rw-r--r-- | mm/swapfile.c | 1 | ||||
-rw-r--r-- | mm/usercopy.c | 280 | ||||
-rw-r--r-- | mm/vmscan.c | 41 | ||||
-rw-r--r-- | mm/workingset.c | 10 |
25 files changed, 658 insertions, 236 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 78a23c5c302d..be0ee11fa0d9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,7 +262,14 @@ config COMPACTION select MIGRATION depends on MMU help - Allows the compaction of memory for the allocation of huge pages. + Compaction is the only memory management component to form + high order (larger physically contiguous) memory blocks + reliably. The page allocator relies on compaction heavily and + the lack of the feature can lead to unexpected OOM killer + invocations for high order memory requests. You shouldn't + disable this option unless there really is a strong reason for + it and then we would be really interested to hear about that at + linux-mm@kvack.org. # # support for page migration diff --git a/mm/Makefile b/mm/Makefile index fc059666c760..2ca1faf3fa09 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n +# Since __builtin_frame_address does work as used, disable the warning. +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/debug.c b/mm/debug.c index 8865bfb41b0b..74c7cae4f683 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,9 +42,11 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + int mapcount = PageSlab(page) ? 0 : page_mapcount(page); + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", - page, page_ref_count(page), page_mapcount(page), - page->mapping, page->index); + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page)); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); diff --git a/mm/filemap.c b/mm/filemap.c index 8a287dfc5372..2d0986a64f1f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -110,6 +110,62 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) +{ + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, 0, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + + mapping->nrexceptional--; + if (!dax_mapping(mapping)) { + if (shadowp) + *shadowp = p; + if (node) + workingset_node_shadows_dec(node); + } else { + /* DAX can replace empty locked entry with a hole */ + WARN_ON_ONCE(p != + (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK)); + /* DAX accounts exceptional entries as normal pages */ + if (node) + workingset_node_pages_dec(node); + /* Wakeup waiters for exceptional entry lock */ + dax_wake_mapping_entry_waiter(mapping, page->index, + false); + } + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; +} + static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { @@ -561,7 +617,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(old, NULL); - error = radix_tree_insert(&mapping->page_tree, offset, new); + error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); mapping->nrpages++; @@ -584,62 +640,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->page_tree, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); - if (!radix_tree_exceptional_entry(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (!dax_mapping(mapping)) { - if (shadowp) - *shadowp = p; - if (node) - workingset_node_shadows_dec(node); - } else { - /* DAX can replace empty locked entry with a hole */ - WARN_ON_ONCE(p != - (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); - /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, - false); - } - } - radix_tree_replace_slot(slot, page); - mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } - return 0; -} - static int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2373f0a7d340..53ae6d00656a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1078,7 +1078,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto out; page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd); if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { @@ -1116,7 +1116,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, } skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); if (flags & FOLL_GET) get_page(page); @@ -1138,9 +1138,6 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - /* A PROT_NONE fault should not end up here */ - BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); if (unlikely(!pmd_same(pmd, *fe->pmd))) goto out_unlock; @@ -1512,7 +1509,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty; + bool young, write, dirty, soft_dirty; unsigned long addr; int i; @@ -1546,6 +1543,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); + soft_dirty = pmd_soft_dirty(*pmd); pmdp_huge_split_prepare(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -1562,6 +1560,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); + if (soft_dirty) + entry = pte_swp_mksoft_dirty(entry); } else { entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(entry, vma); @@ -1569,6 +1569,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); } if (dirty) SetPageDirty(page + i); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b9aa1b0b38b0..87e11d8ad536 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1448,6 +1448,7 @@ static void dissolve_free_huge_page(struct page *page) list_del(&page->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; + h->max_huge_pages--; update_and_free_page(h, page); } spin_unlock(&hugetlb_lock); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index b6728a33a4ac..baabaad4a4aa 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -217,11 +217,8 @@ void quarantine_reduce(void) new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); - if (WARN_ONCE(new_quarantine_size < percpu_quarantines, - "Too little memory, disabling global KASAN quarantine.\n")) - new_quarantine_size = 0; - else - new_quarantine_size -= percpu_quarantines; + new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? + 0 : new_quarantine_size - percpu_quarantines; WRITE_ONCE(quarantine_size, new_quarantine_size); last = global_quarantine.head; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 79c52d0061af..728d7790dc2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -838,7 +838,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) * value (scan code). */ -static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + struct vm_area_struct **vmap) { struct vm_area_struct *vma; unsigned long hstart, hend; @@ -846,7 +847,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) if (unlikely(khugepaged_test_exit(mm))) return SCAN_ANY_PROCESS; - vma = find_vma(mm, address); + *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; @@ -881,6 +882,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pmd = pmd, }; + /* we only decide to swapin, if there is enough young ptes */ + if (referenced < HPAGE_PMD_NR/2) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return false; + } fe.pte = pte_offset_map(pmd, address); for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; fe.pte++, fe.address += PAGE_SIZE) { @@ -888,17 +894,12 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, if (!is_swap_pte(pteval)) continue; swapped_in++; - /* we only decide to swapin, if there is enough young ptes */ - if (referenced < HPAGE_PMD_NR/2) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } ret = do_swap_page(&fe, pteval); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address)) { + if (hugepage_vma_revalidate(mm, address, &fe.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -923,7 +924,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma, int node, int referenced) { pmd_t *pmd, _pmd; @@ -933,6 +933,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; struct mem_cgroup *memcg; + struct vm_area_struct *vma; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t gfp; @@ -961,7 +962,7 @@ static void collapse_huge_page(struct mm_struct *mm, } down_read(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address); + result = hugepage_vma_revalidate(mm, address, &vma); if (result) { mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); @@ -994,7 +995,7 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ down_write(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address); + result = hugepage_vma_revalidate(mm, address, &vma); if (result) goto out; /* check if the pmd is still valid */ @@ -1202,7 +1203,7 @@ out_unmap: if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma, node, referenced); + collapse_huge_page(mm, address, hpage, node, referenced); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -283,7 +283,8 @@ static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; - rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | + __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 66beca1ad92f..4be518d4e68a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1740,17 +1740,22 @@ static DEFINE_MUTEX(percpu_charge_mutex); static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; + unsigned long flags; bool ret = false; if (nr_pages > CHARGE_BATCH) return ret; - stock = &get_cpu_var(memcg_stock); + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; ret = true; } - put_cpu_var(memcg_stock); + + local_irq_restore(flags); + return ret; } @@ -1771,15 +1776,18 @@ static void drain_stock(struct memcg_stock_pcp *stock) stock->cached = NULL; } -/* - * This must be called under preempt disabled or must be called by - * a thread which is pinned to local cpu. - */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + + local_irq_restore(flags); } /* @@ -1788,14 +1796,19 @@ static void drain_local_stock(struct work_struct *dummy) */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { - struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); stock->cached = memcg; } stock->nr_pages += nr_pages; - put_cpu_var(memcg_stock); + + local_irq_restore(flags); } /* @@ -2337,8 +2350,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) return 0; memcg = get_mem_cgroup_from_mm(current->mm); - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (!ret) + __SetPageKmemcg(page); + } css_put(&memcg->css); return ret; } @@ -2365,6 +2381,11 @@ void memcg_kmem_uncharge(struct page *page, int order) page_counter_uncharge(&memcg->memsw, nr_pages); page->mem_cgroup = NULL; + + /* slab pages do not have PageKmemcg flag set */ + if (PageKmemcg(page)) + __ClearPageKmemcg(page); + css_put_many(&memcg->css, nr_pages); } #endif /* !CONFIG_SLOB */ @@ -4069,14 +4090,14 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); -static void mem_cgroup_id_get(struct mem_cgroup *memcg) +static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - atomic_inc(&memcg->id.ref); + atomic_add(n, &memcg->id.ref); } -static void mem_cgroup_id_put(struct mem_cgroup *memcg) +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - if (atomic_dec_and_test(&memcg->id.ref)) { + if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4085,6 +4106,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg) } } +static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + mem_cgroup_id_get_many(memcg, 1); +} + +static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + mem_cgroup_id_put_many(memcg, 1); +} + /** * mem_cgroup_from_id - look up a memcg from a memcg id * @id: the memcg id to look up @@ -4719,6 +4750,8 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + mem_cgroup_id_put_many(mc.from, mc.moved_swap); + /* * we charged both to->memory and to->memsw, so we * should uncharge to->memory. @@ -4726,9 +4759,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - css_put_many(&mc.from->css, mc.moved_swap); + mem_cgroup_id_get_many(mc.to, mc.moved_swap); + css_put_many(&mc.to->css, mc.moved_swap); - /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); @@ -5537,8 +5570,10 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; pgpgout++; - } else + } else { nr_kmem += 1 << compound_order(page); + __ClearPageKmemcg(page); + } page->mem_cgroup = NULL; } while (next != page_list); @@ -5781,6 +5816,24 @@ static int __init mem_cgroup_init(void) subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + /** * mem_cgroup_swapout - transfer a memsw charge to swap * @page: page whose memsw charge to transfer @@ -5790,7 +5843,7 @@ subsys_initcall(mem_cgroup_init); */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *swap_memcg; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5805,16 +5858,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; - mem_cgroup_id_get(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. + */ + swap_memcg = mem_cgroup_id_get_online(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(swap_memcg, true); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, 1); + } + /* * Interrupts should be disabled here because the caller holds the * mapping->tree_lock lock which is taken with interrupts-off. It is @@ -5853,11 +5917,14 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + memcg = mem_cgroup_id_get_online(memcg); + if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) + !page_counter_try_charge(&memcg->swap, 1, &counter)) { + mem_cgroup_id_put(memcg); return -ENOMEM; + } - mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); diff --git a/mm/memory.c b/mm/memory.c index 83be99d9d8a1..793fe0f9841c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,9 +3351,6 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) bool was_writable = pte_write(pte); int flags = 0; - /* A PROT_NONE fault should not end up here */ - BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); - /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but @@ -3458,6 +3455,11 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) return VM_FAULT_FALLBACK; } +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3524,7 +3526,7 @@ static int handle_pte_fault(struct fault_env *fe) if (!pte_present(entry)) return do_swap_page(fe, entry); - if (pte_protnone(entry)) + if (pte_protnone(entry) && vma_is_accessible(fe->vma)) return do_numa_page(fe, entry); fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); @@ -3590,7 +3592,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { - if (pmd_protnone(orig_pmd)) + if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&fe, orig_pmd); if ((fe.flags & FAULT_FLAG_WRITE) && diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3894b65b1555..9d29ba0f7192 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1219,6 +1219,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_node(nid, zones_size, start_pfn, zholes_size); + pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* * The node we allocated has no zone fallback lists. For avoiding @@ -1249,6 +1250,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) static void rollback_node_hotadd(int nid, pg_data_t *pgdat) { arch_refresh_nodedata(nid, NULL); + free_percpu(pgdat->per_cpu_nodestats); arch_free_nodedata(pgdat); return; } @@ -1553,8 +1555,8 @@ static struct page *new_node_page(struct page *page, unsigned long private, { gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; int nid = page_to_nid(page); - nodemask_t nmask = node_online_map; - struct page *new_page; + nodemask_t nmask = node_states[N_MEMORY]; + struct page *new_page = NULL; /* * TODO: allocate a destination hugepage from a nearest neighbor node, @@ -1566,11 +1568,13 @@ static struct page *new_node_page(struct page *page, unsigned long private, next_node_in(nid, nmask)); node_clear(nid, nmask); + if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) gfp_mask |= __GFP_HIGHMEM; - new_page = __alloc_pages_nodemask(gfp_mask, 0, + if (!nodes_empty(nmask)) + new_page = __alloc_pages_nodemask(gfp_mask, 0, node_zonelist(nid, gfp_mask), &nmask); if (!new_page) new_page = __alloc_pages(gfp_mask, 0, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8c4e38fb5f4..2da72a5b6ecc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2336,6 +2336,23 @@ out: return ret; } +/* + * Drop the (possibly final) reference to task->mempolicy. It needs to be + * dropped after task->mempolicy is set to NULL so that any allocation done as + * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed + * policy. + */ +void mpol_put_task_policy(struct task_struct *task) +{ + struct mempolicy *pol; + + task_lock(task); + pol = task->mempolicy; + task->mempolicy = NULL; + task_unlock(task); + mpol_put(pol); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7d0a275df822..d53a9aa00977 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -764,7 +764,7 @@ bool task_will_free_mem(struct task_struct *task) { struct mm_struct *mm = task->mm; struct task_struct *p; - bool ret; + bool ret = true; /* * Skip tasks without mm because it might have passed its exit_mm and diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fb975cec3518..a2214c64ed3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1008,10 +1008,8 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageKmemcg(page)) { + if (memcg_kmem_enabled() && PageKmemcg(page)) memcg_kmem_uncharge(page, order); - __ClearPageKmemcg(page); - } if (check_free) bad += free_pages_check(page); if (bad) @@ -3139,54 +3137,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } -static inline bool -should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, - enum compact_result compact_result, - enum compact_priority *compact_priority, - int compaction_retries) -{ - int max_retries = MAX_COMPACT_RETRIES; - - if (!order) - return false; - - /* - * compaction considers all the zone as desperately out of memory - * so it doesn't really make much sense to retry except when the - * failure could be caused by insufficient priority - */ - if (compaction_failed(compact_result)) { - if (*compact_priority > MIN_COMPACT_PRIORITY) { - (*compact_priority)--; - return true; - } - return false; - } - - /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But do not retry if the given zonelist is not suitable for - * compaction. - */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); - - /* - * !costly requests are much more important than __GFP_REPEAT - * costly ones because they are de facto nofail and invoke OOM - * killer to move on while costly can fail and users are ready - * to cope with that. 1/4 retries is rather arbitrary but we - * would need much more detailed feedback from compaction to - * make a better decision. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - max_retries /= 4; - if (compaction_retries <= max_retries) - return true; - - return false; -} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, @@ -3197,6 +3147,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } +#endif /* CONFIG_COMPACTION */ + static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, @@ -3223,7 +3175,6 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } return false; } -#endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ static int @@ -3756,12 +3707,10 @@ no_zone: } out: - if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) { - if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) { - __free_pages(page, order); - page = NULL; - } else - __SetPageKmemcg(page); + if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && + unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + __free_pages(page, order); + page = NULL; } if (kmemcheck_enabled && page) @@ -4064,7 +4013,7 @@ long si_mem_available(void) int lru; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_page_state(NR_LRU_BASE + lru); + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += zone->watermark[WMARK_LOW]; @@ -4411,7 +4360,7 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, do { zone_type--; zone = pgdat->node_zones + zone_type; - if (populated_zone(zone)) { + if (managed_zone(zone)) { zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); @@ -4649,7 +4598,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) for (j = 0; j < nr_nodes; j++) { node = node_order[j]; z = &NODE_DATA(node)->node_zones[zone_type]; - if (populated_zone(z)) { + if (managed_zone(z)) { zoneref_set_zone(z, &zonelist->_zonerefs[pos++]); check_highest_zone(zone_type); @@ -4761,6 +4710,8 @@ int local_memory_node(int node) } #endif +static void setup_min_unmapped_ratio(void); +static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ static void set_zonelist_order(void) @@ -5882,9 +5833,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio) - / 100; - pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; zone->zone_pgdat = pgdat; @@ -6805,6 +6753,12 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); + +#ifdef CONFIG_NUMA + setup_min_unmapped_ratio(); + setup_min_slab_ratio(); +#endif + return 0; } core_initcall(init_per_zone_wmark_min) @@ -6846,43 +6800,58 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, } #ifdef CONFIG_NUMA +static void setup_min_unmapped_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + + for_each_online_pgdat(pgdat) + pgdat->min_unmapped_pages = 0; + + for_each_zone(zone) + zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * + sysctl_min_unmapped_ratio) / 100; +} + + int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct pglist_data *pgdat; - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; + setup_min_unmapped_ratio(); + + return 0; +} + +static void setup_min_slab_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + for_each_online_pgdat(pgdat) pgdat->min_slab_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; - return 0; + zone->zone_pgdat->min_slab_pages += (zone->managed_pages * + sysctl_min_slab_ratio) / 100; } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct pglist_data *pgdat; - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; - for_each_online_pgdat(pgdat) - pgdat->min_slab_pages = 0; + setup_min_slab_ratio(); - for_each_zone(zone) - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * - sysctl_min_slab_ratio) / 100; return 0; } #endif diff --git a/mm/page_io.c b/mm/page_io.c index 16bd82fad38c..eafe5ddc2b54 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -264,6 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, int ret; struct swap_info_struct *sis = page_swap_info(page); + BUG_ON(!PageSwapCache(page)); if (sis->flags & SWP_FILE) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; @@ -337,6 +338,7 @@ int swap_readpage(struct page *page) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); + BUG_ON(!PageSwapCache(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -386,6 +388,7 @@ int swap_set_page_dirty(struct page *page) if (sis->flags & SWP_FILE) { struct address_space *mapping = sis->swap_file->f_mapping; + BUG_ON(!PageSwapCache(page)); return mapping->a_ops->set_page_dirty(page); } else { return __set_page_dirty_no_writeback(page); diff --git a/mm/readahead.c b/mm/readahead.c index 65ec288dc057..c8a955b1297e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,6 +8,7 @@ */ #include <linux/kernel.h> +#include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/blkdev.h> @@ -544,6 +545,14 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops) return -EINVAL; + /* + * Readahead doesn't make sense for DAX inodes, but we don't want it + * to report a failure either. Instead, we just return success and + * don't do any work. + */ + if (dax_mapping(mapping)) + return 0; + return force_page_cache_readahead(mapping, filp, index, nr); } diff --git a/mm/rmap.c b/mm/rmap.c index 709bc83703b1..1ef36404e7b2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1284,8 +1284,9 @@ void page_add_file_rmap(struct page *page, bool compound) VM_BUG_ON_PAGE(!PageSwapBacked(page), page); __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); } else { - if (PageTransCompound(page)) { - VM_BUG_ON_PAGE(!PageLocked(page), page); + if (PageTransCompound(page) && page_mapping(page)) { + VM_WARN_ON_ONCE(!PageLocked(page)); + SetPageDoubleMap(compound_head(page)); if (PageMlocked(page)) clear_page_mlock(compound_head(page)); @@ -1303,7 +1304,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) { int i, nr = 1; - VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); + VM_BUG_ON_PAGE(compound && !PageHead(page), page); lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ diff --git a/mm/shmem.c b/mm/shmem.c index 7f7748a0f9e1..971fc83e6402 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -270,7 +270,7 @@ bool shmem_charge(struct inode *inode, long pages) info->alloced -= pages; shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); - + shmem_unacct_blocks(info->flags, pages); return false; } percpu_counter_add(&sbinfo->used_blocks, pages); @@ -291,6 +291,7 @@ void shmem_uncharge(struct inode *inode, long pages) if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); + shmem_unacct_blocks(info->flags, pages); } /* @@ -1980,7 +1981,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, return addr; sb = shm_mnt->mnt_sb; } - if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) return addr; } @@ -3975,7 +3976,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE bool shmem_huge_enabled(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); @@ -4006,7 +4009,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) return false; } } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ #else /* !CONFIG_SHMEM */ diff --git a/mm/slab.c b/mm/slab.c index 261147ba156f..b67271024135 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4441,6 +4441,36 @@ static int __init slab_proc_init(void) module_init(slab_proc_init); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + unsigned long offset; + + /* Find and validate object. */ + cachep = page->slab_cache; + objnr = obj_to_index(cachep, page, (void *)ptr); + BUG_ON(objnr >= cachep->num); + + /* Find offset within object. */ + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + + /* Allow address range falling entirely within object size. */ + if (offset <= cachep->object_size && n <= cachep->object_size - offset) + return NULL; + + return cachep->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + /** * ksize - get the actual amount of memory allocated for a given object * @objp: Pointer to the object diff --git a/mm/slub.c b/mm/slub.c index 850737bdfbd8..9adae58462f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3629,6 +3629,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { + LIST_HEAD(discard); struct page *page, *h; BUG_ON(irqs_disabled()); @@ -3636,13 +3637,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { remove_partial(n, page); - discard_slab(s, page); + list_add(&page->lru, &discard); } else { list_slab_objects(s, page, "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); + + list_for_each_entry_safe(page, h, &discard, lru) + discard_slab(s, page); } /* @@ -3764,6 +3768,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_node); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *s; + unsigned long offset; + size_t object_size; + + /* Find object and usable object size. */ + s = page->slab_cache; + object_size = slab_ksize(s); + + /* Reject impossible pointers. */ + if (ptr < page_address(page)) + return s->name; + + /* Find offset within object. */ + offset = (ptr - page_address(page)) % s->size; + + /* Adjust for redzone and reject if within the redzone. */ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) { + if (offset < s->red_left_pad) + return s->name; + offset -= s->red_left_pad; + } + + /* Allow address range falling entirely within object size. */ + if (offset <= object_size && n <= object_size - offset) + return NULL; + + return s->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + static size_t __ksize(const void *object) { struct page *page; diff --git a/mm/swapfile.c b/mm/swapfile.c index 78cfa292a29a..2657accc6e2b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2724,7 +2724,6 @@ int swapcache_prepare(swp_entry_t entry) struct swap_info_struct *page_swap_info(struct page *page) { swp_entry_t swap = { .val = page_private(page) }; - BUG_ON(!PageSwapCache(page)); return swap_info[swp_type(swap)]; } diff --git a/mm/usercopy.c b/mm/usercopy.c new file mode 100644 index 000000000000..3c8da0af9695 --- /dev/null +++ b/mm/usercopy.c @@ -0,0 +1,280 @@ +/* + * This implements the various checks for CONFIG_HARDENED_USERCOPY*, + * which are designed to protect kernel memory from needless exposure + * and overwrite under many unintended conditions. This code is based + * on PAX_USERCOPY, which is: + * + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source + * Security Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/slab.h> +#include <asm/sections.h> + +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + +/* + * Checks if a given pointer and length is contained by the current + * stack frame (if possible). + * + * Returns: + * NOT_STACK: not at all on the stack + * GOOD_FRAME: fully within a valid stack frame + * GOOD_STACK: fully on the stack (when can't do frame-checking) + * BAD_STACK: error condition (invalid stack position or bad stack frame) + */ +static noinline int check_stack_object(const void *obj, unsigned long len) +{ + const void * const stack = task_stack_page(current); + const void * const stackend = stack + THREAD_SIZE; + int ret; + + /* Object is not on the stack at all. */ + if (obj + len <= stack || stackend <= obj) + return NOT_STACK; + + /* + * Reject: object partially overlaps the stack (passing the + * the check above means at least one end is within the stack, + * so if this check fails, the other end is outside the stack). + */ + if (obj < stack || stackend < obj + len) + return BAD_STACK; + + /* Check if object is safely within a valid frame. */ + ret = arch_within_stack_frames(stack, stackend, obj, len); + if (ret) + return ret; + + return GOOD_STACK; +} + +static void report_usercopy(const void *ptr, unsigned long len, + bool to_user, const char *type) +{ + pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + to_user ? "exposure" : "overwrite", + to_user ? "from" : "to", ptr, type ? : "unknown", len); + /* + * For greater effect, it would be nice to do do_group_exit(), + * but BUG() actually hooks all the lock-breaking and per-arch + * Oops code, so that is used here instead. + */ + BUG(); +} + +/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */ +static bool overlaps(const void *ptr, unsigned long n, unsigned long low, + unsigned long high) +{ + unsigned long check_low = (uintptr_t)ptr; + unsigned long check_high = check_low + n; + + /* Does not overlap if entirely above or entirely below. */ + if (check_low >= high || check_high <= low) + return false; + + return true; +} + +/* Is this address range in the kernel text area? */ +static inline const char *check_kernel_text_object(const void *ptr, + unsigned long n) +{ + unsigned long textlow = (unsigned long)_stext; + unsigned long texthigh = (unsigned long)_etext; + unsigned long textlow_linear, texthigh_linear; + + if (overlaps(ptr, n, textlow, texthigh)) + return "<kernel text>"; + + /* + * Some architectures have virtual memory mappings with a secondary + * mapping of the kernel text, i.e. there is more than one virtual + * kernel address that points to the kernel image. It is usually + * when there is a separate linear physical memory mapping, in that + * __pa() is not just the reverse of __va(). This can be detected + * and checked: + */ + textlow_linear = (unsigned long)__va(__pa(textlow)); + /* No different mapping: we're done. */ + if (textlow_linear == textlow) + return NULL; + + /* Check the secondary mapping... */ + texthigh_linear = (unsigned long)__va(__pa(texthigh)); + if (overlaps(ptr, n, textlow_linear, texthigh_linear)) + return "<linear kernel text>"; + + return NULL; +} + +static inline const char *check_bogus_address(const void *ptr, unsigned long n) +{ + /* Reject if object wraps past end of memory. */ + if ((unsigned long)ptr + n < (unsigned long)ptr) + return "<wrapped address>"; + + /* Reject if NULL or ZERO-allocation. */ + if (ZERO_OR_NULL_PTR(ptr)) + return "<null>"; + + return NULL; +} + +/* Checks for allocs that are marked in some way as spanning multiple pages. */ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) +{ +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN + const void *end = ptr + n - 1; + struct page *endpage; + bool is_reserved, is_cma; + + /* + * Sometimes the kernel data regions are not marked Reserved (see + * check below). And sometimes [_sdata,_edata) does not cover + * rodata and/or bss, so check each range explicitly. + */ + + /* Allow reads of kernel rodata region (if not marked as Reserved). */ + if (ptr >= (const void *)__start_rodata && + end <= (const void *)__end_rodata) { + if (!to_user) + return "<rodata>"; + return NULL; + } + + /* Allow kernel data region (if not marked as Reserved). */ + if (ptr >= (const void *)_sdata && end <= (const void *)_edata) + return NULL; + + /* Allow kernel bss region (if not marked as Reserved). */ + if (ptr >= (const void *)__bss_start && + end <= (const void *)__bss_stop) + return NULL; + + /* Is the object wholly within one base page? */ + if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) == + ((unsigned long)end & (unsigned long)PAGE_MASK))) + return NULL; + + /* Allow if fully inside the same compound (__GFP_COMP) page. */ + endpage = virt_to_head_page(end); + if (likely(endpage == page)) + return NULL; + + /* + * Reject if range is entirely either Reserved (i.e. special or + * device memory), or CMA. Otherwise, reject since the object spans + * several independently allocated pages. + */ + is_reserved = PageReserved(page); + is_cma = is_migrate_cma_page(page); + if (!is_reserved && !is_cma) + return "<spans multiple pages>"; + + for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { + page = virt_to_head_page(ptr); + if (is_reserved && !PageReserved(page)) + return "<spans Reserved and non-Reserved pages>"; + if (is_cma && !is_migrate_cma_page(page)) + return "<spans CMA and non-CMA pages>"; + } +#endif + + return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + * + * We also need to check for module addresses explicitly since we + * may copy static data from modules to userspace + */ + if (is_vmalloc_or_module_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); +} + +/* + * Validates that the given object is: + * - not bogus address + * - known-safe heap or stack object + * - not in kernel text + */ +void __check_object_size(const void *ptr, unsigned long n, bool to_user) +{ + const char *err; + + /* Skip all tests if size is zero. */ + if (!n) + return; + + /* Check for invalid addresses. */ + err = check_bogus_address(ptr, n); + if (err) + goto report; + + /* Check for bad heap object. */ + err = check_heap_object(ptr, n, to_user); + if (err) + goto report; + + /* Check for bad stack object. */ + switch (check_stack_object(ptr, n)) { + case NOT_STACK: + /* Object is not touching the current process stack. */ + break; + case GOOD_FRAME: + case GOOD_STACK: + /* + * Object is either in the correct frame (when it + * is possible to check) or just generally on the + * process stack (when frame checking not available). + */ + return; + default: + err = "<process stack>"; + goto report; + } + + /* Check for object in kernel to avoid text exposure. */ + err = check_kernel_text_object(ptr, n); + if (!err) + return; + +report: + report_usercopy(ptr, n, to_user, err); +} +EXPORT_SYMBOL(__check_object_size); diff --git a/mm/vmscan.c b/mm/vmscan.c index 374d95d04178..0fe8b7113868 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1665,7 +1665,7 @@ static bool inactive_reclaimable_pages(struct lruvec *lruvec, for (zid = sc->reclaim_idx; zid >= 0; zid--) { zone = &pgdat->node_zones[zid]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + @@ -2036,7 +2036,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, struct zone *zone = &pgdat->node_zones[zid]; unsigned long inactive_zone, active_zone; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; inactive_zone = zone_page_state(zone, @@ -2171,7 +2171,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = &pgdat->node_zones[z]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; total_high_wmark += high_wmark_pages(zone); @@ -2303,23 +2303,6 @@ out: } } -#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -static void init_tlb_ubc(void) -{ - /* - * This deliberately does not clear the cpumask as it's expensive - * and unnecessary. If there happens to be data in there then the - * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and - * then will be cleared. - */ - current->tlb_ubc.flush_required = false; -} -#else -static inline void init_tlb_ubc(void) -{ -} -#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ - /* * This is a basic per-node page freer. Used by both kswapd and direct reclaim. */ @@ -2355,8 +2338,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); - init_tlb_ubc(); - blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2510,7 +2491,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { struct zone *zone = &pgdat->node_zones[z]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { @@ -2840,7 +2821,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone) || + if (!managed_zone(zone) || pgdat_reclaimable_pages(pgdat) == 0) continue; @@ -3141,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (!zone_balanced(zone, order, classzone_idx)) @@ -3169,7 +3150,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, sc->nr_to_reclaim = 0; for (z = 0; z <= sc->reclaim_idx; z++) { zone = pgdat->node_zones + z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); @@ -3242,7 +3223,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) if (buffer_heads_over_limit) { for (i = MAX_NR_ZONES - 1; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; sc.reclaim_idx = i; @@ -3262,7 +3243,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) */ for (i = classzone_idx; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_balanced(zone, sc.order, classzone_idx)) @@ -3508,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) pg_data_t *pgdat; int z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) return; if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) @@ -3522,7 +3503,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_balanced(zone, order, classzone_idx)) diff --git a/mm/workingset.c b/mm/workingset.c index 69551cfae97b..617475f529f4 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -418,21 +418,19 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - - BUG_ON(!node->count); - BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + BUG_ON(!workingset_node_shadows(node)); + BUG_ON(workingset_node_pages(node)); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); node->slots[i] = NULL; - BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); - node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + workingset_node_shadows_dec(node); BUG_ON(!mapping->nrexceptional); mapping->nrexceptional--; } } - BUG_ON(node->count); + BUG_ON(workingset_node_shadows(node)); inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); if (!__radix_tree_delete_node(&mapping->page_tree, node)) BUG(); |