From 4b094b7851bf4bf551ad456195d3f26e1c03bd74 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:33:55 -0800 Subject: mm/page_alloc.c: initialize memmap of unavailable memory directly Let's make sure that all memory holes are actually marked PageReserved(), that page_to_pfn() produces reliable results, and that these pages are not detected as "mmap" pages due to the mapcount. E.g., booting a x86-64 QEMU guest with 4160 MB: [ 0.010585] Early memory node ranges [ 0.010586] node 0: [mem 0x0000000000001000-0x000000000009efff] [ 0.010588] node 0: [mem 0x0000000000100000-0x00000000bffdefff] [ 0.010589] node 0: [mem 0x0000000100000000-0x0000000143ffffff] max_pfn is 0x144000. Before this change: [root@localhost ~]# ./page-types -r -a 0x144000, flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000800 16384 64 ___________M_______________________________ mmap total 16384 64 After this change: [root@localhost ~]# ./page-types -r -a 0x144000, flags page-count MB symbolic-flags long-symbolic-flags 0x0000000100000000 16384 64 ___________________________r_______________ reserved total 16384 64 IOW, especially the unavailable physical memory ("memory hole") in the last section would not get properly marked PageReserved() and is indicated to be "mmap" memory. Drop the trace of that function from include/linux/mm.h - nobody else needs it, and rename it accordingly. Note: The fake zone/node might not be covered by the zone/node span. This is not an urgent issue (for now, we had the same node/zone due to the zeroing). We'll need a clean way to mark memory holes (e.g., using a page type PageHole() if possible or a fake ZONE_INVALID) and eventually stop marking these memory holes PageReserved(). Link: http://lkml.kernel.org/r/20191211163201.17179-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Alexey Dobriyan Cc: Bob Picco Cc: Daniel Jordan Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Stephen Rothwell Cc: Steven Sistare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a044ed6981..080f8ac8bfb7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2182,12 +2182,6 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state); #endif -#if !defined(CONFIG_FLAT_NODE_MEM_MAP) -void zero_resv_unavail(void); -#else -static inline void zero_resv_unavail(void) {} -#endif - extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context, struct vmem_altmap *); -- cgit v1.2.3 From 4c6058814ec4460c25111e29452ef596acdcd61b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:02 -0800 Subject: mm: factor out next_present_section_nr() Let's move it to the header and use the shorter variant from mm/page_alloc.c (the original one will also check "__highest_present_section_nr + 1", which is not necessary). While at it, make the section_nr in next_pfn() const. In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once we exceed __highest_present_section_nr, which doesn't make a difference in the caller as it is big enough (>= all sane end_pfn). Link: http://lkml.kernel.org/r/20200113144035.10848-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Kirill A. 
Shutemov Cc: Baoquan He Cc: Dan Williams Cc: "Jin, Zhi" Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c2bc309d1634..462f6873905a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1379,6 +1379,16 @@ static inline int pfn_present(unsigned long pfn) return present_section(__nr_to_section(pfn_to_section_nr(pfn))); } +static inline unsigned long next_present_section_nr(unsigned long section_nr) +{ + while (++section_nr <= __highest_present_section_nr) { + if (present_section_nr(section_nr)) + return section_nr; + } + + return -1; +} + /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate -- cgit v1.2.3 From 92917998849eea951707c8fea2dc3007bb2ad2cd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:26 -0800 Subject: mm/memory_hotplug: drop valid_start/valid_end from test_pages_in_a_zone() The callers are only interested in the actual zone, they don't care about boundaries. Return the zone instead to simplify. Link: http://lkml.kernel.org/r/20200110183308.11849-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ffa6ad12d84a..f4d59155f3d4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -96,8 +96,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid); -extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, - unsigned long *valid_start, unsigned long *valid_end); +extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, + unsigned long end_pfn); extern unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn); -- cgit v1.2.3 From 1c948715a159d0d02c1e1c9228327ba3c408795c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:34:58 -0800 Subject: mm: remove __krealloc Since 5.5-rc1 the last user of this function is gone, so remove the functionality. See commit 2ad9d7747c10 ("netfilter: conntrack: free extension area immediately") for details. 
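As a reference for readers of this log (an editor's sketch, not part of the patch): with __krealloc() gone, callers grow a buffer with plain krealloc(), which frees the old allocation itself when it has to move the data, unlike the removed __krealloc(). grow_buf() below is a hypothetical helper.

#include <linux/slab.h>

static int grow_buf(char **buf, size_t new_len)
{
	char *tmp = krealloc(*buf, new_len, GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;	/* *buf is untouched and still owned by the caller */

	*buf = tmp;		/* if the data moved, krealloc() already freed the old buffer */
	return 0;
}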
Link: http://lkml.kernel.org/r/20191212223442.22141-1-fw@strlen.de Signed-off-by: Florian Westphal Acked-by: Andrew Morton Acked-by: David Rientjes Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 877a95c6a2d2..03a389358562 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,7 +184,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); /* * Common kmalloc functions provided by all allocators */ -void * __must_check __krealloc(const void *, size_t, gfp_t); void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); void kzfree(const void *); -- cgit v1.2.3 From 93fab1b22ef7c4abcbc760ce4432762b02e7f3d1 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:01 -0800 Subject: mm: add generic p?d_leaf() macros Patch series "Generic page walk and ptdump", v17. Many architectures currently have a debugfs file for dumping the kernel page tables. Currently each architecture has to implement custom functions for this because the details of walking the page tables used by the kernel are different between architectures. This series extends the capabilities of walk_page_range() so that it can deal with the page tables of the kernel (which have no VMAs and can contain larger huge pages than exist for user space). A generic PTDUMP implementation is then implemented making use of the new functionality of walk_page_range(), and finally arm64 and x86 are switched to using it, removing the custom table walkers. To enable a generic page table walker to walk the unusual mappings of the kernel we need to implement a set of functions which let us know when the walker has reached the leaf entry. After a suggestion from Will Deacon I've chosen the name p?d_leaf() as this (hopefully) describes the purpose (and is a new name so has no historic baggage). Some architectures have p?d_large macros but this is easily confused with "large pages". This series ends with a generic PTDUMP implementation for arm64 and x86. Mostly this is a cleanup and there should be very little functional change. The exceptions are: * arm64 PTDUMP debugfs now displays pages which aren't present (patch 22). * arm64 has the ability to efficiently process KASAN pages (which previously only x86 implemented). This means that the combination of KASAN and DEBUG_WX is now usable. This patch (of 23): Exposing the pud/pgd levels of the page tables to walk_page_range() means we may come across the exotic large mappings that come with large areas of contiguous memory (such as the kernel's linear map). For architectures that don't provide all p?d_leaf() macros, provide generic do-nothing defaults that are suitable where there cannot be leaf pages at that level. Further patches will add implementations for individual architectures. The name p?d_leaf() is chosen to minimize the confusion with existing uses of "large" pages and "huge" pages which do not necessarily mean that the entry is a leaf (for example it may be a set of contiguous entries that only take 1 TLB slot). For the purpose of walking the page tables we don't need to know how it will be represented in the TLB, but we do need to know for sure if it is a leaf of the tree.
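To illustrate the intended use (an editor's sketch under assumptions, not code from this series): a generic walker consults p?d_leaf() before descending, so a huge mapping at the PUD level is reported as one leaf instead of being walked into. note_mapping() and walk_pmd_range() are hypothetical helpers standing in for whatever the caller does with leaves and lower tables.

#include <linux/mm.h>

static void walk_pud_example(pud_t *pud, unsigned long addr, unsigned long end)
{
	if (pud_none(*pud))
		return;				/* hole: nothing mapped at this address */

	if (pud_leaf(*pud)) {			/* final mapping, e.g. a 1GiB block */
		note_mapping(addr, end, pud_val(*pud));
		return;				/* no lower-level table to descend into */
	}

	walk_pmd_range(pud, addr, end);		/* regular table entry: keep walking down */
}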
Link: http://lkml.kernel.org/r/20191218162402.45610-2-steven.price@arm.com Signed-off-by: Steven Price Acked-by: Mark Rutland Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Ingo Molnar Cc: James Morse Cc: Jerome Glisse Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: "H. Peter Anvin" Cc: "Liang, Kan" Cc: Albert Ou Cc: Alexandre Ghiti Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: David S. Miller Cc: Heiko Carstens Cc: James Hogan Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Ralf Baechle Cc: Russell King Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 798ea36a0549..e2e2bef07dd2 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1238,4 +1238,24 @@ static inline bool arch_has_pfn_modify_check(void) #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif +/* + * p?d_leaf() - true if this entry is a final mapping to a physical address. + * This differs from p?d_huge() by the fact that they are always available (if + * the architecture supports large pages at the appropriate level) even + * if CONFIG_HUGETLB_PAGE is not defined. + * Only meaningful when called on a valid entry. + */ +#ifndef pgd_leaf +#define pgd_leaf(x) 0 +#endif +#ifndef p4d_leaf +#define p4d_leaf(x) 0 +#endif +#ifndef pud_leaf +#define pud_leaf(x) 0 +#endif +#ifndef pmd_leaf +#define pmd_leaf(x) 0 +#endif + #endif /* _ASM_GENERIC_PGTABLE_H */ -- cgit v1.2.3 From 3afc423632a194d7d6afef34e4bb98f804cd071d Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:45 -0800 Subject: mm: pagewalk: add p4d_entry() and pgd_entry() pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410 ("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were no users. We're about to add users so reintroduce them, along with p4d_entry() as we now have 5 levels of tables. Note that commit a00cc7d9dd93d66a ("mm, x86: add support for PUD-sized transparent hugepages") already re-added pud_entry() but with different semantics to the other callbacks. This commit reverts the semantics back to match the other callbacks. To support hmm.c which now uses the new semantics of pud_entry() a new member ('action') of struct mm_walk is added which allows the callbacks to either descend (ACTION_SUBTREE, the default), skip (ACTION_CONTINUE) or repeat the callback (ACTION_AGAIN). hmm.c is then updated to call pud_trans_huge_lock() itself and make use of the splitting/retry logic of the core code. After this change pud_entry() is called for all entries, not just transparent huge pages. [arnd@arndb.de: fix unused variable warning] Link: http://lkml.kernel.org/r/20200107204607.1533842-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20191218162402.45610-12-steven.price@arm.com Signed-off-by: Steven Price Signed-off-by: Arnd Bergmann Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 6ec82e92c87f..aa6a0b63964e 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -8,15 +8,15 @@ struct mm_walk; /** * mm_walk_ops - callbacks for walk_page_range - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * this handler should only handle pud_trans_huge() puds. - * the pmd_entry or pte_entry callbacks will be used for - * regular PUDs. - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * @pgd_entry: if set, called for each non-empty PGD (top-level) entry + * @p4d_entry: if set, called for each non-empty P4D entry + * @pud_entry: if set, called for each non-empty PUD entry + * @pmd_entry: if set, called for each non-empty PMD entry * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_entry: if set, called for each non-empty PTE (lowest-level) + * entry * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether @@ -27,8 +27,15 @@ struct mm_walk; * @pre_vma: if set, called before starting walk on a non-null vma. * @post_vma: if set, called after a walk on a non-null vma, provided * that @pre_vma and the vma walk succeeded. + * + * p?d_entry callbacks are called even if those levels are folded on a + * particular architecture/configuration. */ struct mm_walk_ops { + int (*pgd_entry)(pgd_t *pgd, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*p4d_entry)(p4d_t *p4d, unsigned long addr, + unsigned long next, struct mm_walk *walk); int (*pud_entry)(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, @@ -47,11 +54,25 @@ struct mm_walk_ops { void (*post_vma)(struct mm_walk *walk); }; +/* + * Action for pud_entry / pmd_entry callbacks. 
+ * ACTION_SUBTREE is the default + */ +enum page_walk_action { + /* Descend to next level, splitting huge pages if needed and possible */ + ACTION_SUBTREE = 0, + /* Continue to next entry at this level (ignoring any subtree) */ + ACTION_CONTINUE = 1, + /* Call again for this entry */ + ACTION_AGAIN = 2 +}; + /** * mm_walk - walk_page_range data * @ops: operation to call during the walk * @mm: mm_struct representing the target process of page table walk * @vma: vma currently walked (NULL if walking outside vmas) + * @action: next action to perform (see enum page_walk_action) * @private: private data for callbacks' usage * * (see the comment on walk_page_range() for more details) @@ -60,6 +81,7 @@ struct mm_walk { const struct mm_walk_ops *ops; struct mm_struct *mm; struct vm_area_struct *vma; + enum page_walk_action action; void *private; }; -- cgit v1.2.3 From 488ae6a2b933cb538b5d91b1c0a3420188d28771 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:50 -0800 Subject: mm: pagewalk: allow walking without vma Since 48684a65b4e3: "mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)", page_table_walk() will report any kernel area as a hole, because it lacks a vma. This means each arch has re-implemented page table walking when needed, for example in the per-arch ptdump walker. Remove the requirement to have a vma in the generic code and add a new function walk_page_range_novma() which ignores the VMAs and simply walks the page tables. Link: http://lkml.kernel.org/r/20191218162402.45610-13-steven.price@arm.com Signed-off-by: Steven Price Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index aa6a0b63964e..d5d07f7a9c14 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -73,6 +73,7 @@ enum page_walk_action { * @mm: mm_struct representing the target process of page table walk * @vma: vma currently walked (NULL if walking outside vmas) * @action: next action to perform (see enum page_walk_action) + * @no_vma: walk ignoring vmas (vma will always be NULL) * @private: private data for callbacks' usage * * (see the comment on walk_page_range() for more details) @@ -82,12 +83,16 @@ struct mm_walk { struct mm_struct *mm; struct vm_area_struct *vma; enum page_walk_action action; + bool no_vma; void *private; }; int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +int walk_page_range_novma(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, -- cgit v1.2.3 From b7a16c7ad790d0ecb44dcb08a6a75d0d0455ab5f Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:03 -0800 Subject: mm: pagewalk: add 'depth' parameter to pte_hole The pte_hole() callback is called at multiple levels of the page tables. Code dumping the kernel page tables needs to know what at what depth the missing entry is. Add this is an extra parameter to pte_hole(). When the depth isn't know (e.g. processing a vma) then -1 is passed. The depth that is reported is the actual level where the entry is missing (ignoring any folding that is in place), i.e. any levels where PTRS_PER_P?D is set to 1 are ignored. Note that depth starts at 0 for a PGD so that PUD/PMD/PTE retain their natural numbers as levels 2/3/4. Link: http://lkml.kernel.org/r/20191218162402.45610-16-steven.price@arm.com Signed-off-by: Steven Price Tested-by: Zong Li Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index d5d07f7a9c14..745a654c6ea7 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -17,7 +17,10 @@ struct mm_walk; * split_huge_page() instead of handling it explicitly. 
* @pte_entry: if set, called for each non-empty PTE (lowest-level) * entry - * @pte_hole: if set, called for each hole at all levels + * @pte_hole: if set, called for each hole at all levels, + * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD + * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal + * to 1) are skipped. * @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means @@ -43,7 +46,7 @@ struct mm_walk_ops { int (*pte_entry)(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_hole)(unsigned long addr, unsigned long next, - struct mm_walk *walk); + int depth, struct mm_walk *walk); int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk); -- cgit v1.2.3 From 30d621f6723b1c98a142861f7a52849d286bc7fa Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:20 -0800 Subject: mm: add generic ptdump Add a generic version of page table dumping that architectures can opt-in to. Link: http://lkml.kernel.org/r/20191218162402.45610-20-steven.price@arm.com Signed-off-by: Steven Price Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptdump.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 include/linux/ptdump.h (limited to 'include') diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h new file mode 100644 index 000000000000..a0fb8dd2be97 --- /dev/null +++ b/include/linux/ptdump.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_PTDUMP_H +#define _LINUX_PTDUMP_H + +#include + +struct ptdump_range { + unsigned long start; + unsigned long end; +}; + +struct ptdump_state { + void (*note_page)(struct ptdump_state *st, unsigned long addr, + int level, unsigned long val); + const struct ptdump_range *range; +}; + +void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm); + +#endif /* _LINUX_PTDUMP_H */ -- cgit v1.2.3 From f8f0d0b6fa203bfa363d30f34f6fecce9e5cc2f7 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:38 -0800 Subject: mm: ptdump: reduce level numbers by 1 in note_page() Rather than having to increment the 'depth' number by 1 in ptdump_hole(), let's change the meaning of 'level' in note_page() since that makes the code simplier. Note that for x86, the level numbers were previously increased by 1 in commit 45dcd2091363 ("x86/mm/dump_pagetables: Fix printout of p4d level") and the comment "Bit 7 has a different meaning" was not updated, so this change also makes the code match the comment again. 
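A sketch of how a dumper's note_page() callback reads with the new numbering (0:PGD to 4:PTE, -1 when unknown); struct my_ptdump_state and my_note_page() are made-up names for illustration, only struct ptdump_state comes from the header touched below.

#include <linux/kernel.h>
#include <linux/ptdump.h>
#include <linux/seq_file.h>

struct my_ptdump_state {
	struct ptdump_state ptdump;
	struct seq_file *seq;
};

static const char * const level_name[] = { "pgd", "p4d", "pud", "pmd", "pte" };

static void my_note_page(struct ptdump_state *pt_st, unsigned long addr,
			 int level, unsigned long val)
{
	struct my_ptdump_state *st = container_of(pt_st, struct my_ptdump_state,
						  ptdump);

	if (level >= 0 && level <= 4)
		seq_printf(st->seq, "0x%016lx: %s entry 0x%lx\n",
			   addr, level_name[level], val);
	else
		seq_printf(st->seq, "0x%016lx: hole at unknown level\n", addr);
}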
Link: http://lkml.kernel.org/r/20191218162402.45610-24-steven.price@arm.com Signed-off-by: Steven Price Reviewed-by: Catalin Marinas Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptdump.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index a0fb8dd2be97..b28f3f2acf90 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -11,6 +11,7 @@ struct ptdump_range { }; struct ptdump_state { + /* level is 0:PGD to 4:PTE, or -1 if unknown */ void (*note_page)(struct ptdump_state *st, unsigned long addr, int level, unsigned long val); const struct ptdump_range *range; -- cgit v1.2.3 From e47690d756a760579141560ded06ec1020dd85e8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:42 -0800 Subject: x86: mm: avoid allocating struct mm_struct on the stack struct mm_struct is quite large (~1664 bytes) and so allocating on the stack may cause problems as the kernel stack size is small. Since ptdump_walk_pgd_level_core() was only allocating the structure so that it could modify the pgd argument we can instead introduce a pgd override in struct mm_walk and pass this down the call stack to where it is needed. Since the correct mm_struct is now being passed down, it is now also unnecessary to take the mmap_sem semaphore because ptdump_walk_pgd() will now take the semaphore on the real mm. [steven.price@arm.com: restore missed arm64 changes] Link: http://lkml.kernel.org/r/20200108145710.34314-1-steven.price@arm.com Link: http://lkml.kernel.org/r/20200108145710.34314-1-steven.price@arm.com Signed-off-by: Steven Price Reported-by: Stephen Rothwell Cc: Catalin Marinas Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 3 +++ include/linux/ptdump.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 745a654c6ea7..b1cb6b753abb 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -74,6 +74,7 @@ enum page_walk_action { * mm_walk - walk_page_range data * @ops: operation to call during the walk * @mm: mm_struct representing the target process of page table walk + * @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL) * @vma: vma currently walked (NULL if walking outside vmas) * @action: next action to perform (see enum page_walk_action) * @no_vma: walk ignoring vmas (vma will always be NULL) @@ -84,6 +85,7 @@ enum page_walk_action { struct mm_walk { const struct mm_walk_ops *ops; struct mm_struct *mm; + pgd_t *pgd; struct vm_area_struct *vma; enum page_walk_action action; bool no_vma; @@ -95,6 +97,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, void *private); int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index b28f3f2acf90..a67065c403c3 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -17,6 +17,6 @@ struct ptdump_state { const struct ptdump_range *range; }; -void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm); +void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); #endif /* _LINUX_PTDUMP_H */ -- cgit v1.2.3 From 0ed1325967ab5f7a4549a2641c6ebe115f76e228 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:36:49 -0800 Subject: mm/mmu_gather: invalidate TLB correctly on batch allocation failure and flush Architectures for which we have hardware walkers of Linux page table should flush TLB on mmu gather batch allocation failures and batch flush. Some architectures like POWER supports multiple translation modes (hash and radix) and in the case of POWER only radix translation mode needs the above TLBI. This is because for hash translation mode kernel wants to avoid this extra flush since there are no hardware walkers of linux page table. With radix translation, the hardware also walks linux page table and with that, kernel needs to make sure to TLB invalidate page walk cache before page table pages are freed. More details in commit d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE") The changes to sparc are to make sure we keep the old behavior since we are now removing HAVE_RCU_TABLE_NO_INVALIDATE. The default value for tlb_needs_table_invalidate is to always force an invalidate and sparc can avoid the table invalidate. Hence we define tlb_needs_table_invalidate to false for sparc architecture. 
Link: http://lkml.kernel.org/r/20200116064531.483522-3-aneesh.kumar@linux.ibm.com Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes") Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman [powerpc] Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 2b10036fefd0..9e22ac369d1d 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -137,13 +137,6 @@ * When used, an architecture is expected to provide __tlb_remove_table() * which does the actual freeing of these pages. * - * HAVE_RCU_TABLE_NO_INVALIDATE - * - * This makes HAVE_RCU_TABLE_FREE avoid calling tlb_flush_mmu_tlbonly() before - * freeing the page-table pages. This can be avoided if you use - * HAVE_RCU_TABLE_FREE and your architecture does _NOT_ use the Linux - * page-tables natively. - * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). @@ -189,8 +182,23 @@ struct mmu_table_batch { extern void tlb_remove_table(struct mmu_gather *tlb, void *table); +/* + * This allows an architecture that does not use the linux page-tables for + * hardware to skip the TLBI when freeing page tables. + */ +#ifndef tlb_needs_table_invalidate +#define tlb_needs_table_invalidate() (true) +#endif + +#else + +#ifdef tlb_needs_table_invalidate +#error tlb_needs_table_invalidate() requires HAVE_RCU_TABLE_FREE #endif +#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ + + #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers -- cgit v1.2.3 From 0758cd8304942292e95a0f750c374533db378b32 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:36:53 -0800 Subject: asm-generic/tlb: avoid potential double flush Aneesh reported that: tlb_flush_mmu() tlb_flush_mmu_tlbonly() tlb_flush() <-- #1 tlb_flush_mmu_free() tlb_table_flush() tlb_table_invalidate() tlb_flush_mmu_tlbonly() tlb_flush() <-- #2 does two TLBIs when tlb->fullmm, because __tlb_reset_range() will not clear tlb->end in that case. Observe that any caller to __tlb_adjust_range() also sets at least one of the tlb->freed_tables || tlb->cleared_p* bits, and those are unconditionally cleared by __tlb_reset_range(). Change the condition for actually issuing TLBI to having one of those bits set, as opposed to having tlb->end != 0. Link: http://lkml.kernel.org/r/20200116064531.483522-4-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Reported-by: "Aneesh Kumar K.V" Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 9e22ac369d1d..b36b3bef5661 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -402,7 +402,12 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { - if (!tlb->end) + /* + * Anything calling __tlb_adjust_range() also sets at least one of + * these bits. 
+ */ + if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || + tlb->cleared_puds || tlb->cleared_p4ds)) return; tlb_flush(tlb); -- cgit v1.2.3 From 491a49ff14b2475ad84c97b93c0182ae6daf8b37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:36:56 -0800 Subject: asm-gemeric/tlb: remove stray function declarations We removed the actual functions a while ago. Link: http://lkml.kernel.org/r/20200116064531.483522-5-aneesh.kumar@linux.ibm.com Fixes: 1808d65b55e4 ("asm-generic/tlb: Remove arch_tlb*_mmu()") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index b36b3bef5661..1a4cea5f95df 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -285,11 +285,7 @@ struct mmu_gather { #endif }; -void arch_tlb_gather_mmu(struct mmu_gather *tlb, - struct mm_struct *mm, unsigned long start, unsigned long end); void tlb_flush_mmu(struct mmu_gather *tlb); -void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, -- cgit v1.2.3 From ff2e6d7259f82ccc9a5aaa7f41194161d9262392 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:37:02 -0800 Subject: asm-generic/tlb: rename HAVE_RCU_TABLE_FREE Towards a more consistent naming scheme. [akpm@linux-foundation.org: fix sparc64 Kconfig] Link: http://lkml.kernel.org/r/20200116064531.483522-7-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 1a4cea5f95df..04a1b8f08eea 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -126,7 +126,7 @@ * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * - * HAVE_RCU_TABLE_FREE + * MMU_GATHER_RCU_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). This provides separate freeing of @@ -142,7 +142,7 @@ * Use this if your architecture lacks an efficient flush_tlb_range(). */ -#ifdef CONFIG_HAVE_RCU_TABLE_FREE +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * Semi RCU freeing of the page directories. 
* @@ -193,10 +193,10 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else #ifdef tlb_needs_table_invalidate -#error tlb_needs_table_invalidate() requires HAVE_RCU_TABLE_FREE +#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif -#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ +#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER @@ -235,7 +235,7 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, struct mmu_gather { struct mm_struct *mm; -#ifdef CONFIG_HAVE_RCU_TABLE_FREE +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct mmu_table_batch *batch; #endif -- cgit v1.2.3 From 3af4bd033759c4dab4f0ff594f0aa1e8d182b9d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:37:05 -0800 Subject: asm-generic/tlb: rename HAVE_MMU_GATHER_PAGE_SIZE Towards a more consistent naming scheme. Link: http://lkml.kernel.org/r/20200116064531.483522-8-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 04a1b8f08eea..53befa5acb27 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -121,11 +121,14 @@ * * Additionally there are a few opt-in features: * - * HAVE_MMU_GATHER_PAGE_SIZE + * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * + * This might be useful if your architecture has size specific TLB + * invalidation instructions. + * * MMU_GATHER_RCU_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() @@ -279,7 +282,7 @@ struct mmu_gather { struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; -#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE +#ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif @@ -435,7 +438,7 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { -#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE +#ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); -- cgit v1.2.3 From 580a586c409ab3040b7284a19cd9e281692c40c7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:37:08 -0800 Subject: asm-generic/tlb: rename HAVE_MMU_GATHER_NO_GATHER Towards a more consistent naming scheme. Link: http://lkml.kernel.org/r/20200116064531.483522-9-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 53befa5acb27..ca0fe75b5355 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -143,6 +143,16 @@ * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). + * + * MMU_GATHER_NO_GATHER + * + * If the option is set the mmu_gather will not track individual pages for + * delayed page free anymore. 
A platform that enables the option needs to + * provide its own implementation of the __tlb_remove_page_size() function to + * free pages. + * + * This is useful if your architecture already flushes TLB entries in the + * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE @@ -202,7 +212,7 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ -#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER +#ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. @@ -277,7 +287,7 @@ struct mmu_gather { unsigned int batch_count; -#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER +#ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; -- cgit v1.2.3 From 0d6e24d430ef23280d8dea0ba1faeefc66c26a57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:37:11 -0800 Subject: asm-generic/tlb: provide MMU_GATHER_TABLE_FREE As described in the comment, the correct order for freeing pages is: 1) unhook page 2) TLB invalidate page 3) free page This order equally applies to page directories. Currently there are two correct options: - use tlb_remove_page(), when all page directores are full pages and there are no futher contraints placed by things like software walkers (HAVE_FAST_GUP). - use MMU_GATHER_RCU_TABLE_FREE and tlb_remove_table() when the architecture does not do IPI based TLB invalidate and has HAVE_FAST_GUP (or software TLB fill). This however leaves architectures that don't have page based directories but don't need RCU in a bind. For those, provide MMU_GATHER_TABLE_FREE, which provides the independent batching for directories without the additional RCU freeing. Link: http://lkml.kernel.org/r/20200116064531.483522-10-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 72 ++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index ca0fe75b5355..f391f6b500b4 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -56,6 +56,15 @@ * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * + * - tlb_remove_table() + * + * tlb_remove_table() is the basic primitive to free page-table directories + * (__p*_free_tlb()). In it's most primitive form it is an alias for + * tlb_remove_page() below, for when page directories are pages and have no + * additional constraints. + * + * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. + * * - tlb_remove_page() / __tlb_remove_page() * - tlb_remove_page_size() / __tlb_remove_page_size() * @@ -129,17 +138,24 @@ * This might be useful if your architecture has size specific TLB * invalidation instructions. * - * MMU_GATHER_RCU_TABLE_FREE + * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() - * for page directores (__p*_free_tlb()). This provides separate freeing of - * the page-table pages themselves in a semi-RCU fashion (see comment below). 
- * Useful if your architecture doesn't use IPIs for remote TLB invalidates - * and therefore doesn't naturally serialize with software page-table walkers. + * for page directores (__p*_free_tlb()). + * + * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() * which does the actual freeing of these pages. * + * MMU_GATHER_RCU_TABLE_FREE + * + * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see + * comment below). + * + * Useful if your architecture doesn't use IPIs for remote TLB invalidates + * and therefore doesn't naturally serialize with software page-table walkers. + * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). @@ -155,37 +171,12 @@ * various ptep_get_and_clear() functions. */ -#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE -/* - * Semi RCU freeing of the page directories. - * - * This is needed by some architectures to implement software pagetable walkers. - * - * gup_fast() and other software pagetable walkers do a lockless page-table - * walk and therefore needs some synchronization with the freeing of the page - * directories. The chosen means to accomplish that is by disabling IRQs over - * the walk. - * - * Architectures that use IPIs to flush TLBs will then automagically DTRT, - * since we unlink the page, flush TLBs, free the page. Since the disabling of - * IRQs delays the completion of the TLB flush we can never observe an already - * freed page. - * - * Architectures that do not have this (PPC) need to delay the freeing by some - * other means, this is that means. - * - * What we do is batch the freed directory pages (tables) and RCU free them. - * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling - * holds off grace periods. - * - * However, in order to batch these pages we need to allocate storage, this - * allocation is deep inside the MM code and can thus easily fail on memory - * pressure. To guarantee progress we fall back to single table freeing, see - * the implementation of tlb_remove_table_one(). - * - */ +#ifdef CONFIG_MMU_GATHER_TABLE_FREE + struct mmu_table_batch { +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; +#endif unsigned int nr; void *tables[0]; }; @@ -195,6 +186,17 @@ struct mmu_table_batch { extern void tlb_remove_table(struct mmu_gather *tlb, void *table); +#else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ + +/* + * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based + * page directories and we can use the normal page batching to free them. + */ +#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page)) + +#endif /* CONFIG_MMU_GATHER_TABLE_FREE */ + +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. @@ -248,7 +250,7 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, struct mmu_gather { struct mm_struct *mm; -#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE +#ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif -- cgit v1.2.3 From d56c0d45f0e27f814e87a1676b6bdccccbc252e9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 3 Feb 2020 17:37:14 -0800 Subject: proc: decouple proc from VFS with "struct proc_ops" Currently core /proc code uses "struct file_operations" for custom hooks, however, VFS doesn't directly call them. 
Every time VFS expands file_operations hook set, /proc code bloats for no reason. Introduce "struct proc_ops" which contains only those hooks which /proc allows to call into (open, release, read, write, ioctl, mmap, poll). It doesn't contain module pointer as well. Save ~184 bytes per usage: add/remove: 26/26 grow/shrink: 1/4 up/down: 1922/-6674 (-4752) Function old new delta sysvipc_proc_ops - 72 +72 ... config_gz_proc_ops - 72 +72 proc_get_inode 289 339 +50 proc_reg_get_unmapped_area 110 107 -3 close_pdeo 227 224 -3 proc_reg_open 289 284 -5 proc_create_data 60 53 -7 rt_cpu_seq_fops 256 - -256 ... default_affinity_proc_fops 256 - -256 Total: Before=5430095, After=5425343, chg -0.09% Link: http://lkml.kernel.org/r/20191225172228.GA13378@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 0640be56dcbd..3dfa92633af3 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -12,6 +12,21 @@ struct proc_dir_entry; struct seq_file; struct seq_operations; +struct proc_ops { + int (*proc_open)(struct inode *, struct file *); + ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); + ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); + loff_t (*proc_lseek)(struct file *, loff_t, int); + int (*proc_release)(struct inode *, struct file *); + __poll_t (*proc_poll)(struct file *, struct poll_table_struct *); + long (*proc_ioctl)(struct file *, unsigned int, unsigned long); +#ifdef CONFIG_COMPAT + long (*proc_compat_ioctl)(struct file *, unsigned int, unsigned long); +#endif + int (*proc_mmap)(struct file *, struct vm_area_struct *); + unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +}; + #ifdef CONFIG_PROC_FS typedef int (*proc_write_t)(struct file *, char *, size_t); @@ -43,10 +58,10 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, extern struct proc_dir_entry *proc_create_data(const char *, umode_t, struct proc_dir_entry *, - const struct file_operations *, + const struct proc_ops *, void *); -struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops); +struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops); extern void proc_set_size(struct proc_dir_entry *, loff_t); extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t); extern void *PDE_DATA(const struct inode *); @@ -108,8 +123,8 @@ static inline struct proc_dir_entry *proc_mkdir_mode(const char *name, #define proc_create_seq(name, mode, parent, ops) ({NULL;}) #define proc_create_single(name, mode, parent, show) ({NULL;}) #define proc_create_single_data(name, mode, parent, show, data) ({NULL;}) -#define proc_create(name, mode, parent, proc_fops) ({NULL;}) -#define proc_create_data(name, mode, parent, proc_fops, data) ({NULL;}) +#define proc_create(name, mode, parent, proc_ops) ({NULL;}) +#define proc_create_data(name, mode, parent, proc_ops, data) ({NULL;}) static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} -- cgit v1.2.3 From 97a32539b9568bb653683349e5a76d02ff3c3e2c Mon Sep 17 
00:00:00 2001 From: Alexey Dobriyan Date: Mon, 3 Feb 2020 17:37:17 -0800 Subject: proc: convert everything to "struct proc_ops" The most notable change is DEFINE_SHOW_ATTRIBUTE macro split in seq_file.h. Conversion rule is: llseek => proc_lseek unlocked_ioctl => proc_ioctl xxx => proc_xxx delete ".owner = THIS_MODULE" line [akpm@linux-foundation.org: fix drivers/isdn/capi/kcapi_proc.c] [sfr@canb.auug.org.au: fix kernel/sched/psi.c] Link: http://lkml.kernel.org/r/20200122180545.36222f50@canb.auug.org.au Link: http://lkml.kernel.org/r/20191225172546.GB13378@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/seq_file.h | 13 +++++++++++++ include/linux/sunrpc/stats.h | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 5998e1f4ff06..770c2bf3aa43 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -160,6 +160,19 @@ static const struct file_operations __name ## _fops = { \ .release = single_release, \ } +#define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, __name ## _show, inode->i_private); \ +} \ + \ +static const struct proc_ops __name ## _proc_ops = { \ + .proc_open = __name ## _open, \ + .proc_read = seq_read, \ + .proc_lseek = seq_lseek, \ + .proc_release = single_release, \ +} + static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS diff --git a/include/linux/sunrpc/stats.h b/include/linux/sunrpc/stats.h index 84b92b4ad1c0..d94d4f410507 100644 --- a/include/linux/sunrpc/stats.h +++ b/include/linux/sunrpc/stats.h @@ -63,7 +63,7 @@ struct proc_dir_entry * rpc_proc_register(struct net *,struct rpc_stat *); void rpc_proc_unregister(struct net *,const char *); void rpc_proc_zero(const struct rpc_program *); struct proc_dir_entry * svc_proc_register(struct net *, struct svc_stat *, - const struct file_operations *); + const struct proc_ops *); void svc_proc_unregister(struct net *, const char *); void svc_seq_show(struct seq_file *, @@ -75,7 +75,7 @@ static inline void rpc_proc_unregister(struct net *net, const char *p) {} static inline void rpc_proc_zero(const struct rpc_program *p) {} static inline struct proc_dir_entry *svc_proc_register(struct net *net, struct svc_stat *s, - const struct file_operations *f) { return NULL; } + const struct proc_ops *proc_ops) { return NULL; } static inline void svc_proc_unregister(struct net *net, const char *p) {} static inline void svc_seq_show(struct seq_file *seq, -- cgit v1.2.3 From 0bee0cece2a6a71ccc347fdc1d46cf638cd5fd1c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 3 Feb 2020 17:37:20 -0800 Subject: lib/string: add strnchrnul() Patch series "lib: rework bitmap_parse", v5. Similarl to the recently revisited bitmap_parselist(), bitmap_parse() is ineffective and overcomplicated. This series reworks it, aligns its interface with bitmap_parselist() and makes it simpler to use. The series also adds a test for the function and fixes usage of it in cpumask_parse() according to the new design - drops the calculating of length of an input string. bitmap_parse() takes the array of numbers to be put into the map in the BE order which is reversed to the natural LE order for bitmaps. For example, to construct bitmap containing a bit on the position 42, we have to put a line '400,0'. 
Current implementation reads chunk one by one from the beginning ('400' before '0') and makes bitmap shift after each successful parse. It makes the complexity of the whole process as O(n^2). We can do it in reverse direction ('0' before '400') and avoid shifting, but it requires reverse parsing helpers. This patch (of 7): New function works like strchrnul() with a length limited string. Link: http://lkml.kernel.org/r/20200102043031.30357-2-yury.norov@gmail.com Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Cc: Rasmus Villemoes Cc: Amritha Nambiar Cc: Willem de Bruijn Cc: Kees Cook Cc: Matthew Wilcox Cc: "Tobin C . Harding" Cc: Will Deacon Cc: Miklos Szeredi Cc: Vineet Gupta Cc: Chris Wilson Cc: Arnaldo Carvalho de Melo Cc: Steffen Klassert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index 02894e417565..6dfbb2efa815 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -62,6 +62,7 @@ extern char * strchr(const char *,int); #ifndef __HAVE_ARCH_STRCHRNUL extern char * strchrnul(const char *,int); #endif +extern char * strnchrnul(const char *, size_t, int); #ifndef __HAVE_ARCH_STRNCHR extern char * strnchr(const char *, size_t, int); #endif -- cgit v1.2.3 From 0bddc1bd05d6973fee9303005abab6567f743ea7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 3 Feb 2020 17:37:24 -0800 Subject: bitops: more BITS_TO_* macros Introduce BITS_TO_U64, BITS_TO_U32 and BITS_TO_BYTES as they are handy in the following patches (BITS_TO_U32 specifically). Reimplement tools/ version of the macros according to the kernel implementation. Also fix indentation for BITS_PER_TYPE definition. Link: http://lkml.kernel.org/r/20200102043031.30357-3-yury.norov@gmail.com Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Cc: Amritha Nambiar Cc: Arnaldo Carvalho de Melo Cc: Chris Wilson Cc: Kees Cook Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Rasmus Villemoes Cc: Steffen Klassert Cc: "Tobin C . Harding" Cc: Vineet Gupta Cc: Will Deacon Cc: Willem de Bruijn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 6c7c4133c25c..47f54b459c26 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -11,8 +11,10 @@ # define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) #endif -#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) +#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) +#define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) extern unsigned int __sw_hweight8(unsigned int w); -- cgit v1.2.3 From 2d6261583be005a91e4933aa53bbd678ef98e4c4 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 3 Feb 2020 17:37:34 -0800 Subject: lib: rework bitmap_parse() bitmap_parse() is ineffective and full of opaque variables and opencoded parts. It leads to hard understanding and usage of it. This rework includes: - remove bitmap_shift_left() call from the cycle. Now it makes the complexity of the algorithm as O(nbits^2). 
In the suggested approach the input string is parsed in reverse direction, so no shifts needed; - relax requirement on a single comma and no white spaces between chunks. It is considered useful in scripting, and it aligns with bitmap_parselist(); - split bitmap_parse() to small readable helpers; - make an explicit calculation of the end of input line at the beginning, so users of the bitmap_parse() won't bother doing this. Link: http://lkml.kernel.org/r/20200102043031.30357-6-yury.norov@gmail.com Signed-off-by: Yury Norov Cc: Amritha Nambiar Cc: Andy Shevchenko Cc: Arnaldo Carvalho de Melo Cc: Chris Wilson Cc: Kees Cook Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Rasmus Villemoes Cc: Steffen Klassert Cc: "Tobin C . Harding" Cc: Vineet Gupta Cc: Will Deacon Cc: Willem de Bruijn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 80ad521116d7..e52ceb1a73d3 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -186,7 +186,7 @@ bitmap_find_next_zero_area(unsigned long *map, align_mask, 0); } -extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, +extern int bitmap_parse(const char *buf, unsigned int buflen, unsigned long *dst, int nbits); extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); @@ -454,12 +454,6 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } -static inline int bitmap_parse(const char *buf, unsigned int buflen, - unsigned long *maskp, int nmaskbits) -{ - return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); -} - static inline void bitmap_next_clear_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) -- cgit v1.2.3 From 190535f7cf50f2d6d6e603715201c58cd6ec696b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 3 Feb 2020 17:37:41 -0800 Subject: include/linux/cpumask.h: don't calculate length of the input string New design of inner bitmap_parse() allows to avoid calculating the size of a null-terminated string. Link: http://lkml.kernel.org/r/20200102043031.30357-8-yury.norov@gmail.com Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Cc: Amritha Nambiar Cc: Arnaldo Carvalho de Melo Cc: Chris Wilson Cc: Kees Cook Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Rasmus Villemoes Cc: Steffen Klassert Cc: "Tobin C . Harding" Cc: Vineet Gupta Cc: Will Deacon Cc: Willem de Bruijn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 78a73eba64dd..d5cc88514aee 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -663,9 +663,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, */ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { - unsigned int len = strchrnul(buf, '\n') - buf; - - return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); + return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** -- cgit v1.2.3
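To close the series out, a small usage sketch (editor's illustration, not from the patches): passing UINT_MAX as buflen lets the reworked bitmap_parse() find the end of a NUL- or newline-terminated string on its own, which is what the cpumask_parse() change above relies on. example_bit42_set() is a hypothetical helper.

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/kernel.h>

static bool example_bit42_set(const char *str)
{
	DECLARE_BITMAP(mask, 128);

	/* buflen == UINT_MAX: parsing stops at the terminating NUL or '\n' */
	if (bitmap_parse(str, UINT_MAX, mask, 128))
		return false;			/* parse error */

	return test_bit(42, mask);		/* "400,0" sets bit 42, as in the cover letter */
}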