From fe124c95df9e2acf202910b0510300e37afe074b Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 11 Aug 2020 18:32:12 -0700 Subject: x86/mm: use max memory block size on bare metal Some of our servers spend significant time at kernel boot initializing memory block sysfs directories and then creating symlinks between them and the corresponding nodes. The slowness happens because the machines get stuck with the smallest supported memory block size on x86 (128M), which results in 16,288 directories to cover the 2T of installed RAM. The search for each memory block is noticeable even with commit 4fb6eabf1037 ("drivers/base/memory.c: cache memory blocks in xarray to accelerate lookup"). Commit 078eb6aa50dc ("x86/mm/memory_hotplug: determine block size based on the end of boot memory") chooses the block size based on alignment with memory end. That addresses hotplug failures in qemu guests, but for bare metal systems whose memory end isn't aligned to even the smallest size, it leaves them at 128M. Make kernels that aren't running on a hypervisor use the largest supported size (2G) to minimize overhead on big machines. Kernel boot goes 7% faster on the aforementioned servers, shaving off half a second. [daniel.m.jordan@oracle.com: v3] Link: http://lkml.kernel.org/r/20200714205450.945834-1-daniel.m.jordan@oracle.com Signed-off-by: Daniel Jordan Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Cc: Andy Lutomirski Cc: Dave Hansen Cc: David Hildenbrand Cc: Michal Hocko Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Steven Sistare Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200609225451.3542648-1-daniel.m.jordan@oracle.com Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3b246ae40c8f..a4ac13cc3fdc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1452,6 +1452,15 @@ static unsigned long probe_memory_block_size(void) goto done; } + /* + * Use max block size to minimize overhead on bare metal, where + * alignment for memory hotplug isn't a concern. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + bz = MAX_BLOCK_SIZE; + goto done; + } + /* Find the largest allowed block size that aligns to memory end */ for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { if (IS_ALIGNED(boot_mem_end, bz)) -- cgit v1.2.3 From d622ecec5f57ee7e27fbc6db222d8cbaf3c59478 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 11 Aug 2020 18:32:16 -0700 Subject: mm/memory_hotplug: introduce default dummy memory_add_physaddr_to_nid() This is to introduce a general dummy helper. memory_add_physaddr_to_nid() is a fallback option to get the nid in case NUMA_NO_NID is detected. After this patch, arm64/sh/s390 can simply use the general dummy version. PowerPC/x86/ia64 will still use their specific version. This is the preparation to set a fallback value for dev_dax->target_node. Signed-off-by: Jia He Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Cc: Dan Williams Cc: Michal Hocko Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Vishal Verma Cc: Dave Jiang Cc: Baoquan He Cc: Chuhong Yuan Cc: Mike Rapoport Cc: Logan Gunthorpe Cc: Masahiro Yamada Cc: Jonathan Cameron Cc: Kaly Xin Link: http://lkml.kernel.org/r/20200710031619.18762-2-justin.he@arm.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/numa.c | 10 ---------- arch/ia64/mm/numa.c | 2 -- arch/sh/mm/init.c | 9 --------- arch/x86/mm/numa.c | 1 - 4 files changed, 22 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index aafcee3e3f7e..73f8b49d485c 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -461,13 +461,3 @@ void __init arm64_numa_init(void) numa_init(dummy_numa_init); } - -/* - * We hope that we will be hotplugging memory on nodes we already know about, - * such that acpi_get_node() succeeds and we never fall back to this... - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - pr_warn("Unknown node for memory at 0x%llx, assuming node 0\n", addr); - return 0; -} diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index 5e1015eb6d0d..f34964271101 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -106,7 +106,5 @@ int memory_add_physaddr_to_nid(u64 addr) return 0; return nid; } - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #endif diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 613de8096335..cd1379360f08 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -425,15 +425,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -#ifdef CONFIG_NUMA -int memory_add_physaddr_to_nid(u64 addr) -{ - /* Node 0 for now.. */ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b05f45e5e8e2..aa76ec2d359b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -929,5 +929,4 @@ int memory_add_physaddr_to_nid(u64 start) nid = numa_meminfo.blk[0].nid; return nid; } -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -- cgit v1.2.3 From c56771b3095d4f1af82847081b6781fc03a13904 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Tue, 11 Aug 2020 18:32:43 -0700 Subject: sh/mm: drop unused MAX_PHYSADDR_BITS The macro is not used anywhere, so remove the definition. Signed-off-by: Arvind Sankar Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Dave Hansen Acked-by: Mike Rapoport Link: http://lkml.kernel.org/r/20200723231544.17274-3-nivedita@alum.mit.edu Signed-off-by: Linus Torvalds --- arch/sh/include/asm/sparsemem.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/sh/include/asm/sparsemem.h b/arch/sh/include/asm/sparsemem.h index 4eb899751e45..084706bb8cca 100644 --- a/arch/sh/include/asm/sparsemem.h +++ b/arch/sh/include/asm/sparsemem.h @@ -5,11 +5,9 @@ #ifdef __KERNEL__ /* * SECTION_SIZE_BITS 2^N: how big each section will be - * MAX_PHYSADDR_BITS 2^N: how much physical address space we have - * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + * MAX_PHYSMEM_BITS 2^N: how much physical address space we have */ #define SECTION_SIZE_BITS 26 -#define MAX_PHYSADDR_BITS 32 #define MAX_PHYSMEM_BITS 32 #endif -- cgit v1.2.3 From 6f8c00ff5aa88c847af9af35131d72fae480b566 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Tue, 11 Aug 2020 18:32:46 -0700 Subject: sparc: drop unused MAX_PHYSADDR_BITS The macro is not used anywhere, so remove the definition. Signed-off-by: Arvind Sankar Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Dave Hansen Acked-by: David S. Miller Acked-by: Mike Rapoport Link: http://lkml.kernel.org/r/20200723231544.17274-4-nivedita@alum.mit.edu Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/sparsemem.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/include/asm/sparsemem.h b/arch/sparc/include/asm/sparsemem.h index 1dd1b61432db..aa9a676bc341 100644 --- a/arch/sparc/include/asm/sparsemem.h +++ b/arch/sparc/include/asm/sparsemem.h @@ -7,7 +7,6 @@ #include #define SECTION_SIZE_BITS 30 -#define MAX_PHYSADDR_BITS MAX_PHYS_ADDRESS_BITS #define MAX_PHYSMEM_BITS MAX_PHYS_ADDRESS_BITS #endif /* !(__KERNEL__) */ -- cgit v1.2.3 From bfe00c5bbd9ee37b99c281429556c335271d027b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:34 -0700 Subject: syscalls: use uaccess_kernel in addr_limit_user_check Patch series "clean up address limit helpers", v2. In preparation for eventually phasing out direct use of set_fs(), this series removes the segment_eq() arch helper that is only used to implement or duplicate the uaccess_kernel() API, and then adds descriptive helpers to force the kernel address limit. This patch (of 6): Use the uaccess_kernel helper instead of duplicating it. [hch@lst.de: arm: don't call addr_limit_user_check for nommu] Link: http://lkml.kernel.org/r/20200721045834.GA9613@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Tested-by: Guenter Roeck Acked-by: Linus Torvalds Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200714105505.935079-1-hch@lst.de Link: http://lkml.kernel.org/r/20200710135706.537715-1-hch@lst.de Link: http://lkml.kernel.org/r/20200710135706.537715-2-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index ab2568996ddb..c9dc912b83f0 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -713,7 +713,9 @@ struct page *get_signal_page(void) /* Defer to generic check */ asmlinkage void addr_limit_check_failed(void) { +#ifdef CONFIG_MMU addr_limit_user_check(); +#endif } #ifdef CONFIG_DEBUG_RSEQ -- cgit v1.2.3 From 9af0f90aea46081566bf2014b6dfeb87b792005c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:38 -0700 Subject: nds32: use uaccess_kernel in show_regs Use the uaccess_kernel helper instead of duplicating it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Acked-by: Greentime Hu Cc: Nick Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200710135706.537715-3-hch@lst.de Signed-off-by: Linus Torvalds --- arch/nds32/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c index e85bbbadc0e7..e01ad5d17224 100644 --- a/arch/nds32/kernel/process.c +++ b/arch/nds32/kernel/process.c @@ -121,7 +121,7 @@ void show_regs(struct pt_regs *regs) regs->uregs[3], regs->uregs[2], regs->uregs[1], regs->uregs[0]); pr_info(" IRQs o%s Segment %s\n", interrupts_enabled(regs) ? "n" : "ff", - segment_eq(get_fs(), KERNEL_DS)? "kernel" : "user"); + uaccess_kernel() ? "kernel" : "user"); } EXPORT_SYMBOL(show_regs); -- cgit v1.2.3 From efbfc62e1d9ce835fd3ea5f25db1844e1496ced6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:41 -0700 Subject: riscv: include in To ensure TASK_SIZE is defined for USER_DS. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Acked-by: Palmer Dabbelt Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200710135706.537715-4-hch@lst.de Signed-off-by: Linus Torvalds --- arch/riscv/include/asm/uaccess.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 8ce9d607b53d..22de922d6ecb 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -8,6 +8,8 @@ #ifndef _ASM_RISCV_UACCESS_H #define _ASM_RISCV_UACCESS_H +#include /* for TASK_SIZE */ + /* * User space memory access functions */ -- cgit v1.2.3 From 428e2976a5bf7e7f5554286d7a5a33b8147b106a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:44 -0700 Subject: uaccess: remove segment_eq segment_eq is only used to implement uaccess_kernel. Just open code uaccess_kernel in the arch uaccess headers and remove one layer of indirection. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven Cc: Nick Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Link: http://lkml.kernel.org/r/20200710135706.537715-5-hch@lst.de Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/uaccess.h | 2 +- arch/arc/include/asm/segment.h | 3 +-- arch/arm/include/asm/uaccess.h | 4 ++-- arch/arm64/include/asm/uaccess.h | 2 +- arch/csky/include/asm/segment.h | 2 +- arch/h8300/include/asm/segment.h | 2 +- arch/ia64/include/asm/uaccess.h | 2 +- arch/m68k/include/asm/segment.h | 2 +- arch/microblaze/include/asm/uaccess.h | 2 +- arch/mips/include/asm/uaccess.h | 2 +- arch/nds32/include/asm/uaccess.h | 2 +- arch/nios2/include/asm/uaccess.h | 2 +- arch/openrisc/include/asm/uaccess.h | 2 +- arch/parisc/include/asm/uaccess.h | 2 +- arch/powerpc/include/asm/uaccess.h | 3 +-- arch/riscv/include/asm/uaccess.h | 4 +--- arch/s390/include/asm/uaccess.h | 2 +- arch/sh/include/asm/segment.h | 3 +-- arch/sparc/include/asm/uaccess_32.h | 2 +- arch/sparc/include/asm/uaccess_64.h | 2 +- arch/x86/include/asm/uaccess.h | 2 +- arch/xtensa/include/asm/uaccess.h | 2 +- 22 files changed, 23 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 1fe2b56cb861..1b6f25efa247 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -20,7 +20,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) /* * Is a address valid? This does a straightforward calculation rather diff --git a/arch/arc/include/asm/segment.h b/arch/arc/include/asm/segment.h index 6a2a5be5026d..871f8ab11bfd 100644 --- a/arch/arc/include/asm/segment.h +++ b/arch/arc/include/asm/segment.h @@ -14,8 +14,7 @@ typedef unsigned long mm_segment_t; #define KERNEL_DS MAKE_MM_SEG(0) #define USER_DS MAKE_MM_SEG(TASK_SIZE) - -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) #endif /* __ASSEMBLY__ */ #endif /* __ASMARC_SEGMENT_H */ diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index b5fdd30252f8..a13d90206472 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -76,7 +76,7 @@ static inline void set_fs(mm_segment_t fs) modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); } -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) /* * We use 33-bit arithmetic here. Success returns zero, failure returns @@ -267,7 +267,7 @@ extern int __put_user_8(void *, unsigned long long); */ #define USER_DS KERNEL_DS -#define segment_eq(a, b) (1) +#define uaccess_kernel() (true) #define __addr_ok(addr) ((void)(addr), 1) #define __range_ok(addr, size) ((void)(addr), 0) #define get_fs() (KERNEL_DS) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 8d7c466f809b..991dd5f031e4 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -50,7 +50,7 @@ static inline void set_fs(mm_segment_t fs) CONFIG_ARM64_UAO)); } -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) /* * Test whether a block of memory is a valid user space address. diff --git a/arch/csky/include/asm/segment.h b/arch/csky/include/asm/segment.h index db2640d5f575..79ede9b1a646 100644 --- a/arch/csky/include/asm/segment.h +++ b/arch/csky/include/asm/segment.h @@ -13,6 +13,6 @@ typedef struct { #define USER_DS ((mm_segment_t) { 0x80000000UL }) #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #endif /* __ASM_CSKY_SEGMENT_H */ diff --git a/arch/h8300/include/asm/segment.h b/arch/h8300/include/asm/segment.h index a407978f9f9f..37950725d9b9 100644 --- a/arch/h8300/include/asm/segment.h +++ b/arch/h8300/include/asm/segment.h @@ -33,7 +33,7 @@ static inline mm_segment_t get_fs(void) return USER_DS; } -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #endif /* __ASSEMBLY__ */ diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 8aa473a4b0f4..179243c3dfc7 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -50,7 +50,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) /* * When accessing user memory, we need to make sure the entire area really is in diff --git a/arch/m68k/include/asm/segment.h b/arch/m68k/include/asm/segment.h index c6686559e9b7..2b5e68a71ef7 100644 --- a/arch/m68k/include/asm/segment.h +++ b/arch/m68k/include/asm/segment.h @@ -52,7 +52,7 @@ static inline void set_fs(mm_segment_t val) #define set_fs(x) (current_thread_info()->addr_limit = (x)) #endif -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #endif /* __ASSEMBLY__ */ diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index 6723c56ec378..304b04ffea2f 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -41,7 +41,7 @@ # define get_fs() (current_thread_info()->addr_limit) # define set_fs(val) (current_thread_info()->addr_limit = (val)) -# define segment_eq(a, b) ((a).seg == (b).seg) +# define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #ifndef CONFIG_MMU diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 62b298c50905..61fc01f177a6 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -72,7 +72,7 @@ extern u64 __ua_limit; #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) /* * eva_kernel_access() - determine whether kernel memory access on an EVA system diff --git a/arch/nds32/include/asm/uaccess.h b/arch/nds32/include/asm/uaccess.h index 3a9219f53ee0..010ba5f1d7dd 100644 --- a/arch/nds32/include/asm/uaccess.h +++ b/arch/nds32/include/asm/uaccess.h @@ -44,7 +44,7 @@ static inline void set_fs(mm_segment_t fs) current_thread_info()->addr_limit = fs; } -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) #define __range_ok(addr, size) (size <= get_fs() && addr <= (get_fs() -size)) diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h index e83f831a76f9..a741abbed6fb 100644 --- a/arch/nios2/include/asm/uaccess.h +++ b/arch/nios2/include/asm/uaccess.h @@ -30,7 +30,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(seg) (current_thread_info()->addr_limit = (seg)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define __access_ok(addr, len) \ (((signed long)(((long)get_fs().seg) & \ diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index 17c24f14615f..48b691530d3e 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -43,7 +43,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) /* Ensure that the range from addr to addr+size is all within the process' * address space diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ebbb9ffe038c..ed2cd4fb479b 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -14,7 +14,7 @@ #define KERNEL_DS ((mm_segment_t){0}) #define USER_DS ((mm_segment_t){1}) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 64c04ab09112..00699903f1ef 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -38,8 +38,7 @@ static inline void set_fs(mm_segment_t fs) set_thread_flag(TIF_FSCHECK); } -#define segment_eq(a, b) ((a).seg == (b).seg) - +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define user_addr_max() (get_fs().seg) #ifdef __powerpc64__ diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 22de922d6ecb..f56c66b3f5fe 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -64,11 +64,9 @@ static inline void set_fs(mm_segment_t fs) current_thread_info()->addr_limit = fs; } -#define segment_eq(a, b) ((a).seg == (b).seg) - +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define user_addr_max() (get_fs().seg) - /** * access_ok: - Checks if a user space pointer is valid * @addr: User space pointer to start of block to check diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 324438889fe1..f09444d6aeab 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -32,7 +32,7 @@ #define USER_DS_SACF (3) #define get_fs() (current->thread.mm_segment) -#define segment_eq(a,b) (((a) & 2) == ((b) & 2)) +#define uaccess_kernel() ((get_fs() & 2) == KERNEL_DS) void set_fs(mm_segment_t fs); diff --git a/arch/sh/include/asm/segment.h b/arch/sh/include/asm/segment.h index 33d1d28057cb..02e54a3335d6 100644 --- a/arch/sh/include/asm/segment.h +++ b/arch/sh/include/asm/segment.h @@ -24,8 +24,7 @@ typedef struct { #define USER_DS KERNEL_DS #endif -#define segment_eq(a, b) ((a).seg == (b).seg) - +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index d6d8413eca83..0a2d3ebc4bb8 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -28,7 +28,7 @@ #define get_fs() (current->thread.current_ds) #define set_fs(val) ((current->thread.current_ds) = (val)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) /* We have there a nice not-mapped page at PAGE_OFFSET - PAGE_SIZE, so that this test * can be fairly lightweight. diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index bf9d330073b2..698cf69f74e9 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -32,7 +32,7 @@ #define get_fs() ((mm_segment_t){(current_thread_info()->current_ds)}) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define set_fs(val) \ do { \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 2f3e8f2a958f..ecefaffd15d4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -33,7 +33,7 @@ static inline void set_fs(mm_segment_t fs) set_thread_flag(TIF_FSCHECK); } -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define user_addr_max() (current->thread.addr_limit.seg) /* diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index e57f0d0a88d8..b9758119feca 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -35,7 +35,7 @@ #define get_fs() (current->thread.current_ds) #define set_fs(val) (current->thread.current_ds = (val)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define __kernel_ok (uaccess_kernel()) #define __user_ok(addr, size) \ -- cgit v1.2.3 From 3d13f313ce4c34c524ccc37986fe77172f601ff3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:47 -0700 Subject: uaccess: add force_uaccess_{begin,end} helpers Add helpers to wrap the get_fs/set_fs magic for undoing any damange done by set_fs(KERNEL_DS). There is no real functional benefit, but this documents the intent of these calls better, and will allow stubbing the functions out easily for kernels builds that do not allow address space overrides in the future. [hch@lst.de: drop two incorrect hunks, fix a commit log typo] Link: http://lkml.kernel.org/r/20200714105505.935079-6-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Acked-by: Mark Rutland Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven Cc: Nick Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Link: http://lkml.kernel.org/r/20200710135706.537715-6-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/kernel/sdei.c | 2 +- arch/m68k/include/asm/tlbflush.h | 6 +++--- arch/mips/kernel/unaligned.c | 27 +++++++++++++-------------- arch/nds32/mm/alignment.c | 7 +++---- arch/sh/kernel/traps_32.c | 12 +++++------- 5 files changed, 25 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index dab88260b137..7689f2031c0c 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -180,7 +180,7 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, /* * We didn't take an exception to get here, set PAN. UAO will be cleared - * by sdei_event_handler()s set_fs(USER_DS) call. + * by sdei_event_handler()s force_uaccess_begin() call. */ __uaccess_enable_hw_pan(); diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h index 191e75a6bb24..5337bc2c262f 100644 --- a/arch/m68k/include/asm/tlbflush.h +++ b/arch/m68k/include/asm/tlbflush.h @@ -85,10 +85,10 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { if (vma->vm_mm == current->active_mm) { - mm_segment_t old_fs = get_fs(); - set_fs(USER_DS); + mm_segment_t old_fs = force_uaccess_begin(); + __flush_tlb_one(addr); - set_fs(old_fs); + force_uaccess_end(old_fs); } } diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index 0adce604fa44..126a5f3f4e4c 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -191,17 +191,16 @@ static void emulate_load_store_insn(struct pt_regs *regs, * memory, so we need to "switch" the address limit to * user space, so that address check can work properly. */ - seg = get_fs(); - set_fs(USER_DS); + seg = force_uaccess_begin(); switch (insn.spec3_format.func) { case lhe_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadHWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -209,12 +208,12 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case lwe_op: if (!access_ok(addr, 4)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -222,12 +221,12 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case lhue_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadHWUE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -235,35 +234,35 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case she_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } compute_return_epc(regs); value = regs->regs[insn.spec3_format.rt]; StoreHWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } break; case swe_op: if (!access_ok(addr, 4)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } compute_return_epc(regs); value = regs->regs[insn.spec3_format.rt]; StoreWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } break; default: - set_fs(seg); + force_uaccess_end(seg); goto sigill; } - set_fs(seg); + force_uaccess_end(seg); } #endif break; diff --git a/arch/nds32/mm/alignment.c b/arch/nds32/mm/alignment.c index c8b9061a2ee3..1eb7ded6992b 100644 --- a/arch/nds32/mm/alignment.c +++ b/arch/nds32/mm/alignment.c @@ -512,7 +512,7 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs) { unsigned long inst; int ret = -EFAULT; - mm_segment_t seg = get_fs(); + mm_segment_t seg; inst = get_inst(regs->ipc); @@ -520,13 +520,12 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs) "Faulting addr: 0x%08lx, pc: 0x%08lx [inst: 0x%08lx ]\n", addr, regs->ipc, inst); - set_fs(USER_DS); - + seg = force_uaccess_begin(); if (inst & NDS32_16BIT_INSTRUCTION) ret = do_16((inst >> 16) & 0xffff, regs); else ret = do_32(inst, regs); - set_fs(seg); + force_uaccess_end(seg); return ret; } diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 058c6181bb30..b62ad0ba2395 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -482,8 +482,6 @@ asmlinkage void do_address_error(struct pt_regs *regs, error_code = lookup_exception_vector(); #endif - oldfs = get_fs(); - if (user_mode(regs)) { int si_code = BUS_ADRERR; unsigned int user_action; @@ -491,13 +489,13 @@ asmlinkage void do_address_error(struct pt_regs *regs, local_irq_enable(); inc_unaligned_user_access(); - set_fs(USER_DS); + oldfs = force_uaccess_begin(); if (copy_from_user(&instruction, (insn_size_t *)(regs->pc & ~1), sizeof(instruction))) { - set_fs(oldfs); + force_uaccess_end(oldfs); goto uspace_segv; } - set_fs(oldfs); + force_uaccess_end(oldfs); /* shout about userspace fixups */ unaligned_fixups_notify(current, instruction, regs); @@ -520,11 +518,11 @@ fixup: goto uspace_segv; } - set_fs(USER_DS); + oldfs = force_uaccess_begin(); tmp = handle_unaligned_access(instruction, regs, &user_mem_access, 0, address); - set_fs(oldfs); + force_uaccess_end(oldfs); if (tmp == 0) return; /* sorted */ -- cgit v1.2.3 From bd72866b8da499e60633ff28f8a4f6e09ca78efe Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Tue, 11 Aug 2020 18:33:54 -0700 Subject: alpha: fix annotation of io{read,write}{16,32}be() These accessors must be used to read/write a big-endian bus. The value returned or written is native-endian. However, these accessors are defined using be{16,32}_to_cpu() or cpu_to_be{16,32}() to make the endian conversion but these expect a __be{16,32} when none is present. Keeping them would need a force cast that would solve nothing at all. So, do the conversion using swab{16,32}, like done in asm-generic for similar situations. Reported-by: kernel test robot Signed-off-by: Luc Van Oostenryck Signed-off-by: Andrew Morton Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Stephen Boyd Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200622114232.80039-1-luc.vanoostenryck@gmail.com Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/io.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index a4d0c19f1e79..640e1a2f57b4 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -489,10 +489,10 @@ extern inline void writeq(u64 b, volatile void __iomem *addr) } #endif -#define ioread16be(p) be16_to_cpu(ioread16(p)) -#define ioread32be(p) be32_to_cpu(ioread32(p)) -#define iowrite16be(v,p) iowrite16(cpu_to_be16(v), (p)) -#define iowrite32be(v,p) iowrite32(cpu_to_be32(v), (p)) +#define ioread16be(p) swab16(ioread16(p)) +#define ioread32be(p) swab32(ioread32(p)) +#define iowrite16be(v,p) iowrite16(swab16(v), (p)) +#define iowrite32be(v,p) iowrite32(swab32(v), (p)) #define inb_p inb #define inw_p inw -- cgit v1.2.3 From bce617edecada007aee8610fbe2c14d10b8de2f6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:37:44 -0700 Subject: mm: do page fault accounting in handle_mm_fault Patch series "mm: Page fault accounting cleanups", v5. This is v5 of the pf accounting cleanup series. It originates from Gerald Schaefer's report on an issue a week ago regarding to incorrect page fault accountings for retried page fault after commit 4064b9827063 ("mm: allow VM_FAULT_RETRY for multiple times"): https://lore.kernel.org/lkml/20200610174811.44b94525@thinkpad/ What this series did: - Correct page fault accounting: we do accounting for a page fault (no matter whether it's from #PF handling, or gup, or anything else) only with the one that completed the fault. For example, page fault retries should not be counted in page fault counters. Same to the perf events. - Unify definition of PERF_COUNT_SW_PAGE_FAULTS: currently this perf event is used in an adhoc way across different archs. Case (1): for many archs it's done at the entry of a page fault handler, so that it will also cover e.g. errornous faults. Case (2): for some other archs, it is only accounted when the page fault is resolved successfully. Case (3): there're still quite some archs that have not enabled this perf event. Since this series will touch merely all the archs, we unify this perf event to always follow case (1), which is the one that makes most sense. And since we moved the accounting into handle_mm_fault, the other two MAJ/MIN perf events are well taken care of naturally. - Unify definition of "major faults": the definition of "major fault" is slightly changed when used in accounting (not VM_FAULT_MAJOR). More information in patch 1. - Always account the page fault onto the one that triggered the page fault. This does not matter much for #PF handlings, but mostly for gup. More information on this in patch 25. Patchset layout: Patch 1: Introduced the accounting in handle_mm_fault(), not enabled. Patch 2-23: Enable the new accounting for arch #PF handlers one by one. Patch 24: Enable the new accounting for the rest outliers (gup, iommu, etc.) Patch 25: Cleanup GUP task_struct pointer since it's not needed any more This patch (of 25): This is a preparation patch to move page fault accountings into the general code in handle_mm_fault(). This includes both the per task flt_maj/flt_min counters, and the major/minor page fault perf events. To do this, the pt_regs pointer is passed into handle_mm_fault(). PERF_COUNT_SW_PAGE_FAULTS should still be kept in per-arch page fault handlers. So far, all the pt_regs pointer that passed into handle_mm_fault() is NULL, which means this patch should have no intented functional change. Suggested-by: Linus Torvalds Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Greentime Hu Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James E.J. Bottomley Cc: John Hubbard Cc: Jonas Bonn Cc: Ley Foon Tan Cc: "Luck, Tony" Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Nick Hu Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200707225021.200906-1-peterx@redhat.com Link: http://lkml.kernel.org/r/20200707225021.200906-2-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 +- arch/arc/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/csky/mm/fault.c | 3 ++- arch/hexagon/mm/vm_fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/nds32/mm/fault.c | 2 +- arch/nios2/mm/fault.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/copro_fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/riscv/mm/fault.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/sh/mm/fault.c | 2 +- arch/sparc/mm/fault_32.c | 4 ++-- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- 24 files changed, 26 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index c2303a8c2b9f..1983e43a5e2f 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -148,7 +148,7 @@ retry: /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 7287c793d1c9..587dea524e6b 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -130,7 +130,7 @@ retry: goto bad_area; } - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c6550eddfce1..01a8e0f8fef7 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -224,7 +224,7 @@ good_area: goto out; } - return handle_mm_fault(vma, addr & PAGE_MASK, flags); + return handle_mm_fault(vma, addr & PAGE_MASK, flags, NULL); check_stack: /* Don't allow expansion below FIRST_USER_ADDRESS */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 8afb238ff335..be29f4076fe3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -428,7 +428,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, */ if (!(vma->vm_flags & vm_flags)) return VM_FAULT_BADACCESS; - return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags); + return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, NULL); } static bool is_el0_instruction_abort(unsigned int esr) diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index b1dce9f2f04d..b252e6e4d32f 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -150,7 +150,8 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0, + NULL); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index cd3808f96b93..f12f330e7946 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -88,7 +88,7 @@ good_area: break; } - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 3a4dec334cc5..abf2808f9b4b 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -143,7 +143,7 @@ retry: * sure we exit gracefully rather than endlessly redo the * fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 508abb63da67..08b35a318ebe 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -134,7 +134,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); pr_debug("handle_mm_fault returns %x\n", fault); if (fault_signal_pending(fault, regs)) diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index a2bfe587b491..1a3d4c4ca28b 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -214,7 +214,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 01b168a90434..b1db39784db9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -152,7 +152,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c index 8fb73f6401a0..d0ecc8fb5b23 100644 --- a/arch/nds32/mm/fault.c +++ b/arch/nds32/mm/fault.c @@ -206,7 +206,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, addr, flags); + fault = handle_mm_fault(vma, addr, flags, NULL); /* * If we need to retry but a fatal signal is pending, handle the diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index 4112ef0e247e..86beb9a2698e 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -131,7 +131,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index d2224ccca294..3daa491d1edb 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -159,7 +159,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 66ac0719bd49..e32d06928c24 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -302,7 +302,7 @@ good_area: * fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index b83abbead4a2..2d0276abe0a6 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -64,7 +64,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, } ret = 0; - *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0); + *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL); if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 925a7231abb3..c6a5225a3521 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -511,7 +511,7 @@ retry: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); major |= fault & VM_FAULT_MAJOR; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 5873835a3e6b..30c1124d0fb6 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -109,7 +109,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, addr, flags); + fault = handle_mm_fault(vma, addr, flags, NULL); /* * If we need to retry but a fatal signal is pending, handle the diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index aebf9183bedd..ad783aaaf649 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -476,7 +476,7 @@ retry: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; if (flags & FAULT_FLAG_RETRY_NOWAIT) diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index fbe1f2fe9a8c..3c0a11827f7e 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -482,7 +482,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR))) if (mm_fault_error(regs, error_code, address, fault)) diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index cfef656eda0f..06af03db4417 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -234,7 +234,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; @@ -410,7 +410,7 @@ good_area: if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(vma, address, flags)) { + switch (handle_mm_fault(vma, address, flags, NULL)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index a3806614e4dc..9ebee14ee893 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -422,7 +422,7 @@ good_area: goto bad_area; } - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) goto exit_exception; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 2b3afa354a90..8d9870d76da1 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -71,7 +71,7 @@ good_area: do { vm_fault_t fault; - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto out_nosemaphore; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0c7643d9f7cb..e1bf5555d80a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1291,7 +1291,7 @@ good_area: * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); major |= fault & VM_FAULT_MAJOR; /* Quick path to respond to signals */ diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index c128dcc7c85b..e72c8c1359a6 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -107,7 +107,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; -- cgit v1.2.3 From c0f6eda41f97dbe4e4feda914c18d019b9b9fccd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:37:49 -0700 Subject: mm/alpha: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). Add the missing PERF_COUNT_SW_PAGE_FAULTS perf events too. Note, the other two perf events (PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN]) were done in handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Link: http://lkml.kernel.org/r/20200707225021.200906-3-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 1983e43a5e2f..09172f017efc 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -25,6 +25,7 @@ #include #include #include +#include extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *); @@ -116,6 +117,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, #endif if (user_mode(regs)) flags |= FAULT_FLAG_USER; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: mmap_read_lock(mm); vma = find_vma(mm, address); @@ -148,7 +150,7 @@ retry: /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -164,10 +166,6 @@ retry: } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From 52e3f8d03052036ce97296915a3746421a1da1d0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:37:52 -0700 Subject: mm/arc: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Fix PERF_COUNT_SW_PAGE_FAULTS perf event manually for page fault retries, by moving it before taking mmap_sem. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Vineet Gupta Link: http://lkml.kernel.org/r/20200707225021.200906-4-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/arc/mm/fault.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 587dea524e6b..f5657cb68e4f 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -105,6 +105,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) if (write) flags |= FAULT_FLAG_WRITE; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: mmap_read_lock(mm); @@ -130,7 +131,7 @@ retry: goto bad_area; } - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -155,22 +156,9 @@ bad_area: * Major/minor page fault accounting * (in case of retry we only land here once) */ - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - - if (likely(!(fault & VM_FAULT_ERROR))) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - + if (likely(!(fault & VM_FAULT_ERROR))) /* Normal return path: fault Handled Gracefully */ return; - } if (!user_mode(regs)) goto no_context; -- cgit v1.2.3 From 79fea6c6548e28400d7870c61477d35aecb7baf8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:37:54 -0700 Subject: mm/arm: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. To do this, we need to pass the pt_regs pointer into __do_page_fault(). Fix PERF_COUNT_SW_PAGE_FAULTS perf event manually for page fault retries, by moving it before taking mmap_sem. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/20200707225021.200906-5-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/arm/mm/fault.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 01a8e0f8fef7..efa402025031 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -202,7 +202,8 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) static vm_fault_t __kprobes __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, - unsigned int flags, struct task_struct *tsk) + unsigned int flags, struct task_struct *tsk, + struct pt_regs *regs) { struct vm_area_struct *vma; vm_fault_t fault; @@ -224,7 +225,7 @@ good_area: goto out; } - return handle_mm_fault(vma, addr & PAGE_MASK, flags, NULL); + return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); check_stack: /* Don't allow expansion below FIRST_USER_ADDRESS */ @@ -266,6 +267,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if ((fsr & FSR_WRITE) && !(fsr & FSR_CM)) flags |= FAULT_FLAG_WRITE; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, @@ -290,7 +293,7 @@ retry: #endif } - fault = __do_page_fault(mm, addr, fsr, flags, tsk); + fault = __do_page_fault(mm, addr, fsr, flags, tsk, regs); /* If we need to retry but a fatal signal is pending, handle the * signal first. We do not need to release the mmap_lock because @@ -302,23 +305,7 @@ retry: return 0; } - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ - - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, addr); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, addr); - } if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; goto retry; -- cgit v1.2.3 From 6a1bb025d28e1026fead73b7b33e2dfccba3f4d2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:37:57 -0700 Subject: mm/arm64: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. To do this, we pass pt_regs pointer into __do_page_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Will Deacon Cc: Catalin Marinas Link: http://lkml.kernel.org/r/20200707225021.200906-6-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/fault.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index be29f4076fe3..f07333e86c2f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -404,7 +404,8 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADACCESS 0x020000 static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, - unsigned int mm_flags, unsigned long vm_flags) + unsigned int mm_flags, unsigned long vm_flags, + struct pt_regs *regs) { struct vm_area_struct *vma = find_vma(mm, addr); @@ -428,7 +429,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, */ if (!(vma->vm_flags & vm_flags)) return VM_FAULT_BADACCESS; - return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, NULL); + return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs); } static bool is_el0_instruction_abort(unsigned int esr) @@ -450,7 +451,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, { const struct fault_info *inf; struct mm_struct *mm = current->mm; - vm_fault_t fault, major = 0; + vm_fault_t fault; unsigned long vm_flags = VM_ACCESS_FLAGS; unsigned int mm_flags = FAULT_FLAG_DEFAULT; @@ -516,8 +517,7 @@ retry: #endif } - fault = __do_page_fault(mm, addr, mm_flags, vm_flags); - major |= fault & VM_FAULT_MAJOR; + fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -538,25 +538,8 @@ retry: * Handle the "normal" (no error) case first. */ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | - VM_FAULT_BADACCESS)))) { - /* - * Major/minor page fault accounting is only done - * once. If we go through a retry, it is extremely - * likely that the page will be found in page cache at - * that point. - */ - if (major) { - current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, - addr); - } else { - current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, - addr); - } - + VM_FAULT_BADACCESS)))) return 0; - } /* * If we are in kernel mode at this point, we have no context to -- cgit v1.2.3 From a2a9e439baf8aca2af1e614fab7956c09091a6d1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:00 -0700 Subject: mm/csky: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Guo Ren Link: http://lkml.kernel.org/r/20200707225021.200906-7-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/csky/mm/fault.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index b252e6e4d32f..081b178b41b1 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -151,7 +151,7 @@ good_area: * the fault. */ fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0, - NULL); + regs); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -161,16 +161,6 @@ good_area: goto bad_area; BUG(); } - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, - address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, - address); - } - mmap_read_unlock(mm); return; -- cgit v1.2.3 From e08157c3c4239fac29879143019c96498ba9c2bc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:03 -0700 Subject: mm/hexagon: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Add the missing PERF_COUNT_SW_PAGE_FAULTS perf events too. Note, the other two perf events (PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN]) were done in handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Brian Cain Link: http://lkml.kernel.org/r/20200707225021.200906-8-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/hexagon/mm/vm_fault.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index f12f330e7946..ef32c5a84ff3 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * Decode of hardware exception sends us to one of several @@ -53,6 +54,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) if (user_mode(regs)) flags |= FAULT_FLAG_USER; + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: mmap_read_lock(mm); vma = find_vma(mm, address); @@ -88,7 +91,7 @@ good_area: break; } - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -96,10 +99,6 @@ good_area: /* The most common case -- we are done. */ if (likely(!(fault & VM_FAULT_ERROR))) { if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; goto retry; -- cgit v1.2.3 From b444eed891cf8656af1f59d239f77c5338481774 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:06 -0700 Subject: mm/ia64: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Add the missing PERF_COUNT_SW_PAGE_FAULTS perf events too. Note, the other two perf events (PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN]) were done in handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: "Luck, Tony" Link: http://lkml.kernel.org/r/20200707225021.200906-9-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/ia64/mm/fault.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index abf2808f9b4b..cd9766d2b6e0 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -105,6 +106,8 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re flags |= FAULT_FLAG_USER; if (mask & VM_WRITE) flags |= FAULT_FLAG_WRITE; + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: mmap_read_lock(mm); @@ -143,7 +146,7 @@ retry: * sure we exit gracefully rather than endlessly redo the * fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -166,10 +169,6 @@ retry: } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From e1c17f627b4232e2c7e9edb7151fa60a746150ee Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:09 -0700 Subject: mm/m68k: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Add the missing PERF_COUNT_SW_PAGE_FAULTS perf events too. Note, the other two perf events (PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN]) were done in handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200707225021.200906-10-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/m68k/mm/fault.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 08b35a318ebe..795f483b1050 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -84,6 +85,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (user_mode(regs)) flags |= FAULT_FLAG_USER; + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: mmap_read_lock(mm); @@ -134,7 +137,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); pr_debug("handle_mm_fault returns %x\n", fault); if (fault_signal_pending(fault, regs)) @@ -150,16 +153,7 @@ good_area: BUG(); } - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From aeb6aefc3129307858e4d5a4b73ae28f1871ae05 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:12 -0700 Subject: mm/microblaze: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Add the missing PERF_COUNT_SW_PAGE_FAULTS perf events too. Note, the other two perf events (PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN]) were done in handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Michal Simek Link: http://lkml.kernel.org/r/20200707225021.200906-11-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/microblaze/mm/fault.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 1a3d4c4ca28b..b3fed2cecf84 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -121,6 +122,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, if (user_mode(regs)) flags |= FAULT_FLAG_USER; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -214,7 +217,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -230,10 +233,6 @@ good_area: } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (unlikely(fault & VM_FAULT_MAJOR)) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From 2558fd7f5c3eda31d4474c7cdc8dc4b3bb6526f5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:15 -0700 Subject: mm/mips: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Fix PERF_COUNT_SW_PAGE_FAULTS perf event manually for page fault retries, by moving it before taking mmap_sem. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Thomas Bogendoerfer Link: http://lkml.kernel.org/r/20200707225021.200906-12-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/mips/mm/fault.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index b1db39784db9..7c871b14e74a 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -96,6 +96,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, if (user_mode(regs)) flags |= FAULT_FLAG_USER; + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: mmap_read_lock(mm); vma = find_vma(mm, address); @@ -152,12 +154,11 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -168,15 +169,6 @@ good_area: BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - tsk->maj_flt++; - } else { - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - tsk->min_flt++; - } if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From daf7bf5d90397bc0f3d7b45030461f55eb9c74fa Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:19 -0700 Subject: mm/nds32: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Fix PERF_COUNT_SW_PAGE_FAULTS perf event manually for page fault retries, by moving it before taking mmap_sem. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Greentime Hu Cc: Nick Hu Cc: Vincent Chen Link: http://lkml.kernel.org/r/20200707225021.200906-13-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/nds32/mm/fault.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c index d0ecc8fb5b23..f02524eb6d56 100644 --- a/arch/nds32/mm/fault.c +++ b/arch/nds32/mm/fault.c @@ -121,6 +121,8 @@ void do_page_fault(unsigned long entry, unsigned long addr, if (unlikely(faulthandler_disabled() || !mm)) goto no_context; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, @@ -206,7 +208,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, addr, flags, NULL); + fault = handle_mm_fault(vma, addr, flags, regs); /* * If we need to retry but a fatal signal is pending, handle the @@ -228,22 +230,7 @@ good_area: goto bad_area; } - /* - * Major/minor page fault accounting is only done on the initial - * attempt. If we go through a retry, it is extremely likely that the - * page will be found in page cache at that point. - */ - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, - 1, regs, addr); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, - 1, regs, addr); - } if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From 4487dcf9b75180eb5115c196ca9bd6ebefade5b3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:22 -0700 Subject: mm/nios2: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Add the missing PERF_COUNT_SW_PAGE_FAULTS perf events too. Note, the other two perf events (PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN]) were done in handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Ley Foon Tan Link: http://lkml.kernel.org/r/20200707225021.200906-14-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/nios2/mm/fault.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index 86beb9a2698e..9476feecf512 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -83,6 +84,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, if (user_mode(regs)) flags |= FAULT_FLAG_USER; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + if (!mmap_read_trylock(mm)) { if (!user_mode(regs) && !search_exception_tables(regs->ea)) goto bad_area_nosemaphore; @@ -131,7 +134,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -146,16 +149,7 @@ good_area: BUG(); } - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From 38caa902dccad61e02273c18a633fc5be91aeca5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:25 -0700 Subject: mm/openrisc: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Add the missing PERF_COUNT_SW_PAGE_FAULTS perf events too. Note, the other two perf events (PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN]) were done in handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Stafford Horne Cc: Jonas Bonn Cc: Stefan Kristiansson Link: http://lkml.kernel.org/r/20200707225021.200906-15-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/openrisc/mm/fault.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 3daa491d1edb..ca97d9baab51 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -103,6 +104,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, if (in_interrupt() || !mm) goto no_context; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: mmap_read_lock(mm); vma = find_vma(mm, address); @@ -159,7 +162,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -176,10 +179,6 @@ good_area: if (flags & FAULT_FLAG_ALLOW_RETRY) { /*RGD modeled on Cris */ - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From af8a7926273645dc81f9e7f5c3e18136abebf05b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:28 -0700 Subject: mm/parisc: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Add the missing PERF_COUNT_SW_PAGE_FAULTS perf events too. Note, the other two perf events (PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN]) were done in handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: James E.J. Bottomley Cc: Helge Deller Link: http://lkml.kernel.org/r/20200707225021.200906-16-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/parisc/mm/fault.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index e32d06928c24..4bfe2da9fbe3 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -281,6 +282,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, acc_type = parisc_acctyp(code, regs->iir); if (acc_type & VM_WRITE) flags |= FAULT_FLAG_WRITE; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: mmap_read_lock(mm); vma = find_vma_prev(mm, address, &prev_vma); @@ -302,7 +304,7 @@ good_area: * fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -323,10 +325,6 @@ good_area: BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { /* * No need to mmap_read_unlock(mm) as we would -- cgit v1.2.3 From 428fdc094492e720c342f1c934e7972cbc328d13 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:31 -0700 Subject: mm/powerpc: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Link: http://lkml.kernel.org/r/20200707225021.200906-17-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/powerpc/mm/fault.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index c6a5225a3521..0add963a849b 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -511,7 +511,7 @@ retry: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); major |= fault & VM_FAULT_MAJOR; @@ -537,14 +537,9 @@ retry: /* * Major/minor page fault accounting. */ - if (major) { - current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + if (major) cmo_account_page_fault(); - } else { - current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); - } + return 0; } NOKPROBE_SYMBOL(__do_page_fault); -- cgit v1.2.3 From 5ac365a458902214adfbe3567c94e114cc91cde6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:34 -0700 Subject: mm/riscv: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Pekka Enberg Acked-by: Palmer Dabbelt Cc: Paul Walmsley Cc: Albert Ou Link: http://lkml.kernel.org/r/20200707225021.200906-18-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/riscv/mm/fault.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 30c1124d0fb6..716d64e36f83 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -109,7 +109,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, addr, flags, NULL); + fault = handle_mm_fault(vma, addr, flags, regs); /* * If we need to retry but a fatal signal is pending, handle the @@ -127,21 +127,7 @@ good_area: BUG(); } - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, - 1, regs, addr); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, - 1, regs, addr); - } if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From 35e45f3e5a1fe652df2153f2b1c7dd234d448356 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:37 -0700 Subject: mm/s390: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Gerald Schaefer Acked-by: Gerald Schaefer Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Link: http://lkml.kernel.org/r/20200707225021.200906-19-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/s390/mm/fault.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index ad783aaaf649..4c8c063bce5b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -476,7 +476,7 @@ retry: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; if (flags & FAULT_FLAG_RETRY_NOWAIT) @@ -486,21 +486,7 @@ retry: if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } if (fault & VM_FAULT_RETRY) { if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { -- cgit v1.2.3 From 105f886220e944b6aa01accfad59af49341703c4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:40 -0700 Subject: mm/sh: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Yoshinori Sato Cc: Rich Felker Link: http://lkml.kernel.org/r/20200707225021.200906-20-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/sh/mm/fault.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 3c0a11827f7e..482668a2f6d3 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -482,22 +482,13 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR))) if (mm_fault_error(regs, error_code, address, fault)) return; if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From 56e10e6ab11924b19b7c9583750fdd4e440e25c8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:43 -0700 Subject: mm/sparc32: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: David S. Miller Link: http://lkml.kernel.org/r/20200707225021.200906-21-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/sparc/mm/fault_32.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 06af03db4417..8071bfd72349 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -234,7 +234,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -250,15 +250,6 @@ good_area: } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, - 1, regs, address); - } else { - current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, - 1, regs, address); - } if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From f08147df4092dd5093b85dbccbd34a7843fb60bf Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:46 -0700 Subject: mm/sparc64: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: David S. Miller Link: http://lkml.kernel.org/r/20200707225021.200906-22-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/sparc/mm/fault_64.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 9ebee14ee893..0a6bcc85fba7 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -422,7 +422,7 @@ good_area: goto bad_area; } - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) goto exit_exception; @@ -438,15 +438,6 @@ good_area: } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, - 1, regs, address); - } else { - current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, - 1, regs, address); - } if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From 968614fc7b8410e1ee99d0111015a1bf3e955f64 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:49 -0700 Subject: mm/x86: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/20200707225021.200906-23-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e1bf5555d80a..35f1498e9832 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1139,7 +1139,7 @@ void do_user_addr_fault(struct pt_regs *regs, struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; - vm_fault_t fault, major = 0; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; tsk = current; @@ -1291,8 +1291,7 @@ good_area: * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ - fault = handle_mm_fault(vma, address, flags, NULL); - major |= fault & VM_FAULT_MAJOR; + fault = handle_mm_fault(vma, address, flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -1319,18 +1318,6 @@ good_area: return; } - /* - * Major/minor page fault accounting. If any of the events - * returned VM_FAULT_MAJOR, we account it as a major fault. - */ - if (major) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); - } - check_v8086_mode(regs, address, tsk); } NOKPROBE_SYMBOL(do_user_addr_fault); -- cgit v1.2.3 From 484e51e4af528a408b8e0f2db4865625169279cf Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:53 -0700 Subject: mm/xtensa: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. Remove the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf events because it's now also done in handle_mm_fault(). Move the PERF_COUNT_SW_PAGE_FAULTS event higher before taking mmap_sem for the fault, then it'll match with the rest of the archs. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Max Filippov Cc: Chris Zankel Link: http://lkml.kernel.org/r/20200707225021.200906-24-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/xtensa/mm/fault.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index e72c8c1359a6..7666408ce12a 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -72,6 +72,9 @@ void do_page_fault(struct pt_regs *regs) if (user_mode(regs)) flags |= FAULT_FLAG_USER; + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: mmap_read_lock(mm); vma = find_vma(mm, address); @@ -107,7 +110,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, NULL); + fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) return; @@ -122,10 +125,6 @@ good_area: BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; @@ -139,12 +138,6 @@ good_area: } mmap_read_unlock(mm); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (flags & VM_FAULT_MAJOR) - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); - else - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); - return; /* Something tried to access memory that isn't in our memory map.. -- cgit v1.2.3 From a2beb5f1efede6924a4258462a5660572e6ca864 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:38:57 -0700 Subject: mm: clean up the last pieces of page fault accountings Here're the last pieces of page fault accounting that were still done outside handle_mm_fault() where we still have regs==NULL when calling handle_mm_fault(): arch/powerpc/mm/copro_fault.c: copro_handle_mm_fault arch/sparc/mm/fault_32.c: force_user_fault arch/um/kernel/trap.c: handle_page_fault mm/gup.c: faultin_page fixup_user_fault mm/hmm.c: hmm_vma_fault mm/ksm.c: break_ksm Some of them has the issue of duplicated accounting for page fault retries. Some of them didn't do the accounting at all. This patch cleans all these up by letting handle_mm_fault() to do per-task page fault accounting even if regs==NULL (though we'll still skip the perf event accountings). With that, we can safely remove all the outliers now. There's another functional change in that now we account the page faults to the caller of gup, rather than the task_struct that passed into the gup code. More information of this can be found at [1]. After this patch, below things should never be touched again outside handle_mm_fault(): - task_struct.[maj|min]_flt - PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] [1] https://lore.kernel.org/lkml/CAHk-=wj_V2Tps2QrMn20_W0OJF9xqNh52XSGA42s-ZJ8Y+GyKw@mail.gmail.com/ Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Greentime Hu Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James E.J. Bottomley Cc: John Hubbard Cc: Jonas Bonn Cc: Ley Foon Tan Cc: "Luck, Tony" Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Nick Hu Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200707225021.200906-25-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/powerpc/mm/copro_fault.c | 5 ----- arch/um/kernel/trap.c | 4 ---- 2 files changed, 9 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 2d0276abe0a6..8acd00178956 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -76,11 +76,6 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, BUG(); } - if (*flt & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; - out_unlock: mmap_read_unlock(mm); return ret; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 8d9870d76da1..ad12f78bda7e 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -88,10 +88,6 @@ good_area: BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; -- cgit v1.2.3 From 64019a2e467a288a16b65ab55ddcbf58c1b00187 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:39:01 -0700 Subject: mm/gup: remove task_struct pointer for all gup code After the cleanup of page fault accounting, gup does not need to pass task_struct around any more. Remove that parameter in the whole gup stack. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Link: http://lkml.kernel.org/r/20200707225021.200906-26-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/arc/kernel/process.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/s390/kvm/priv.c | 8 ++++---- arch/s390/mm/gmap.c | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index e12c80d71b78..efeba1fe7252 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -91,7 +91,7 @@ fault: goto fail; mmap_read_lock(current->mm); - ret = fixup_user_fault(current, current->mm, (unsigned long) uaddr, + ret = fixup_user_fault(current->mm, (unsigned long) uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(current->mm); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1608fd99bbee..2f177298c663 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2768,7 +2768,7 @@ static struct page *get_map_page(struct kvm *kvm, u64 uaddr) struct page *page = NULL; mmap_read_lock(kvm->mm); - get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE, + get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, &page, NULL, NULL); mmap_read_unlock(kvm->mm); return page; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 66da278a67fb..6b74b92c1a58 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1892,7 +1892,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) r = set_guest_storage_key(current->mm, hva, keys[i], 0); if (r) { - r = fixup_user_fault(current, current->mm, hva, + r = fixup_user_fault(current->mm, hva, FAULT_FLAG_WRITE, &unlocked); if (r) break; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 2f721a923b54..cd74989ce0b0 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -273,7 +273,7 @@ retry: rc = get_guest_storage_key(current->mm, vmaddr, &key); if (rc) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { mmap_read_unlock(current->mm); @@ -319,7 +319,7 @@ retry: mmap_read_lock(current->mm); rc = reset_guest_reference_bit(current->mm, vmaddr); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { mmap_read_unlock(current->mm); @@ -390,7 +390,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) m3 & SSKE_MC); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } @@ -1094,7 +1094,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) rc = cond_set_guest_storage_key(current->mm, vmaddr, key, NULL, nq, mr, mc); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 190357ff86b3..8747487c50a8 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -649,7 +649,7 @@ retry: rc = vmaddr; goto out_up; } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, + if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked)) { rc = -EFAULT; goto out_up; @@ -879,7 +879,7 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, BUG_ON(gmap_is_shadow(gmap)); fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) return -EFAULT; if (unlocked) /* lost mmap_lock, caller has to retry __gmap_translate */ -- cgit v1.2.3