From d32311fed70d12f14e585feb4653571b1e2b0e6d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Sep 2005 14:41:40 +1000 Subject: [PATCH] Introduce sg_set_buf sg_init_one is a nice tool for the block layer. However, users of struct scatterlist in other subsystems don't usually need the DMA attributes. For them it's a waste of time and space to initialise the whole struct scatterlist structure. Therefore this patch adds a new function sg_set_buf to initialise a scatterlist without zeroing the DMA attributes. Signed-off-by: Herbert Xu --- include/linux/scatterlist.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 7f717e95ae37..66ff545552f7 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -1,14 +1,23 @@ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H -static inline void sg_init_one(struct scatterlist *sg, - u8 *buf, unsigned int buflen) -{ - memset(sg, 0, sizeof(*sg)); +#include +#include +#include +static inline void sg_set_buf(struct scatterlist *sg, void *buf, + unsigned int buflen) +{ sg->page = virt_to_page(buf); sg->offset = offset_in_page(buf); sg->length = buflen; } +static inline void sg_init_one(struct scatterlist *sg, void *buf, + unsigned int buflen) +{ + memset(sg, 0, sizeof(*sg)); + sg_set_buf(sg, buf, buflen); +} + #endif /* _LINUX_SCATTERLIST_H */ -- cgit v1.2.3 From 930fc45a49ddebe7555cc5c837d82b9c27e65ff4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 29 Oct 2005 18:15:41 -0700 Subject: [PATCH] vmalloc_node This patch adds vmalloc_node(size, node) -> Allocate necessary memory on the specified node and get_vm_area_node(size, flags, node) and the other functions that it depends on. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3701a0673d2c..1d5577b2b752 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -32,10 +32,14 @@ struct vm_struct { * Highlevel APIs for driver use */ extern void *vmalloc(unsigned long size); +extern void *vmalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); +extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot); +extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, + pgprot_t prot, int node); extern void vfree(void *addr); extern void *vmap(struct page **pages, unsigned int count, @@ -48,6 +52,8 @@ extern void vunmap(void *addr); extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); +extern struct vm_struct *get_vm_area_node(unsigned long size, + unsigned long flags, int node); extern struct vm_struct *remove_vm_area(void *addr); extern struct vm_struct *__remove_vm_area(void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, -- cgit v1.2.3 From eb92f4ef320b738e41ad43476a5d05c8a20d5cc7 Mon Sep 17 00:00:00 2001 From: Rik Van Riel Date: Sat, 29 Oct 2005 18:15:44 -0700 Subject: [PATCH] add sem_is_read/write_locked() Add sem_is_read/write_locked functions to the read/write semaphores, along the same lines of the *_is_locked spinlock functions. The swap token tuning patch uses sem_is_read_locked; sem_is_write_locked is added for completeness. Signed-off-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rwsem-spinlock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index b52a2af25f1f..f30f805080ae 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -61,5 +61,10 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem)); extern void FASTCALL(__up_write(struct rw_semaphore *sem)); extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem)); +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->activity != 0); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_SPINLOCK_H */ -- cgit v1.2.3 From dfcd3c0dc426bb75770c34b40e14f2da8845ea62 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Oct 2005 18:15:48 -0700 Subject: [PATCH] Convert mempolicies to nodemask_t The NUMA policy code predated nodemask_t so it used open coded bitmaps. Convert everything to nodemask_t. Big patch, but shouldn't have any actual behaviour changes (except I removed one unnecessary check against node_online_map and one unnecessary BUG_ON) Signed-off-by: "Andi Kleen" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 58385ee1c0ac..38e60a099399 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -27,10 +27,10 @@ #include #include -#include #include #include #include +#include struct vm_area_struct; @@ -63,7 +63,7 @@ struct mempolicy { union { struct zonelist *zonelist; /* bind */ short preferred_node; /* preferred */ - DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */ + nodemask_t nodes; /* interleave */ /* undefined for default */ } v; }; -- cgit v1.2.3 From ab50b8ed818016cfecd747d6d4bb9139986bc029 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:56 -0700 Subject: [PATCH] mm: vm_stat_account unshackled The original vm_stat_account has fallen into disuse, with only one user, and only one user of vm_stat_unaccount. It's easier to keep track if we convert them all to __vm_stat_account, then free it from its __shackles. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e1649578fb0c..376a466743bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -928,26 +928,14 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, unsigned long, pgprot_t); #ifdef CONFIG_PROC_FS -void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); +void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); #else -static inline void __vm_stat_account(struct mm_struct *mm, +static inline void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { } #endif /* CONFIG_PROC_FS */ -static inline void vm_stat_account(struct vm_area_struct *vma) -{ - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); -} - -static inline void vm_stat_unaccount(struct vm_area_struct *vma) -{ - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - -vma_pages(vma)); -} - /* update per process rss and vm hiwater data */ extern void update_mem_hiwater(struct task_struct *tsk); -- cgit v1.2.3 From a8fb5618dab7e45c8990f3155628d772a9ed45f9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:57 -0700 Subject: [PATCH] mm: unlink_file_vma, remove_vma Divide remove_vm_struct into two parts: first anon_vma_unlink plus unlink_file_vma, to unlink the vma from the list and tree by which rmap or vmtruncate might find it; then remove_vma to close, fput and free. The intention here is to do the anon_vma_unlink and unlink_file_vma earlier, in free_pgtables before freeing any page tables: so we can be sure that any page tables traversed by rmap and vmtruncate are stable (and other, ordinary cases are stabilized by holding mmap_sem). This will be crucial to traversing pgd,pud,pmd without page_table_lock. But testing the split-out patch showed that lifting the page_table_lock is symbiotically necessary to make this change - the lock ordering is wrong to move those unlinks into free_pgtables while it's under ptlock. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 376a466743bc..0c64484d8ae0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -834,6 +834,7 @@ extern int split_vma(struct mm_struct *, extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); +extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff); extern void exit_mmap(struct mm_struct *); -- cgit v1.2.3 From 4294621f41a85497019fae64341aa5351a1921b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:05 -0700 Subject: [PATCH] mm: rss = file_rss + anon_rss I was lazy when we added anon_rss, and chose to change as few places as possible. So currently each anonymous page has to be counted twice, in rss and in anon_rss. Which won't be so good if those are atomic counts in some configurations. Change that around: keep file_rss and anon_rss separately, and add them together (with get_mm_rss macro) when the total is needed - reading two atomics is much cheaper than updating two atomics. And update anon_rss upfront, typically in memory.c, not tucked away in page_add_anon_rmap. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 27519df0f987..afcaac66cbd5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -254,6 +254,8 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define add_mm_counter(mm, member, value) (mm)->_##member += (value) #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- +#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) + typedef unsigned long mm_counter_t; struct mm_struct { @@ -286,7 +288,7 @@ struct mm_struct { unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; /* Special counters protected by the page_table_lock */ - mm_counter_t _rss; + mm_counter_t _file_rss; mm_counter_t _anon_rss; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ -- cgit v1.2.3 From b5810039a54e5babf428e9a1e89fc1940fabff11 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 29 Oct 2005 18:16:12 -0700 Subject: [PATCH] core remove PageReserved Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearing of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and is be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed (a generic page_is_ram() should work). A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and thus mapcounted and count towards shared rss). These writes to the struct page could cause excessive cacheline bouncing on big systems. There are a number of ways this could be addressed if it is an issue. Signed-off-by: Nick Piggin Refcount bug fix for filemap_xip.c Signed-off-by: Carsten Otte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0c64484d8ae0..da42093250c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_RESERVED 0x00080000 /* Pages managed in a special way */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ @@ -338,7 +338,7 @@ static inline void get_page(struct page *page) static inline void put_page(struct page *page) { - if (!PageReserved(page) && put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } @@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); -- cgit v1.2.3 From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:18 -0700 Subject: [PATCH] mm: update_hiwaters just in time update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank. Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm. And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc//status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory). There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick. What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact. But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- include/linux/sched.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index da42093250c3..7d4552fe0864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm, } #endif /* CONFIG_PROC_FS */ -/* update per process rss and vm hiwater data */ -extern void update_mem_hiwater(struct task_struct *tsk); - #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/include/linux/sched.h b/include/linux/sched.h index afcaac66cbd5..a9c0b7d26303 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define dec_mm_counter(mm, member) (mm)->_##member-- #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) +#define update_hiwater_rss(mm) do { \ + unsigned long _rss = get_mm_rss(mm); \ + if ((mm)->hiwater_rss < _rss) \ + (mm)->hiwater_rss = _rss; \ +} while (0) +#define update_hiwater_vm(mm) do { \ + if ((mm)->hiwater_vm < (mm)->total_vm) \ + (mm)->hiwater_vm = (mm)->total_vm; \ +} while (0) + typedef unsigned long mm_counter_t; struct mm_struct { -- cgit v1.2.3 From f449952bc8bde7fbc73c6d20dff92b627a21f8b9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:19 -0700 Subject: [PATCH] mm: mm_struct hiwaters moved Slight and timid rearrangement of mm_struct: hiwater_rss and hiwater_vm were tacked on the end, but it seems better to keep them near _file_rss, _anon_rss and total_vm, in the same cacheline on those arches verified. There are likely to be more profitable rearrangements, but less obvious (is it good or bad that saved_auxv[AT_VECTOR_SIZE] isolates cpu_vm_mask and context from many others?), needing serious instrumentation. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a9c0b7d26303..292cb57ce38f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -291,16 +291,19 @@ struct mm_struct { * by mmlist_lock */ - unsigned long start_code, end_code, start_data, end_data; - unsigned long start_brk, brk, start_stack; - unsigned long arg_start, arg_end, env_start, env_end; - unsigned long total_vm, locked_vm, shared_vm; - unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; - /* Special counters protected by the page_table_lock */ mm_counter_t _file_rss; mm_counter_t _anon_rss; + unsigned long hiwater_rss; /* High-watermark of RSS usage */ + unsigned long hiwater_vm; /* High-water virtual memory usage */ + + unsigned long total_vm, locked_vm, shared_vm, exec_vm; + unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ unsigned dumpable:2; @@ -320,11 +323,7 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; - struct kioctx default_kioctx; - - unsigned long hiwater_rss; /* High-water RSS usage */ - unsigned long hiwater_vm; /* High-water virtual memory usage */ }; struct sighand_struct { -- cgit v1.2.3 From 46dea3d092d23a58b42499cc8a21de0fad079f4a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:20 -0700 Subject: [PATCH] mm: ia64 use expand_upwards ia64 has expand_backing_store function for growing its Register Backing Store vma upwards. But more complete code for this purpose is found in the CONFIG_STACK_GROWSUP part of mm/mmap.c. Uglify its #ifdefs further to provide expand_upwards for ia64 as well as expand_stack for parisc. The Register Backing Store vma should be marked VM_ACCOUNT. Implement the intention of growing it only a page at a time, instead of passing an address outside of the vma to handle_mm_fault, with unknown consequences. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d4552fe0864..89398032bc4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -896,7 +896,8 @@ void handle_ra_miss(struct address_space *mapping, unsigned long max_sane_readahead(unsigned long nr); /* Do stack extension */ -extern int expand_stack(struct vm_area_struct * vma, unsigned long address); +extern int expand_stack(struct vm_area_struct *vma, unsigned long address); +extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); -- cgit v1.2.3 From 872fec16d9a0ed3b75b8893aa217e49cca575ee5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:21 -0700 Subject: [PATCH] mm: init_mm without ptlock First step in pushing down the page_table_lock. init_mm.page_table_lock has been used throughout the architectures (usually for ioremap): not to serialize kernel address space allocation (that's usually vmlist_lock), but because pud_alloc,pmd_alloc,pte_alloc_kernel expect caller holds it. Reverse that: don't lock or unlock init_mm.page_table_lock in any of the architectures; instead rely on pud_alloc,pmd_alloc,pte_alloc_kernel to take and drop it when allocating a new one, to check lest a racing task already did. Similarly no page_table_lock in vmalloc's map_vm_area. Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle user mms, which are converted only by a later patch, for now they have to lock differently according to whether or not it's init_mm. If sources get muddled, there's a danger that an arch source taking init_mm.page_table_lock will be mixed with common source also taking it (or neither take it). So break the rules and make another change, which should break the build for such a mismatch: remove the redundant mm arg from pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13). Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64 used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64 map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free took page_table_lock for no good reason. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 89398032bc4b..b9fa82b96d9e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -706,7 +706,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern int vmtruncate(struct inode * inode, loff_t offset); extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_kernel(pmd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); -- cgit v1.2.3 From 1bb3630e89cb8a7b3d3807629c20c5bad88290ff Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:22 -0700 Subject: [PATCH] mm: ptd_alloc inline and out It seems odd to me that, whereas pud_alloc and pmd_alloc test inline, only calling out-of-line __pud_alloc __pmd_alloc if allocation needed, pte_alloc_map and pte_alloc_kernel are entirely out-of-line. Though it does add a little to kernel size, change them to macros testing inline, calling __pte_alloc or __pte_alloc_kernel to allocate out-of-line. Mark none of them as fastcalls, leave that to CONFIG_REGPARM or not. It also seems more natural for the out-of-line functions to leave the offset calculation and map to the inline, which has to do it anyway for the common case. At least mremap move wants __pte_alloc without _map. Macros rather than inline functions, certainly to avoid the header file issues which arise from CONFIG_HIGHPTE needing kmap_types.h, but also in case any architectures I haven't built would have other such problems. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b9fa82b96d9e..22c2d6922c0e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -704,10 +704,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, } extern int vmtruncate(struct inode * inode, loff_t offset); -extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_kernel(pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); @@ -760,32 +756,36 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); -/* - * On a two-level or three-level page table, this ends up being trivial. Thus - * the inlining and the symmetry break with pte_alloc_map() that does all - * of this out-of-line. - */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); + /* * The following ifdef needed to get the 4level-fixup.h header to work. * Remove it when 4level-fixup.h has been removed. */ -#ifdef CONFIG_MMU -#ifndef __ARCH_HAS_4LEVEL_HACK +#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { - if (pgd_none(*pgd)) - return __pud_alloc(mm, pgd, address); - return pud_offset(pgd, address); + return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? + NULL: pud_offset(pgd, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { - if (pud_none(*pud)) - return __pmd_alloc(mm, pud, address); - return pmd_offset(pud, address); + return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? + NULL: pmd_offset(pud, address); } -#endif -#endif /* CONFIG_MMU */ +#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ + +#define pte_alloc_map(mm, pmd, address) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + NULL: pte_offset_map(pmd, address)) + +#define pte_alloc_kernel(pmd, address) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, -- cgit v1.2.3 From c74df32c724a1652ad8399b4891bb02c9d43743a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:23 -0700 Subject: [PATCH] mm: ptd_alloc take ptlock Second step in pushing down the page_table_lock. Remove the temporary bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not to hold page_table_lock, whether it's on init_mm or a user mm; take page_table_lock internally to check if a racing task already allocated. Convert their callers from common code. But avoid coming back to change them again later: instead of moving the spin_lock(&mm->page_table_lock) down, switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which encapsulate the mapping+locking and unlocking+unmapping together, and in the end may use alternatives to the mm page_table_lock itself. These callers all hold mmap_sem (some exclusively, some not), so at no level can a page table be whipped away from beneath them; and pte_alloc uses the "atomic" pmd_present to test whether it needs to allocate. It appears that on all arches we can safely descend without page_table_lock. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 22c2d6922c0e..d4c3512e7db4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -779,10 +779,28 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#define pte_offset_map_lock(mm, pmd, address, ptlp) \ +({ \ + spinlock_t *__ptl = &(mm)->page_table_lock; \ + pte_t *__pte = pte_offset_map(pmd, address); \ + *(ptlp) = __ptl; \ + spin_lock(__ptl); \ + __pte; \ +}) + +#define pte_unmap_unlock(pte, ptl) do { \ + spin_unlock(ptl); \ + pte_unmap(pte); \ +} while (0) + #define pte_alloc_map(mm, pmd, address) \ ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map_lock(mm, pmd, address, ptlp) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) + #define pte_alloc_kernel(pmd, address) \ ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) -- cgit v1.2.3 From 508034a32b819a2d40aa7ac0dbc8cd2e044c2de6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:30 -0700 Subject: [PATCH] mm: unmap_vmas with inner ptlock Remove the page_table_lock from around the calls to unmap_vmas, and replace the pte_offset_map in zap_pte_range by pte_offset_map_lock: all callers are now safe to descend without page_table_lock. Don't attempt fancy locking for hugepages, just take page_table_lock in unmap_hugepage_range. Which makes zap_hugepage_range, and the hugetlb test in zap_page_range, redundant: unmap_vmas calls unmap_hugepage_range anyway. Nor does unmap_vmas have much use for its mm arg now. The tlb_start_vma and tlb_end_vma in unmap_page_range are now called without page_table_lock: if they're implemented at all, they typically come down to flush_cache_range (usually done outside page_table_lock) and flush_tlb_range (which we already audited for the mprotect case). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 -- include/linux/mm.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d664330d900e..0cea162b08c0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,7 +16,6 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int); -void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); @@ -87,7 +86,6 @@ static inline unsigned long hugetlb_total_pages(void) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) -#define zap_hugepage_range(vma, start, len) BUG() #define unmap_hugepage_range(vma, start, end) BUG() #define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 diff --git a/include/linux/mm.h b/include/linux/mm.h index d4c3512e7db4..972e2ce8e07c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -682,7 +682,7 @@ struct zap_details { unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); -- cgit v1.2.3 From c0718806cf955d5eb51ea77bffb5b21d9bba4972 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:31 -0700 Subject: [PATCH] mm: rmap with inner ptlock rmap's page_check_address descend without page_table_lock. First just pte_offset_map in case there's no pte present worth locking for, then take page_table_lock for the full check, and pass ptl back to caller in the same style as pte_offset_map_lock. __xip_unmap, page_referenced_one and try_to_unmap_one use pte_unmap_unlock. try_to_unmap_cluster also. page_check_address reformatted to avoid progressive indentation. No use is made of its one error code, return NULL when it fails. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e80fb7ee6efd..35b30e6c8cf8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -95,8 +95,8 @@ int try_to_unmap(struct page *); /* * Called from mm/filemap_xip.c to unmap empty zero page */ -pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long); - +pte_t *page_check_address(struct page *, struct mm_struct *, + unsigned long, spinlock_t **); /* * Used by swapoff to help locate where page is expected in vma. -- cgit v1.2.3 From c34d1b4d165c67b966bca4aba026443d7ff161eb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:32 -0700 Subject: [PATCH] mm: kill check_user_page_readable check_user_page_readable is a problematic variant of follow_page. It's used only by oprofile's i386 and arm backtrace code, at interrupt time, to establish whether a userspace stackframe is currently readable. This is problematic, because we want to push the page_table_lock down inside follow_page, and later split it; whereas oprofile is doing a spin_trylock on it (in the i386 case, forgotten in the arm case), and needs that to pin perhaps two pages spanned by the stackframe (which might be covered by different locks when we split). I think oprofile is going about this in the wrong way: it doesn't need to know the area is readable (neither i386 nor arm uses read protection of user pages), it doesn't need to pin the memory, it should simply __copy_from_user_inatomic, and see if that succeeds or not. Sorry, but I've not got around to devising the sparse __user annotations for this. Then we can eliminate check_user_page_readable, and return to a single follow_page without the __follow_page variants. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 972e2ce8e07c..aa8de20e2e80 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -944,7 +944,6 @@ extern struct page * vmalloc_to_page(void *addr); extern unsigned long vmalloc_to_pfn(void *addr); extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write); -extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, unsigned long, pgprot_t); -- cgit v1.2.3 From deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:33 -0700 Subject: [PATCH] mm: follow_page with inner ptlock Final step in pushing down common core's page_table_lock. follow_page no longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself; and so no page_table_lock is taken in get_user_pages itself. But get_user_pages (and get_futex_key) do then need follow_page to pin the page for them: take Daniel's suggestion of bitflags to follow_page. Need one for WRITE, another for TOUCH (it was the accessed flag before: vanished along with check_user_page_readable, but surely get_numa_maps is wrong to mark every page it finds as accessed), another for GET. And another, ANON to dispose of untouched_anonymous_page: it seems silly for that to descend a second time, let follow_page observe if there was no page table and return ZERO_PAGE if so. Fix minor bug in that: check VM_LOCKED - make_pages_present ought to make readonly anonymous present. Give get_numa_maps a cond_resched while we're there. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index aa8de20e2e80..e8d1424153bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } -extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); - -extern struct page * vmalloc_to_page(void *addr); -extern unsigned long vmalloc_to_pfn(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, - int write); -int remap_pfn_range(struct vm_area_struct *, unsigned long, - unsigned long, unsigned long, pgprot_t); +struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct page *vmalloc_to_page(void *addr); +unsigned long vmalloc_to_pfn(void *addr); +int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + +struct page *follow_page(struct mm_struct *, unsigned long address, + unsigned int foll_flags); +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ +#define FOLL_GET 0x04 /* do get_page on page */ +#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ #ifdef CONFIG_PROC_FS void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); -- cgit v1.2.3 From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:40 -0700 Subject: [PATCH] mm: split page table lock Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with a many-threaded application which concurrently initializes different parts of a large anonymous area. This patch corrects that, by using a separate spinlock per page table page, to guard the page table entries in that page, instead of using the mm's single page_table_lock. (But even then, page_table_lock is still used to guard page table allocation, and anon_vma allocation.) In this implementation, the spinlock is tucked inside the struct page of the page table page: with a BUILD_BUG_ON in case it overflows - which it would in the case of 32-bit PA-RISC with spinlock debugging enabled. Splitting the lock is not quite for free: another cacheline access. Ideally, I suppose we would use split ptlock only for multi-threaded processes on multi-cpu machines; but deciding that dynamically would have its own costs. So for now enable it by config, at some number of cpus - since the Kconfig language doesn't support inequalities, let preprocessor compare that with NR_CPUS. But I don't think it's worth being user-configurable: for good testing of both split and unsplit configs, split now at 4 cpus, and perhaps change that to 8 later. There is a benefit even for singly threaded processes: kswapd can be attacking one part of the mm while another part is busy faulting. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 6 +++--- include/linux/mm.h | 46 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 88af42f5e04a..c937d6e65502 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp) /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ ({ \ - BUG_ON(!PagePrivate(page)); \ - ((struct buffer_head *)(page)->private); \ + BUG_ON(!PagePrivate(page)); \ + ((struct buffer_head *)page_private(page)); \ }) #define page_has_buffers(page) PagePrivate(page) @@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page, { page_cache_get(page); SetPagePrivate(page); - page->private = (unsigned long)head; + set_page_private(page, (unsigned long)head); } static inline void get_bh(struct buffer_head *bh) diff --git a/include/linux/mm.h b/include/linux/mm.h index e8d1424153bb..8a514eca40d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -226,13 +226,18 @@ struct page { * to show when page is mapped * & limit reverse map searches. */ - unsigned long private; /* Mapping-private opaque data: + union { + unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for * swp_entry_t if PageSwapCache * When page is free, this indicates * order in the buddy system. */ +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + spinlock_t ptl; +#endif + } u; struct address_space *mapping; /* If low bit clear, points to * inode address_space, or NULL. * If page mapped as anonymous @@ -260,6 +265,9 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +#define page_private(page) ((page)->u.private) +#define set_page_private(page, v) ((page)->u.private = (v)) + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) @@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *)); #ifdef CONFIG_HUGETLB_PAGE -static inline int page_count(struct page *p) +static inline int page_count(struct page *page) { - if (PageCompound(p)) - p = (struct page *)p->private; - return atomic_read(&(p)->_count) + 1; + if (PageCompound(page)) + page = (struct page *)page_private(page); + return atomic_read(&page->_count) + 1; } static inline void get_page(struct page *page) { if (unlikely(PageCompound(page))) - page = (struct page *)page->private; + page = (struct page *)page_private(page); atomic_inc(&page->_count); } @@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page) static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) - return page->private; + return page_private(page); return page->index; } @@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +/* + * We tuck a spinlock to guard each pagetable page into its struct page, + * at page->private, with BUILD_BUG_ON to make sure that this will not + * overflow into the next struct page (as it might with DEBUG_SPINLOCK). + * When freeing, reset page->mapping so free_pages_check won't complain. + */ +#define __pte_lockptr(page) &((page)->u.ptl) +#define pte_lock_init(_page) do { \ + spin_lock_init(__pte_lockptr(_page)); \ +} while (0) +#define pte_lock_deinit(page) ((page)->mapping = NULL) +#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) +#else +/* + * We use mm->page_table_lock to guard all pagetable pages of the mm. + */ +#define pte_lock_init(page) do {} while (0) +#define pte_lock_deinit(page) do {} while (0) +#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ + #define pte_offset_map_lock(mm, pmd, address, ptlp) \ ({ \ - spinlock_t *__ptl = &(mm)->page_table_lock; \ + spinlock_t *__ptl = pte_lockptr(mm, pmd); \ pte_t *__pte = pte_offset_map(pmd, address); \ *(ptlp) = __ptl; \ spin_lock(__ptl); \ -- cgit v1.2.3 From f412ac08c9861b4791af0145934c22f1458686da Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:41 -0700 Subject: [PATCH] mm: fix rss and mmlist locking A couple of oddities were guarded by page_table_lock, no longer properly guarded when that is split. The mm_counters of file_rss and anon_rss: make those an atomic_t, or an atomic64_t if the architecture supports it, in such a case. Definitions by courtesy of Christoph Lameter: who spent considerable effort on more scalable ways of counting, but found insufficient benefit in practice. And adding an mm with swap to the mmlist for swapoff: the list is well- guarded by its own lock, but the list_empty check now has to be repeated inside it. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 292cb57ce38f..1c30bc308ef1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -249,13 +249,47 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +/* + * The mm counters are not protected by its page_table_lock, + * so must be incremented atomically. + */ +#ifdef ATOMIC64_INIT +#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value) +#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member)) +#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member) +#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member) +#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member) +typedef atomic64_t mm_counter_t; +#else /* !ATOMIC64_INIT */ +/* + * The counters wrap back to 0 at 2^32 * PAGE_SIZE, + * that is, at 16TB if using 4kB page size. + */ +#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value) +#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member)) +#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member) +#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member) +#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member) +typedef atomic_t mm_counter_t; +#endif /* !ATOMIC64_INIT */ + +#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +/* + * The mm counters are protected by its page_table_lock, + * so can be incremented directly. + */ #define set_mm_counter(mm, member, value) (mm)->_##member = (value) #define get_mm_counter(mm, member) ((mm)->_##member) #define add_mm_counter(mm, member, value) (mm)->_##member += (value) #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- -#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) +typedef unsigned long mm_counter_t; + +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +#define get_mm_rss(mm) \ + (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) #define update_hiwater_rss(mm) do { \ unsigned long _rss = get_mm_rss(mm); \ if ((mm)->hiwater_rss < _rss) \ @@ -266,8 +300,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) -typedef unsigned long mm_counter_t; - struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -291,7 +323,9 @@ struct mm_struct { * by mmlist_lock */ - /* Special counters protected by the page_table_lock */ + /* Special counters, in some configurations protected by the + * page_table_lock, in other configurations by being atomic. + */ mm_counter_t _file_rss; mm_counter_t _anon_rss; -- cgit v1.2.3 From b8072f099b7829a6ff3eba618e1d079a81f753f8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:41 -0700 Subject: [PATCH] mm: update comments to pte lock Updated several references to page_table_lock in common code comments. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 38e60a099399..7af8cb836e78 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -47,8 +47,7 @@ struct vm_area_struct; * Locking policy for interlave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on - * mmap_sem. For allocating in the interleave policy the page_table_lock - * must be also aquired to protect il_next. + * mmap_sem. * * Freeing policy: * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. -- cgit v1.2.3 From 4ca644d970bf2542623228a4624af356d20ca267 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:51 -0700 Subject: [PATCH] memory hotplug prep: __section_nr helper A little helper that we use in the hotplug code. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7519eb4191e7..4674145bb63d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -509,6 +509,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } +extern int __section_nr(struct mem_section* ms); /* * We use the lower bits of the mem_map pointer to store -- cgit v1.2.3 From 208d54e5513c0c02d85af0990901354c74364d5c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:52 -0700 Subject: [PATCH] memory hotplug locking: node_size_lock pgdat->node_size_lock is basically only neeeded in one place in the normal code: show_mem(), which is the arch-specific sysrq-m printing function. Strictly speaking, the architectures not doing memory hotplug do no need this locking in show_mem(). However, they are all included for completeness. This should also make any future consolidation of all of the implementations a little more straightforward. This lock is also held in the sparsemem code during a memory removal, as sections are invalidated. This is the place there pfn_valid() is made false for a memory area that's being removed. The lock is only required when doing pfn_valid() operations on memory which the user does not already have a reference on the page, such as in show_mem(). Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 12 ++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 include/linux/memory_hotplug.h (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h new file mode 100644 index 000000000000..e8103be9d528 --- /dev/null +++ b/include/linux/memory_hotplug.h @@ -0,0 +1,34 @@ +#ifndef __LINUX_MEMORY_HOTPLUG_H +#define __LINUX_MEMORY_HOTPLUG_H + +#include +#include + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * pgdat resizing functions + */ +static inline +void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqsave(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqrestore(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_init(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->node_size_lock); +} +#else /* ! CONFIG_MEMORY_HOTPLUG */ +/* + * Stub functions for when hotplug is off + */ +static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_init(struct pglist_data *pgdat) {} +#endif +#endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4674145bb63d..e050d68963a1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -273,6 +273,16 @@ typedef struct pglist_data { struct page *node_mem_map; #endif struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. + */ + spinlock_t node_size_lock; +#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page @@ -293,6 +303,8 @@ typedef struct pglist_data { #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) +#include + extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, -- cgit v1.2.3 From bdc8cb984576ab5b550c8b24c6fa111a873503e3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:53 -0700 Subject: [PATCH] memory hotplug locking: zone span seqlock See the "fixup bad_range()" patch for more information, but this actually creates a the lock to protect things making assumptions about a zone's size staying constant at runtime. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/mmzone.h | 15 +++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e8103be9d528..4b08bc947578 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,13 +16,36 @@ void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) static inline void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) { - spin_lock_irqrestore(&pgdat->node_size_lock, *flags); + spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); } static inline void pgdat_resize_init(struct pglist_data *pgdat) { spin_lock_init(&pgdat->node_size_lock); } +/* + * Zone resizing functions + */ +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return read_seqbegin(&zone->span_seqlock); +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return read_seqretry(&zone->span_seqlock, iv); +} +static inline void zone_span_writelock(struct zone *zone) +{ + write_seqlock(&zone->span_seqlock); +} +static inline void zone_span_writeunlock(struct zone *zone) +{ + write_sequnlock(&zone->span_seqlock); +} +static inline void zone_seqlock_init(struct zone *zone) +{ + seqlock_init(&zone->span_seqlock); +} #else /* ! CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off @@ -30,5 +53,17 @@ void pgdat_resize_init(struct pglist_data *pgdat) static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} static inline void pgdat_resize_init(struct pglist_data *pgdat) {} -#endif + +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return 0; +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return 0; +} +static inline void zone_span_writelock(struct zone *zone) {} +static inline void zone_span_writeunlock(struct zone *zone) {} +static inline void zone_seqlock_init(struct zone *zone) {} +#endif /* ! CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e050d68963a1..f5fa3082fd6a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -12,6 +12,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -137,6 +138,10 @@ struct zone { * free areas of different sizes */ spinlock_t lock; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif struct free_area free_area[MAX_ORDER]; @@ -220,6 +225,16 @@ struct zone { /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; + /* + * zone_start_pfn, spanned_pages and present_pages are all + * protected by span_seqlock. It is a seqlock because it has + * to be read outside of zone->lock, and it is done in the main + * allocator path. But, it is written quite infrequently. + * + * The lock is declared along with zone->lock because it is + * frequently read in proximity to zone->lock. It's good to + * give them a chance of being in the same cacheline. + */ unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ -- cgit v1.2.3 From 3947be1969a9ce455ec30f60ef51efb10e4323d1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:54 -0700 Subject: [PATCH] memory hotplug: sysfs and add/remove functions This adds generic memory add/remove and supporting functions for memory hotplug into a new file as well as a memory hotplug kernel config option. Individual architecture patches will follow. For now, disable memory hotplug when swsusp is enabled. There's a lot of churn there right now. We'll fix it up properly once it calms down. Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 94 ++++++++++++++++++++++++++++++++++++++++++ include/linux/memory_hotplug.h | 35 ++++++++++++++++ include/linux/mm.h | 1 + 3 files changed, 130 insertions(+) create mode 100644 include/linux/memory.h (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h new file mode 100644 index 000000000000..0def328ab5cf --- /dev/null +++ b/include/linux/memory.h @@ -0,0 +1,94 @@ +/* + * include/linux/memory.h - generic memory definition + * + * This is mainly for topological representation. We define the + * basic "struct memory_block" here, which can be embedded in per-arch + * definitions or NUMA information. + * + * Basic handling of the devices is done in drivers/base/memory.c + * and system devices are handled in drivers/base/sys.c. + * + * Memory block are exported via sysfs in the class/memory/devices/ + * directory. + * + */ +#ifndef _LINUX_MEMORY_H_ +#define _LINUX_MEMORY_H_ + +#include +#include +#include + +#include + +struct memory_block { + unsigned long phys_index; + unsigned long state; + /* + * This serializes all state change requests. It isn't + * held during creation because the control files are + * created long after the critical areas during + * initialization. + */ + struct semaphore state_sem; + int phys_device; /* to which fru does this belong? */ + void *hw; /* optional pointer to fw/hw data */ + int (*phys_callback)(struct memory_block *); + struct sys_device sysdev; +}; + +/* These states are exposed to userspace as text strings in sysfs */ +#define MEM_ONLINE (1<<0) /* exposed to userspace */ +#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ +#define MEM_OFFLINE (1<<2) /* exposed to userspace */ + +/* + * All of these states are currently kernel-internal for notifying + * kernel components and architectures. + * + * For MEM_MAPPING_INVALID, all notifier chains with priority >0 + * are called before pfn_to_page() becomes invalid. The priority=0 + * entry is reserved for the function that actually makes + * pfn_to_page() stop working. Any notifiers that want to be called + * after that should have priority <0. + */ +#define MEM_MAPPING_INVALID (1<<3) + +#ifndef CONFIG_MEMORY_HOTPLUG +static inline int memory_dev_init(void) +{ + return 0; +} +static inline int register_memory_notifier(struct notifier_block *nb) +{ + return 0; +} +static inline void unregister_memory_notifier(struct notifier_block *nb) +{ +} +#else +extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); +extern int register_new_memory(struct mem_section *); +extern int unregister_memory_section(struct mem_section *); +extern int memory_dev_init(void); +extern int register_memory_notifier(struct notifier_block *nb); +extern void unregister_memory_notifier(struct notifier_block *nb); + +#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< #include +#include +#include #ifdef CONFIG_MEMORY_HOTPLUG /* @@ -46,6 +48,19 @@ static inline void zone_seqlock_init(struct zone *zone) { seqlock_init(&zone->span_seqlock); } +extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); +extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); +extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +/* need some defines for these for archs that don't support it */ +extern void online_page(struct page *page); +/* VM interface that may be used by firmware interface */ +extern int add_memory(u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern int online_pages(unsigned long, unsigned long); + +/* reasonably generic interface to expand the physical pages in a zone */ +extern int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); #else /* ! CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off @@ -65,5 +80,25 @@ static inline int zone_span_seqretry(struct zone *zone, unsigned iv) static inline void zone_span_writelock(struct zone *zone) {} static inline void zone_span_writeunlock(struct zone *zone) {} static inline void zone_seqlock_init(struct zone *zone) {} + +static inline int mhp_notimplemented(const char *func) +{ + printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); + dump_stack(); + return -ENOSYS; +} + +static inline int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + return mhp_notimplemented(__FUNCTION__); +} #endif /* ! CONFIG_MEMORY_HOTPLUG */ +static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__); + dump_stack(); + return -ENOSYS; +} #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a514eca40d5..5c1fb0a2e806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -840,6 +840,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); +extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); -- cgit v1.2.3