summaryrefslogtreecommitdiff
AgeCommit message (Collapse)Author
2006-12-07[PATCH] Save some bytes in struct mm_structArnaldo Carvalho de Melo
Before: [acme@newtoy net-2.6.20]$ pahole --cacheline 32 kernel/sched.o mm_struct /* include2/asm/processor.h:542 */ struct mm_struct { struct vm_area_struct * mmap; /* 0 4 */ struct rb_root mm_rb; /* 4 4 */ struct vm_area_struct * mmap_cache; /* 8 4 */ long unsigned int (*get_unmapped_area)(); /* 12 4 */ void (*unmap_area)(); /* 16 4 */ long unsigned int mmap_base; /* 20 4 */ long unsigned int task_size; /* 24 4 */ long unsigned int cached_hole_size; /* 28 4 */ /* ---------- cacheline 1 boundary ---------- */ long unsigned int free_area_cache; /* 32 4 */ pgd_t * pgd; /* 36 4 */ atomic_t mm_users; /* 40 4 */ atomic_t mm_count; /* 44 4 */ int map_count; /* 48 4 */ struct rw_semaphore mmap_sem; /* 52 64 */ spinlock_t page_table_lock; /* 116 40 */ struct list_head mmlist; /* 156 8 */ mm_counter_t _file_rss; /* 164 4 */ mm_counter_t _anon_rss; /* 168 4 */ long unsigned int hiwater_rss; /* 172 4 */ long unsigned int hiwater_vm; /* 176 4 */ long unsigned int total_vm; /* 180 4 */ long unsigned int locked_vm; /* 184 4 */ long unsigned int shared_vm; /* 188 4 */ /* ---------- cacheline 6 boundary ---------- */ long unsigned int exec_vm; /* 192 4 */ long unsigned int stack_vm; /* 196 4 */ long unsigned int reserved_vm; /* 200 4 */ long unsigned int def_flags; /* 204 4 */ long unsigned int nr_ptes; /* 208 4 */ long unsigned int start_code; /* 212 4 */ long unsigned int end_code; /* 216 4 */ long unsigned int start_data; /* 220 4 */ /* ---------- cacheline 7 boundary ---------- */ long unsigned int end_data; /* 224 4 */ long unsigned int start_brk; /* 228 4 */ long unsigned int brk; /* 232 4 */ long unsigned int start_stack; /* 236 4 */ long unsigned int arg_start; /* 240 4 */ long unsigned int arg_end; /* 244 4 */ long unsigned int env_start; /* 248 4 */ long unsigned int env_end; /* 252 4 */ /* ---------- cacheline 8 boundary ---------- */ long unsigned int saved_auxv[44]; /* 256 176 */ unsigned int dumpable:2; /* 432 4 */ cpumask_t cpu_vm_mask; /* 436 4 */ mm_context_t 
context; /* 440 68 */ long unsigned int swap_token_time; /* 508 4 */ /* ---------- cacheline 16 boundary ---------- */ char recent_pagein; /* 512 1 */ /* XXX 3 bytes hole, try to pack */ int core_waiters; /* 516 4 */ struct completion * core_startup_done; /* 520 4 */ struct completion core_done; /* 524 52 */ rwlock_t ioctx_list_lock; /* 576 36 */ struct kioctx * ioctx_list; /* 612 4 */ }; /* size: 616, sum members: 613, holes: 1, sum holes: 3, cachelines: 20, last cacheline: 8 bytes */ After: [acme@newtoy net-2.6.20]$ pahole --cacheline 32 kernel/sched.o mm_struct /* include2/asm/processor.h:542 */ struct mm_struct { struct vm_area_struct * mmap; /* 0 4 */ struct rb_root mm_rb; /* 4 4 */ struct vm_area_struct * mmap_cache; /* 8 4 */ long unsigned int (*get_unmapped_area)(); /* 12 4 */ void (*unmap_area)(); /* 16 4 */ long unsigned int mmap_base; /* 20 4 */ long unsigned int task_size; /* 24 4 */ long unsigned int cached_hole_size; /* 28 4 */ /* ---------- cacheline 1 boundary ---------- */ long unsigned int free_area_cache; /* 32 4 */ pgd_t * pgd; /* 36 4 */ atomic_t mm_users; /* 40 4 */ atomic_t mm_count; /* 44 4 */ int map_count; /* 48 4 */ struct rw_semaphore mmap_sem; /* 52 64 */ spinlock_t page_table_lock; /* 116 40 */ struct list_head mmlist; /* 156 8 */ mm_counter_t _file_rss; /* 164 4 */ mm_counter_t _anon_rss; /* 168 4 */ long unsigned int hiwater_rss; /* 172 4 */ long unsigned int hiwater_vm; /* 176 4 */ long unsigned int total_vm; /* 180 4 */ long unsigned int locked_vm; /* 184 4 */ long unsigned int shared_vm; /* 188 4 */ /* ---------- cacheline 6 boundary ---------- */ long unsigned int exec_vm; /* 192 4 */ long unsigned int stack_vm; /* 196 4 */ long unsigned int reserved_vm; /* 200 4 */ long unsigned int def_flags; /* 204 4 */ long unsigned int nr_ptes; /* 208 4 */ long unsigned int start_code; /* 212 4 */ long unsigned int end_code; /* 216 4 */ long unsigned int start_data; /* 220 4 */ /* ---------- cacheline 7 boundary ---------- */ long unsigned 
int end_data; /* 224 4 */ long unsigned int start_brk; /* 228 4 */ long unsigned int brk; /* 232 4 */ long unsigned int start_stack; /* 236 4 */ long unsigned int arg_start; /* 240 4 */ long unsigned int arg_end; /* 244 4 */ long unsigned int env_start; /* 248 4 */ long unsigned int env_end; /* 252 4 */ /* ---------- cacheline 8 boundary ---------- */ long unsigned int saved_auxv[44]; /* 256 176 */ cpumask_t cpu_vm_mask; /* 432 4 */ mm_context_t context; /* 436 68 */ long unsigned int swap_token_time; /* 504 4 */ char recent_pagein; /* 508 1 */ unsigned char dumpable:2; /* 509 1 */ /* XXX 2 bytes hole, try to pack */ int core_waiters; /* 512 4 */ struct completion * core_startup_done; /* 516 4 */ struct completion core_done; /* 520 52 */ rwlock_t ioctx_list_lock; /* 572 36 */ struct kioctx * ioctx_list; /* 608 4 */ }; /* size: 612, sum members: 610, holes: 1, sum holes: 2, cachelines: 20, last cacheline: 4 bytes */ [acme@newtoy net-2.6.20]$ codiff -V /tmp/sched.o.before kernel/sched.o /pub/scm/linux/kernel/git/acme/net-2.6.20/kernel/sched.c: struct mm_struct | -4 dumpable:2; from: unsigned int /* 432(30) 4(2) */ to: unsigned char /* 509(6) 1(2) */ < SNIP other offset changes > 1 struct changed [acme@newtoy net-2.6.20]$ I'm not aware of any problem about using 2 byte wide bitfields where previously a 4 byte wide one was, holler if there is any, I wouldn't be surprised, bitfields are things from hell. For the curious, 432(30) means: at offset 432 from the struct start, at offset 30 in the bitfield (yeah, it comes backwards, hellish, huh?) ditto for 509(6), while 4(2) and 1(2) means "struct field size(bitfield size)". Now we have a 2 bytes hole and are using only 4 bytes of the last 32 bytes cacheline, any takers? :-) Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mm: make compound page destructor handling explicitAndy Whitcroft
Currently we use the lru head link of the second page of a compound page to hold its destructor. This was ok when it was purely an internal implementation detail. However, hugetlbfs overrides this destructor, violating the layering. Abstract this out as explicit calls, also introduce a type for the callback function allowing them to be type checked. For each callback we pre-declare the function, causing a type error on definition rather than on use elsewhere. [akpm@osdl.org: cleanups] Signed-off-by: Andy Whitcroft <apw@shadowen.org> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: better fallback allocation behaviorChristoph Lameter
Currently we simply attempt to allocate from all allowed nodes using GFP_THISNODE. However, GFP_THISNODE does not do reclaim (it won't do any at all if the recent GFP_THISNODE patch is accepted). If we truly run out of memory in the whole system then fallback_alloc may return NULL although memory may still be available if we would perform more thorough reclaim. This patch changes fallback_alloc() so that we first only inspect all the per node queues for available slabs. If we find any then we allocate from those. This avoids slab fragmentation by first getting rid of all partial allocated slabs on every node before allocating new memory. If we cannot satisfy the allocation from any per node queue then we extend a slab. We now call into the page allocator without specifying GFP_THISNODE. The page allocator will then implement its own fallback (in the given cpuset context), perform necessary reclaim (again considering not a single node but the whole set of allowed nodes) and then return pages for a new slab. We identify from which node the pages were allocated and then insert the pages into the corresponding per node structure. In order to do so we need to modify cache_grow() to take a parameter that specifies the new slab. kmem_getpages() can no longer set the GFP_THISNODE flag since we need to be able to use kmem_getpages() to allocate from an arbitrary node. GFP_THISNODE needs to be specified when calling cache_grow(). One key advantage is that the decision from which node to allocate new memory is removed from slab fallback processing. The patch allows going back to using the page allocator's fallback/reclaim logic. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] GFP_THISNODE must not trigger global reclaimChristoph Lameter
The intent of GFP_THISNODE is to make sure that an allocation occurs on a particular node. If this is not possible then NULL needs to be returned so that the caller can choose what to do next on its own (the slab allocator depends on that). However, GFP_THISNODE currently triggers reclaim before returning a failure (GFP_THISNODE means GFP_NORETRY is set). If we have over allocated a node then we will currently do some reclaim before returning NULL. The caller may want memory from other nodes before reclaim should be triggered. (If the caller wants reclaim then he can directly use __GFP_THISNODE instead). There is no flag to avoid reclaim in the page allocator and adding yet another GFP_xx flag would be difficult given that we are out of available flags. So just compare and see if all bits for GFP_THISNODE (__GFP_THISNODE, __GFP_NORETRY and __GFP_NOWARN) are set. If so then we return NULL before waking up kswapd. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: fix two issues in kmalloc_node / __cache_alloc_nodeChristoph Lameter
This addresses two issues: 1. Kmalloc_node() may intermittently return NULL if we are allocating from the current node and are unable to obtain memory for the current node from the page allocator. This is because we call ____cache_alloc() if nodeid == numa_node_id() and ____cache_alloc() is not able to fallback to other nodes. This was introduced in the 2.6.19 development cycle. <= 2.6.18 in that case does not do a restricted allocation and blindly trusts the page allocator to have given us memory from the indicated node. It inserts the page regardless of the node it came from into the queues for the current node. 2. If kmalloc_node() is used on a node that has not been bootstrapped yet then we may try to pass an invalid node number to ____cache_alloc_node() triggering a BUG(). Change the function to call fallback_alloc() instead. Only call fallback_alloc() if we are allowed to fallback at all. The need to handle a node not bootstrapped yet also first surfaced in the 2.6.19 cycle. Update the comments since they were still describing the old kmalloc_node from 2.6.12. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: deprecate kmem_cache_tAndrew Morton
Cc: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove kmem_cache_tChristoph Lameter
Replace all uses of kmem_cache_t with struct kmem_cache. The patch was generated using the following script: #!/bin/sh # # Replace one string by another in all the kernel sources. # set -e for file in `find * -name "*.c" -o -name "*.h"|xargs grep -l $1`; do quilt add $file sed -e "1,\$s/$1/$2/g" $file >/tmp/$$ mv /tmp/$$ $file quilt refresh done The script was run like this sh replace kmem_cache_t "struct kmem_cache" Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove SLAB_DMAChristoph Lameter
SLAB_DMA is an alias of GFP_DMA. This is the last one so we remove the leftover comment too. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove SLAB_KERNELChristoph Lameter
SLAB_KERNEL is an alias of GFP_KERNEL. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove SLAB_ATOMICChristoph Lameter
SLAB_ATOMIC is an alias of GFP_ATOMIC Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove SLAB_USERChristoph Lameter
SLAB_USER is an alias of GFP_USER Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove SLAB_NOFSChristoph Lameter
SLAB_NOFS is an alias of GFP_NOFS. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove SLAB_NOIOChristoph Lameter
SLAB_NOIO is an alias of GFP_NOIO with a single instance of use. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove SLAB_LEVEL_MASKChristoph Lameter
SLAB_LEVEL_MASK is only used internally to the slab and is an alias of GFP_LEVEL_MASK. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab: remove SLAB_NO_GROWChristoph Lameter
It is only used internally in the slab. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] kill install_file_pte's pte_valHugh Dickins
David Binderman and his Intel C compiler rightly observe that install_file_pte no longer has any use for its pte_val. Signed-off-by: Hugh Dickins <hugh@veritas.com> Cc: d binderman <dcb314@hotmail.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mm: cleanup indentation on switch for CPU operationsAndy Whitcroft
These patches introduced new switch statements which are indented contrary to the consensus in mm/*.c. Fix them up to match that consensus. [PATCH] node local per-cpu-pages [PATCH] ZVC: Scale thresholds depending on the size of the system commit e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a commit df9ecaba3f152d1ea79f2a5e0b87505e03f47590 Signed-off-by: Andy Whitcroft <apw@shadowen.org> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] reject corrupt swapfiles earlierEric Sandeen
The fsfuzzer found this; with a corrupt small swapfile that claims to have many pages: [root]# file swap.741.img swap.741.img: Linux/i386 swap file (new style) 1 (4K pages) size 1040191487 pages [root]# ls -l swap.741.img -rw-r--r-- 1 root root 16777216 Nov 22 05:18 swap.741.img sys_swapon() will try to vmalloc all those pages, and -then- check to see if the file is actually that large: if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { <snip> if (swapfilesize && maxpages > swapfilesize) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); It seems to me that it would make more sense to move this test up before the vmalloc, with the other checks, to avoid the OOM-killer in this situation... Signed-off-by: Eric Sandeen <sandeen@redhat.com> Cc: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] silence unused pgdat warning from alloc_bootmem_node and friendsAndy Whitcroft
x86 NUMA systems only define bootmem for node 0. alloc_bootmem_node() and friends therefore ignore the passed pgdat and use NODE_DATA(0) in all cases. This leads to the following warnings as we are not using the passed parameter: .../mm/page_alloc.c: In function 'zone_wait_table_init': .../mm/page_alloc.c:2259: warning: unused variable 'pgdat' One option would be to define all variables used with these macros __attribute__ ((unused)), but this would leave us exposed should these become genuinely unused. The key here is that we _are_ using the value, we ignore it but that is a deliberate action. This patch adds a nested local variable within the alloc_bootmem_node helper to which the pgdat parameter is assigned making it 'used'. The nested local is marked __attribute__ ((unused)) to silence this same warning for it. Signed-off-by: Andy Whitcroft <apw@shadowen.org> Cc: Christoph Lameter <clameter@engr.sgi.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] numa node ids are int, page_to_nid and zone_to_nid should return intAndy Whitcroft
NUMA node ids are passed as either int or unsigned int almost exclusively, yet page_to_nid and zone_to_nid both return unsigned long. This is a throwback to when page_to_nid was a #define and was thus exposing the real type of the page flags field. In addition to fixing up the definitions of page_to_nid and zone_to_nid I audited the users of these functions identifying the following incorrect uses: 1) mm/page_alloc.c show_node() -- printk dumping the node id, 2) include/asm-ia64/pgalloc.h pgtable_quicklist_free() -- comparison against numa_node_id() which returns an int from cpu_to_node(), and 3) mm/mpolicy.c check_pte_range -- used as an index in node_isset which uses bit_set which in generic code takes an int. Signed-off-by: Andy Whitcroft <apw@shadowen.org> Cc: Christoph Lameter <clameter@engr.sgi.com> Cc: "Luck, Tony" <tony.luck@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] drain_node_page(): Drain pages in batch unitsChristoph Lameter
drain_node_pages() currently drains the complete pageset of all pages. If there are a large number of pages in the queues then we may hold off interrupts for too long. Duplicate the method used in free_hot_cold_page. Only drain pcp->batch pages at one time. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Remove uses of kmem_cache_t from mm/* and include/linux/slab.hChristoph Lameter
Remove all uses of kmem_cache_t (most were left in slab.h). The typedef for kmem_cache_t is then only necessary for other kernel subsystems. Add a comment to that effect. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Move names_cachep to linux/fs.hChristoph Lameter
The names_cachep is used for getname() and putname(). So let's put it into fs.h near those two definitions. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Move fs_cachep to linux/fs_struct.hChristoph Lameter
fs_cachep is only used in kernel/exit.c and in kernel/fork.c. It is used to store fs_struct items so it should be placed in linux/fs_struct.h Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Move filep_cachep to include/file.hChristoph Lameter
filp_cachep is only used in fs/file_table.c and in fs/dcache.c where it is defined. Move it to related definitions in linux/file.h. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Move files_cachep to include/file.hChristoph Lameter
Proper place is in file.h since files_cachep uses are related to file I/O. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Move vm_area_cachep to include/mm.hChristoph Lameter
vm_area_cachep is used to store vm_area_structs. So move to mm.h. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Move sighand_cachep to include/signal.hChristoph Lameter
Move sighand_cachep definition to linux/signal.h. The sighand cache is only used in fs/exec.c and kernel/fork.c. It is defined in kernel/fork.c but only used in fs/exec.c. The sighand_cachep is related to signal processing. So add the definition to signal.h. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Remove bio_cachep from slab.hChristoph Lameter
Remove bio_cachep from slab.h - it no longer exists. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] make mm/thrash.c:global_faults staticAdrian Bunk
This patch makes the needlessly global "global_faults" static. Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] enable booting a NUMA system where some nodes have no memoryChristian Krafft
When booting a NUMA system with nodes that have no memory (eg by limiting memory), bootmem_alloc_core tried to find pages in an uninitialized bootmem_map. This caused a null pointer access. This fix adds a check, so that NULL is returned. That will enable the caller (bootmem_alloc_nopanic) to alloc memory on other nodes without a panic. Signed-off-by: Christian Krafft <krafft@de.ibm.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Cc: Andy Whitcroft <apw@shadowen.org> Cc: Martin Bligh <mbligh@google.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Allow NULL pointers in percpu_freeAlan Stern
The patch (as824b) makes percpu_free() ignore NULL arguments, as one would expect for a deallocation routine. (Note that free_percpu is #defined as percpu_free in include/linux/percpu.h.) A few callers are updated to remove now-unneeded tests for NULL. A few other callers already seem to assume that passing a NULL pointer to percpu_free() is okay! The patch also removes an unnecessary NULL check in percpu_depopulate(). Signed-off-by: Alan Stern <stern@rowland.harvard.edu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] node-aware skb allocationChristoph Hellwig
Node-aware allocation of skbs for the receive path. Details: - __alloc_skb gets a new node argument and calls the node-aware slab functions with it. - netdev_alloc_skb passes the node number it gets from dev_to_node to it, everyone else passes -1 (any node) Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Christoph Lameter <clameter@engr.sgi.com> Cc: "David S. Miller" <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] add numa node information to struct deviceChristoph Hellwig
For node-aware skb allocations we need information about the node in struct net_device or struct device. Davem suggested to put it into struct device which this patch does. In particular: - struct device gets a new int numa_node member if CONFIG_NUMA is set - there are two new helpers, dev_to_node and set_dev_node to transparently deal with the non-numa case - for pci devices the node-info is set to the value we get from pcibus_to_node. Note that for some architectures pcibus_to_node doesn't work yet at the time we call it currently. This is harmless and will just mean skb allocations aren't node-local on these architectures until the implementation of pcibus_to_node on these architectures has been updated (There are patches for x86 and x86_64 floating around) [akpm@osdl.org: cleanup] Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Christoph Lameter <clameter@engr.sgi.com> Cc: Greg KH <greg@kroah.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] leak tracking for kmalloc_nodeChristoph Hellwig
We have variants of kmalloc and kmem_cache_alloc that leave leak tracking to the caller. This is used for subsystem-specific allocators like skb_alloc. To make skb_alloc node-aware we need similar routines for the node-aware slab allocator, which this patch adds. Note that the code is rather ugly, but it mirrors the non-node-aware code 1:1: [akpm@osdl.org: add module export] Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Always print out the header line in /proc/swapsSuleiman Souhlal
It would be possible for /proc/swaps to not always print out the header: swapon /dev/hdc2 swapon /dev/hde2 swapoff /dev/hdc2 At this point /proc/swaps would not have a header. Signed-off-by: Suleiman Souhlal <suleiman@google.com> Cc: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] OOM can panic due to processes stuck in __alloc_pages()Kirill Korotaev
OOM can panic due to the processes stuck in __alloc_pages() doing an infinite rebalance loop while no memory can be reclaimed. OOM killer tries to kill some processes, but unfortunately, the rebalance label was moved by someone below the TIF_MEMDIE check, so the buddy allocator doesn't see that the process is OOM-killed and it can simply fail the allocation :/ Observed in reality on RHEL4(2.6.9)+OpenVZ kernel when a user doing some memory allocation tricks triggered OOM panic. Signed-off-by: Denis Lunev <den@sw.ru> Signed-off-by: Kirill Korotaev <dev@openvz.org> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mlock cleanupRik Bobbaers
mm is defined as vma->vm_mm, so use that. Acked-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Allow user processes to raise their oom_adj valueGuillem Jover
Currently a user process cannot raise its own oom_adj value (i.e. unprotecting itself from the OOM killer). As this value is stored in the task structure it gets inherited and the unprivileged children will be unable to raise it. The EPERM will be handled by the generic proc fs layer, as only processes with the proper caps or the owner of the process will be able to write to the file. So we allow only the processes with CAP_SYS_RESOURCE to lower the value, otherwise it will get an EACCES which seems more appropriate than EPERM. Signed-off-by: Guillem Jover <guillem.jover@nokia.com> Acked-by: Andrea Arcangeli <andrea@novell.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] Fix kunmap_atomic's use of kpte_clear_flush()Jeremy Fitzhardinge
kunmap_atomic() will call kpte_clear_flush with vaddr/ptep arguments which don't correspond if the vaddr is just a normal lowmem address (ie, not in the KMAP area). This patch makes sure that the pte is only cleared if kmap area was actually used for the mapping. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Cc: Zachary Amsden <zach@vmware.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mm: k{,um}map_atomic() vs in_atomic()Peter Zijlstra
Make kmap_atomic/kunmap_atomic denote a pagefault disabled scope. All non trivial implementations already do this anyway. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mm: pagefault_{disable,enable}()Peter Zijlstra
Introduce pagefault_{disable,enable}() and use these where previously we did manual preempt increments/decrements to make the pagefault handler do the atomic thing. Currently they still rely on the increased preempt count, but do not rely on the disabled preemption, this might go away in the future. (NOTE: the extra barrier() in pagefault_disable might fix some holes on machines which have too many registers for their own good) [heiko.carstens@de.ibm.com: s390 fix] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Nick Piggin <npiggin@suse.de> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mm: arch do_page_fault() vs in_atomic()Peter Zijlstra
In light of the recent pagefault and filemap_copy_from_user work I've gone through all the arch pagefault handlers to make sure the inc_preempt_count() 'feature' works as expected. Several sections of code (including the new filemap_copy_from_user) rely on the fact that faults do not take locks under increased preempt count. arch/x86_64 - good arch/powerpc - good arch/cris - fixed arch/i386 - good arch/parisc - fixed arch/sh - good arch/sparc - good arch/s390 - good arch/m68k - fixed arch/ppc - good arch/alpha - fixed arch/mips - good arch/sparc64 - good arch/ia64 - good arch/arm - fixed arch/um - good arch/avr32 - good arch/h8300 - NA arch/m32r - good arch/v850 - good arch/frv - fixed arch/m68knommu - NA arch/arm26 - fixed arch/sh64 - fixed arch/xtensa - good Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mm: add noaliencache boot option to disable numa alien cachesPaul Menage
When using numa=fake on non-NUMA hardware there is no benefit to having the alien caches, and they consume much memory. Add a kernel boot option to disable them. Christoph sayeth "This is good to have even on large NUMA. The problem is that the alien caches grow by the square of the size of the system in terms of nodes." Cc: Christoph Lameter <clameter@engr.sgi.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Manfred Spraul <manfred@colorfullife.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mm: slab: eliminate lock_cpu_hotplug from slabRavikiran G Thirumalai
Here's an attempt towards doing away with lock_cpu_hotplug in the slab subsystem. This approach also fixes a bug which shows up when cpus are being offlined/onlined and slab caches are being tuned simultaneously. http://marc.theaimsgroup.com/?l=linux-kernel&m=116098888100481&w=2 The patch has been stress tested overnight on a 2 socket 4 core AMD box with repeated cpu online and offline, while dbench and kernbench processes are running, and slab caches being tuned at the same time. There were no lockdep warnings either. (This test on 2.6.18 as 2.6.19-rc crashes at __drain_pages http://marc.theaimsgroup.com/?l=linux-kernel&m=116172164217678&w=2 ) The approach here is to hold cache_chain_mutex from CPU_UP_PREPARE until CPU_ONLINE (similar in approach as workqueue_mutex). Slab code sensitive to cpu_online_map (kmem_cache_create, kmem_cache_destroy, slabinfo_write, __cache_shrink) is already serialized with cache_chain_mutex. (This patch lengthens cache_chain_mutex hold time at kmem_cache_destroy to cover this). This patch also takes the cache_chain_mutex at kmem_cache_shrink to protect sanity of cpu_online_map at __cache_shrink, as viewed by slab. (kmem_cache_shrink->__cache_shrink->drain_cpu_caches). But, really, kmem_cache_shrink is used at just one place in the acpi subsystem! Do we really need to keep kmem_cache_shrink at all? Another note. Looks like a cpu hotplug event can send CPU_UP_CANCELED to a registered subsystem even if the subsystem did not receive CPU_UP_PREPARE. This could be due to a subsystem registered for notification earlier than the current subsystem crapping out with NOTIFY_BAD. Badness can occur within the CPU_UP_CANCELED code path at slab if this happens (The same would apply for workqueue.c as well). 
To overcome this, we might have to use either a) a per-subsystem flag and avoid handling of CPU_UP_CANCELED, or b) use special notifier events like LOCK_ACQUIRE/RELEASE as Gautham was using in his experiments, or c) do not send CPU_UP_CANCELED to a subsystem which did not receive CPU_UP_PREPARE. I would prefer c). Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org> Signed-off-by: Shai Fultheim <shai@scalex86.org> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] slab debug and ARCH_SLAB_MINALIGN don't get alongKevin Hilman
When CONFIG_SLAB_DEBUG is used in combination with ARCH_SLAB_MINALIGN, some debug flags should be disabled which depend on BYTES_PER_WORD alignment. The disabling of these debug flags is not properly handled when BYTES_PER_WORD < ARCH_SLAB_MINALIGN < cache_line_size(). This patch fixes that and also adds an alignment check to cache_alloc_debugcheck_after() when ARCH_SLAB_MINALIGN is used. Signed-off-by: Kevin Hilman <khilman@mvista.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Christoph Lameter <clameter@engr.sgi.com> Cc: Manfred Spraul <manfred@colorfullife.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] htlb forget rss with pt sharingChen, Kenneth W
Imprecise RSS accounting is an irritating ill effect of pt sharing. After consulting with several VM experts, I have tried various methods to solve that problem: (1) iterate through all mm_structs that share the PT and increment count; (2) keep RSS count in page table structure and then sum them up at reporting time. Neither of the above methods yields a satisfactory implementation. Since process RSS accounting is purely informational, I propose we don't count them at all for hugetlb pages. rlimit has such a field, though there is absolutely no enforcement on limiting that resource. One other method is to account all RSS at hugetlb mmap time regardless of whether they are faulted or not. I opt for the simplicity of no accounting at all. Hugetlb pages are special: they are reserved up front in the global reservation pool and are not reclaimable. From a physical memory resource point of view, they are already consumed regardless of whether there are users using them. If the concern is that RSS can be used to control resource allocation, we already can specify a hugetlb fs size limit and the sysadmin can enforce that at mount time. Combined with the two points mentioned above, I fail to see that anything is affected by this patch. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Hugh Dickins <hugh@veritas.com> Cc: Dave McCracken <dmccr@us.ibm.com> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Gibson <david@gibson.dropbear.id.au> Cc: Adam Litke <agl@us.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: "David S. Miller" <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] shared page table for hugetlb pageChen, Kenneth W
Following up with the work on shared page table done by Dave McCracken. This set of patches targets shared page tables for hugetlb memory only. The shared page table is particularly useful in the situation of a large number of independent processes sharing large shared memory segments. In the normal page case, the amount of memory saved from process' page table is quite significant. For hugetlb, the saving on page table memory is not the primary objective (as hugetlb itself already cuts down page table overhead significantly); instead, the purpose of using shared page table on hugetlb is to allow faster TLB refill and smaller cache pollution upon TLB miss. With PT sharing, pte entries are shared among hundreds of processes, the cache consumption used by all the page table is smaller and in return, application gets much higher cache hit ratio. One other effect is that cache hit ratio with hardware page walker hitting on pte in cache will be higher and this helps to reduce tlb miss latency. These two effects contribute to higher application performance. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Hugh Dickins <hugh@veritas.com> Cc: Dave McCracken <dmccr@us.ibm.com> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Gibson <david@gibson.dropbear.id.au> Cc: Adam Litke <agl@us.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: "David S. Miller" <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] balance_pgdat() cleanupAndrew Morton
Despaghettify balance_pgdat() a bit. Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07[PATCH] mm: add arch_alloc_pageNick Piggin
Add an arch_alloc_page to match arch_free_page. Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>