From c757bea93bea4b77ebd181cc6dca60c15e3b1a2c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 21 Dec 2009 22:35:16 -0500 Subject: tracing: Fix setting tracer specific options The function __set_tracer_option() takes as its last parameter a "neg" value. If set it should negate the value of the option. The trace_options_write() passed the value written to the file which is what the new value needs to be set as. But since this is not the negative, it never sets the value. Reported-by: Peter Zijlstra Cc: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee61915935d5..d0a4c12d1f1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3949,7 +3949,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); ret = __set_tracer_option(current_trace, topt->flags, - topt->opt, val); + topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) return ret; -- cgit v1.2.3 From 4440095c8268c1a5e11577097d2be429cec036ca Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Dec 2009 21:00:20 +0100 Subject: SYSCTL: Print binary sysctl warnings (nearly) only once When printing legacy sysctls print the warning message for each of them only once. This way there is a guarantee the syslog won't be flooded for any sane program. The original attempt at this made the tables non const and stored the flag inline. Linus suggested using a separate hash table for this, this is based on a code snippet from him. The hash implies this is not exact and can sometimes not print a new sysctl due to a hash collision, but in practice this should not be a problem I used a FNV32 hash over the binary string with a 32byte bitmap. This gives relatively little collisions when all the predefined binary sysctls are hashed: size 256 bucket length number 0: [25] 1: [67] 2: [88] 3: [47] 4: [22] 5: [6] 6: [1] The worst case is a single collision of 6 hash values. Signed-off-by: Andi Kleen --- kernel/sysctl_binary.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 112533d5fc08..8f5d16e0707a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1417,6 +1417,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen) return; } +#define WARN_ONCE_HASH_BITS 8 +#define WARN_ONCE_HASH_SIZE (1< Date: Mon, 21 Dec 2009 13:02:24 +0100 Subject: kprobes: Fix distinct type warning Every time I see this: kernel/kprobes.c: In function 'register_kretprobe': kernel/kprobes.c:1038: warning: comparison of distinct pointer types lacks a cast I'm wondering if something changed in common code and we need to do something for s390. Apparently that's not the case. Let's get rid of this annoying warning. Signed-off-by: Heiko Carstens Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu LKML-Reference: <20091221120224.GA4471@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5342a344c43..b7df302a0204 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1035,7 +1035,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * num_possible_cpus()); + rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #else rp->maxactive = num_possible_cpus(); #endif -- cgit v1.2.3 From 40892367bc893f3abf6f5ca8ac2ed1c98ba26a77 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 21 Dec 2009 12:01:17 -0800 Subject: tracing: Kconfig spelling fixes and cleanups Fix filename reference (ftrace-implementation.txt -> ftrace-design.txt). Fix spelling, punctuation, grammar. Fix help text indentation and line lengths to reduce need for horizontal scrolling or larger window sizes. Signed-off-by: Randy Dunlap Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091221120117.3fb49cdc.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 112 +++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d006554888dc..6c22d8a2f289 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,17 +12,17 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_TRACER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool @@ -34,17 +34,17 @@ config HAVE_FUNCTION_GRAPH_FP_TEST config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_DYNAMIC_FTRACE bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FTRACE_MCOUNT_RECORD bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_HW_BRANCH_TRACER bool @@ -52,7 +52,7 @@ config HAVE_HW_BRANCH_TRACER config HAVE_SYSCALL_TRACEPOINTS bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config TRACER_MAX_TRACE bool @@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options. +# hiding of the automatic options. config TRACING bool @@ -119,7 +119,7 @@ menuconfig FTRACE bool "Tracers" default y if DEBUG_KERNEL help - Enable the kernel tracing infrastructure. + Enable the kernel tracing infrastructure. if FTRACE @@ -133,7 +133,7 @@ config FUNCTION_TRACER help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation - instruction to the beginning of every kernel function, which NOP + instruction at the beginning of every kernel function, which NOP sequence is then dynamically patched into a tracer call when tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very @@ -150,7 +150,7 @@ config FUNCTION_GRAPH_TRACER and its entry. Its first purpose is to trace the duration of functions and draw a call graph for each thread with some information like - the return value. This is done by setting the current return + the return value. This is done by setting the current return address on the current task structure into a stack of calls. @@ -173,7 +173,7 @@ config IRQSOFF_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the preempt-off timing option can be used together or separately.) @@ -186,7 +186,7 @@ config PREEMPT_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP help - This option measures the time spent in preemption off critical + This option measures the time spent in preemption-off critical sections, with microsecond accuracy. The default measurement method is a maximum search, which is @@ -195,7 +195,7 @@ config PREEMPT_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the irqs-off timing option can be used together or separately.) @@ -222,7 +222,7 @@ config ENABLE_DEFAULT_TRACERS depends on !GENERIC_TRACER select TRACING help - This tracer hooks to various trace points in the kernel + This tracer hooks to various trace points in the kernel, allowing the user to pick and choose which trace point they want to trace. It also includes the sched_switch tracer plugin. @@ -265,19 +265,19 @@ choice The likely/unlikely profiler only looks at the conditions that are annotated with a likely or unlikely macro. - The "all branch" profiler will profile every if statement in the + The "all branch" profiler will profile every if-statement in the kernel. This profiler will also enable the likely/unlikely - profiler as well. + profiler. - Either of the above profilers add a bit of overhead to the system. - If unsure choose "No branch profiling". + Either of the above profilers adds a bit of overhead to the system. + If unsure, choose "No branch profiling". config BRANCH_PROFILE_NONE bool "No branch profiling" help - No branch profiling. Branch profiling adds a bit of overhead. - Only enable it if you want to analyse the branching behavior. - Otherwise keep it disabled. + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" @@ -288,7 +288,7 @@ config PROFILE_ANNOTATED_BRANCHES /sys/kernel/debug/tracing/profile_annotated_branch - Note: this will add a significant overhead, only turn this + Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES @@ -305,7 +305,7 @@ config PROFILE_ALL_BRANCHES This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system - is to be analyzed + is to be analyzed in much detail. endchoice config TRACING_BRANCHES @@ -335,7 +335,7 @@ config POWER_TRACER depends on X86 select GENERIC_TRACER help - This tracer helps developers to analyze and optimize the kernels + This tracer helps developers to analyze and optimize the kernel's power management decisions, specifically the C-state and P-state behavior. @@ -391,14 +391,14 @@ config HW_BRANCH_TRACER select GENERIC_TRACER help This tracer records all branches on the system in a circular - buffer giving access to the last N branches for each cpu. + buffer, giving access to the last N branches for each cpu. config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER help kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected data is then fed to the userspace application in order to analyse allocation hotspots, internal fragmentation and so on, making it possible to see how well an allocator performs, as well as debug @@ -417,15 +417,15 @@ config WORKQUEUE_TRACER bool "Trace workqueues" select GENERIC_TRACER help - The workqueue tracer provides some statistical informations + The workqueue tracer provides some statistical information about each cpu workqueue thread such as the number of the works inserted and executed since their creation. It can help - to evaluate the amount of work each of them have to perform. + to evaluate the amount of work each of them has to perform. For example it can help a developer to decide whether he should - choose a per cpu workqueue instead of a singlethreaded one. + choose a per-cpu workqueue instead of a singlethreaded one. config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" + bool "Support for tracing block IO actions" depends on SYSFS depends on BLOCK select RELAY @@ -456,15 +456,15 @@ config KPROBE_EVENT select TRACING default y help - This allows the user to add tracing events (similar to tracepoints) on the fly - via the ftrace interface. See Documentation/trace/kprobetrace.txt - for more details. + This allows the user to add tracing events (similar to tracepoints) + on the fly via the ftrace interface. See + Documentation/trace/kprobetrace.txt for more details. Those events can be inserted wherever kprobes can probe, and record various register and memory values. - This option is also required by perf-probe subcommand of perf tools. If - you want to use perf tools, this option is strongly recommended. + This option is also required by perf-probe subcommand of perf tools. + If you want to use perf tools, this option is strongly recommended. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" @@ -472,32 +472,32 @@ config DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replaces them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replace them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. - This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise - has native performance as long as no tracing is active. + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. A file is created - in debugfs called function_profile_enabled which defaults to zero. - When a 1 is echoed into this file profiling begins, and when a - zero is entered, profiling stops. A file in the trace_stats - directory called functions, that show the list of functions that - have been hit and their counters. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stats directory; this file shows the list of functions that + have been hit and their counters. - If in doubt, say N + If in doubt, say N. config FTRACE_MCOUNT_RECORD def_bool y @@ -556,8 +556,8 @@ config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER help - This option creates a test to stress the ring buffer and bench mark it. - It creates its own ring buffer such that it will not interfer with + This option creates a test to stress the ring buffer and benchmark it. + It creates its own ring buffer such that it will not interfere with any other users of the ring buffer (such as ftrace). It then creates a producer and consumer that will run for 10 seconds and sleep for 10 seconds. Each interval it will print out the number of events @@ -566,7 +566,7 @@ config RING_BUFFER_BENCHMARK It does not disable interrupts or raise its priority, so it may be affected by processes that are running. - If unsure, say N + If unsure, say N. endif # FTRACE -- cgit v1.2.3 From 88f7a890d74137ab0d126a5d65679cd620f1a289 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:22:22 +0800 Subject: ksym_tracer: Fix to make the tracer work ksym tracer doesn't work: # echo tasklist_lock:rw- > ksym_trace_filter -bash: echo: write error: No such device It's because we pass to perf_event_create_kernel_counter() a cpu number which is not present. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF19E.1010201@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 10 +++++++--- kernel/trace/trace_ksym.c | 1 - 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 366eedf949c0..48fb0bb6992a 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -388,7 +389,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, if (!cpu_events) return ERR_PTR(-ENOMEM); - for_each_possible_cpu(cpu) { + get_online_cpus(); + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); @@ -399,18 +401,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, goto fail; } } + put_online_cpus(); return cpu_events; fail: - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); if (IS_ERR(*pevent)) break; unregister_hw_breakpoint(*pevent); } + put_online_cpus(); + free_percpu(cpu_events); - /* return the error if any */ return ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index faf37fa4408c..340b6ff193e0 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -197,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) entry->attr.bp_addr = addr; entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - ret = -EAGAIN; entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, ksym_hbp_handler); -- cgit v1.2.3 From 3d13ec2efdb5843ad91e57b60d50b44d922cf063 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:23:19 +0800 Subject: ksym_tracer: Fix to allow writing newline to ksym_trace_filter It used to work, but now doesn't: # echo > ksym_filter bash: echo: write error: Invalid argument It's caused by d954fbf0ff6b5fdfb32350e85a2f15d3db976506 ("tracing: Fix wrong usage of strstrip in trace_ksyms"). Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF1D7.5040400@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 340b6ff193e0..160a8d8b37a2 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -299,8 +299,8 @@ static ssize_t ksym_trace_filter_write(struct file *file, * 2: echo 0 > ksym_trace_filter * 3: echo "*:---" > ksym_trace_filter */ - if (!buf[0] || !strcmp(buf, "0") || - !strcmp(buf, "*:---")) { + if (!input_string[0] || !strcmp(input_string, "0") || + !strcmp(input_string, "*:---")) { __ksym_trace_reset(); ret = 0; goto out; -- cgit v1.2.3 From e6d9491bf8ba6728cc86aeabbc688d20ec0563b5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:23:40 +0800 Subject: ksym_tracer: Fix race when incrementing count We are under rcu read section but not holding the write lock, so count++ is not atomic. Use atomic64_t instead. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF1EC.9010608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 160a8d8b37a2..67d79f709fc5 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -32,6 +32,8 @@ #include #include +#include + /* * For now, let us restrict the no. of symbols traced simultaneously to number * of available hardware breakpoint registers. @@ -44,7 +46,7 @@ struct trace_ksym { struct perf_event **ksym_hbp; struct perf_event_attr attr; #ifdef CONFIG_PROFILE_KSYM_TRACER - unsigned long counter; + atomic64_t counter; #endif struct hlist_node ksym_hlist; }; @@ -69,9 +71,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) rcu_read_lock(); hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if ((entry->attr.bp_addr == hbp_hit_addr) && - (entry->counter <= MAX_UL_INT)) { - entry->counter++; + if (entry->attr.bp_addr == hbp_hit_addr) { + atomic64_inc(&entry->counter); break; } } @@ -501,7 +502,8 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v) seq_printf(m, " %-36s", fn_name); else seq_printf(m, " %-36s", ""); - seq_printf(m, " %15lu\n", entry->counter); + seq_printf(m, " %15llu\n", + (unsigned long long)atomic64_read(&entry->counter)); return 0; } -- cgit v1.2.3 From 53ab668064edaeef99c0ee22799483d45f4c81f6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:24:03 +0800 Subject: ksym_tracer: Remove trace_stat trace_stat is problematic. Don't use it, use seqfile instead. This fixes a race that reading the stat file is not protected by any lock, which can lead to use after free. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF203.40200@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 127 ++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 67d79f709fc5..94103cdcf9d8 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -26,7 +26,6 @@ #include #include "trace_output.h" -#include "trace_stat.h" #include "trace.h" #include @@ -444,103 +443,77 @@ struct tracer ksym_tracer __read_mostly = .print_line = ksym_trace_output }; -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - d_tracer = tracing_init_dentry(); - ksym_filter_entry_count = 0; - - entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'ksym_trace_filter' file\n"); - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); - - #ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_tracer_stat_headers(struct seq_file *m) +static int ksym_profile_show(struct seq_file *m, void *v) { + struct hlist_node *node; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + seq_puts(m, " Access Type "); seq_puts(m, " Symbol Counter\n"); seq_puts(m, " ----------- "); seq_puts(m, " ------ -------\n"); - return 0; -} -static int ksym_tracer_stat_show(struct seq_file *m, void *v) -{ - struct hlist_node *stat = v; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + access_type = entry->attr.bp_type; - access_type = entry->attr.bp_type; + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R "); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } - switch (access_type) { - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); + if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", ""); + seq_printf(m, " %15llu\n", + (unsigned long long)atomic64_read(&entry->counter)); } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", ""); - seq_printf(m, " %15llu\n", - (unsigned long long)atomic64_read(&entry->counter)); + rcu_read_unlock(); return 0; } -static void *ksym_tracer_stat_start(struct tracer_stat *trace) +static int ksym_profile_open(struct inode *node, struct file *file) { - return ksym_filter_head.first; -} - -static void * -ksym_tracer_stat_next(void *v, int idx) -{ - struct hlist_node *stat = v; - - return stat->next; + return single_open(file, ksym_profile_show, NULL); } -static struct tracer_stat ksym_tracer_stats = { - .name = "ksym_tracer", - .stat_start = ksym_tracer_stat_start, - .stat_next = ksym_tracer_stat_next, - .stat_headers = ksym_tracer_stat_headers, - .stat_show = ksym_tracer_stat_show +static const struct file_operations ksym_profile_fops = { + .open = ksym_profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; +#endif /* CONFIG_PROFILE_KSYM_TRACER */ -__init static int ksym_tracer_stat_init(void) +__init static int init_ksym_trace(void) { - int ret; + struct dentry *d_tracer; - ret = register_stat_tracer(&ksym_tracer_stats); - if (ret) { - printk(KERN_WARNING "Warning: could not register " - "ksym tracer stats\n"); - return 1; - } + d_tracer = tracing_init_dentry(); - return 0; + trace_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + trace_create_file("ksym_profile", 0444, d_tracer, + NULL, &ksym_profile_fops); +#endif + + return register_tracer(&ksym_tracer); } -fs_initcall(ksym_tracer_stat_init); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ +device_initcall(init_ksym_trace); -- cgit v1.2.3 From 79b408210885b9f7f0b067b07a09d68f4da3a700 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:19 +0800 Subject: tracing/kprobe: Show sign of fields in trace_kprobe format files The format files of trace_kprobe do not show the sign of the fields. The other format files show the field signed type of the fields and this patch makes the trace_kprobe formats consistent with the others. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D27.5040009@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ecab06547a5..83f1e6ef7063 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1182,10 +1182,11 @@ static int __probe_event_show_format(struct trace_seq *s, #undef SHOW_FIELD #define SHOW_FIELD(type, item, name) \ do { \ - ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ - "offset:%u;\tsize:%u;\n", name, \ + ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ + "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type)); \ + (unsigned int)sizeof(type), \ + is_signed_type(type)); \ if (!ret) \ return 0; \ } while (0) -- cgit v1.2.3 From fb7ae981cb9fe8665b9da97e8734745e030c151d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:38 +0800 Subject: tracing: Fix sign fields in ftrace_define_fields_##call() Add is_signed_type() call to trace_define_field() in ftrace macros. The code previously just passed in 0 (false), disregarding whether or not the field was actually a signed type. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D3A.6020007@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 458e5bfe26d0..d4fa5dc1ee4e 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -158,7 +158,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0, FILTER_OTHER); \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; @@ -168,8 +169,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), \ container.item), \ - sizeof(field.container.item), 0, \ - FILTER_OTHER); \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -- cgit v1.2.3 From 05cbaa2853cdfc255fdd04e65a82bfe9208c4e52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Dec 2009 16:00:35 +0100 Subject: perf: Fix NULL deref in inheritance code Liming found a NULL deref when a task has a perf context but no counters when it forks. This can occur in two cases, a race during construction where the fork hits after installing the context but before the first counter gets inserted, or more reproducably, a fork after the last counter is closed (which leaves the context around). Reported-by: Wang Liming Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras CC: LKML-Reference: <1262185684.7135.222.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 03cc061398d1..58ed1dae5875 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5148,7 +5148,7 @@ int perf_event_init_task(struct task_struct *child) GFP_KERNEL); if (!child_ctx) { ret = -ENOMEM; - goto exit; + break; } __perf_event_init_context(child_ctx, child); @@ -5164,7 +5164,7 @@ int perf_event_init_task(struct task_struct *child) } } - if (inherited_all) { + if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. @@ -5184,7 +5184,6 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } -exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); -- cgit v1.2.3 From 10b465aaf9536ee5a16652fa0700740183d48ec9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 19 Dec 2009 14:43:01 +0000 Subject: modules: Skip empty sections when exporting section notes Commit 35dead4 "modules: don't export section names of empty sections via sysfs" changed the set of sections that have attributes, but did not change the iteration over these attributes in add_notes_attrs(). This can lead to add_notes_attrs() creating attributes with the wrong names or with null name pointers. Introduce a sect_empty() function and use it in both add_sect_attrs() and add_notes_attrs(). Reported-by: Martin Michlmayr Signed-off-by: Ben Hutchings Tested-by: Martin Michlmayr Cc: stable@kernel.org Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e96b8ed1cb6a..f82386bd9ee9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1010,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, * J. Corbet */ #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) + +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} + struct module_sect_attr { struct module_attribute mattr; @@ -1051,8 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC - && sechdrs[i].sh_size) + if (!sect_empty(&sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1070,9 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; for (i = 0; i < nsect; i++) { - if (! (sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!sechdrs[i].sh_size) + if (sect_empty(&sechdrs[i])) continue; sattr->address = sechdrs[i].sh_addr; sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, @@ -1156,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures. */ notes = 0; for (i = 0; i < nsect; i++) - if ((sechdrs[i].sh_flags & SHF_ALLOC) && + if (!sect_empty(&sechdrs[i]) && (sechdrs[i].sh_type == SHT_NOTE)) ++notes; @@ -1172,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = ¬es_attrs->attrs[0]; for (loaded = i = 0; i < nsect; ++i) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (sect_empty(&sechdrs[i])) continue; if (sechdrs[i].sh_type == SHT_NOTE) { nattr->attr.name = mod->sect_attrs->attrs[loaded].name; -- cgit v1.2.3 From 5ded3dc6a3c7549b36a8ac27bbd81b33756a2c29 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Wed, 6 Jan 2010 17:12:07 -0800 Subject: ring-buffer: Wrap a list.next reference with rb_list_head() This reference at the end of rb_get_reader_page() was causing off-by-one writes to the prev pointer of the page after the reader page when that page is the head page, and therefore the reader page has the RB_PAGE_HEAD flag in its list.next pointer. This eventually results in a GPF in a subsequent call to rb_set_head_page() (usually from rb_get_reader_page()) when that prev pointer is dereferenced. The dereferenced register would characteristically have an address that appears shifted left by one byte (eg, ffxxxxxxxxxxxxyy instead of ffffxxxxxxxxxxxx) due to being written at an address one byte too high. Signed-off-by: David Sharp LKML-Reference: <1262826727-9090-1-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2326b04c95c4..d5b7308b7e1b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2906,7 +2906,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * * Now make the new head point back to the reader page. */ - reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ -- cgit v1.2.3 From 0e1ff5d72a6393f2ef5dbf74f58bb55a12d63834 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jan 2010 20:40:44 -0500 Subject: ring-buffer: Add rb_list_head() wrapper around new reader page next field If the very unlikely case happens where the writer moves the head by one between where the head page is read and where the new reader page is assigned _and_ the writer then writes and wraps the entire ring buffer so that the head page is back to what was originally read as the head page, the page to be swapped will have a corrupted next pointer. Simple solution is to wrap the assignment of the next pointer with a rb_list_head(). Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d5b7308b7e1b..edefe3b2801b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2869,7 +2869,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); - cpu_buffer->reader_page->list.next = reader->list.next; + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; /* -- cgit v1.2.3 From 8767ba2796a1c894e6d9524584a26a8224f0543d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 8 Jan 2010 14:42:38 -0800 Subject: kmod: fix resource leak in call_usermodehelper_pipe() Fix resource (write-pipe file) leak in call_usermodehelper_pipe(). When call_usermodehelper_exec() fails, write-pipe file is opened and call_usermodehelper_pipe() just returns an error. Since it is hard for caller to determine whether the error occured when opening the pipe or executing the helper, the caller cannot close the pipe by themselves. I've found this resoruce leak when testing coredump. You can check how the resource leaks as below; $ echo "|nocommand" > /proc/sys/kernel/core_pattern $ ulimit -c unlimited $ while [ 1 ]; do ./segv; done &> /dev/null & $ cat /proc/meminfo (<- repeat it) where segv.c is; //----- int main () { char *p = 0; *p = 1; } //----- This patch closes write-pipe file if call_usermodehelper_exec() failed. Signed-off-by: Masami Hiramatsu Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 25b103190364..bf0e231d9702 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, return -ENOMEM; ret = call_usermodehelper_stdinpipe(sub_info, filp); - if (ret < 0) - goto out; + if (ret < 0) { + call_usermodehelper_freeinfo(sub_info); + return ret; + } - return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (ret < 0) /* Failed to execute helper, close pipe */ + filp_close(*filp, NULL); - out: - call_usermodehelper_freeinfo(sub_info); return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); -- cgit v1.2.3 From bd4f490a079730aadfaf9a728303ea0135c01945 Mon Sep 17 00:00:00 2001 From: Dave Anderson Date: Fri, 8 Jan 2010 14:42:50 -0800 Subject: cgroups: fix 2.6.32 regression causing BUG_ON() in cgroup_diput() The LTP cgroup test suite generates a "kernel BUG at kernel/cgroup.c:790!" here in cgroup_diput(): /* * if we're getting rid of the cgroup, refcount should ensure * that there are no pidlists left. */ BUG_ON(!list_empty(&cgrp->pidlists)); The cgroup pidlist rework in 2.6.32 generates the BUG_ON, which is caused when pidlist_array_load() calls cgroup_pidlist_find(): (1) if a matching cgroup_pidlist is found, it down_write's the mutex of the pre-existing cgroup_pidlist, and increments its use_count. (2) if no matching cgroup_pidlist is found, then a new one is allocated, it down_write's its mutex, and the use_count is set to 0. (3) the matching, or new, cgroup_pidlist gets returned back to pidlist_array_load(), which increments its use_count -- regardless whether new or pre-existing -- and up_write's the mutex. So if a matching list is ever encountered by cgroup_pidlist_find() during the life of a cgroup directory, it results in an inflated use_count value, preventing it from ever getting released by cgroup_release_pid_array(). Then if the directory is subsequently removed, cgroup_diput() hits the BUG_ON() when it finds that the directory's cgroup is still populated with a pidlist. The patch simply removes the use_count increment when a matching pidlist is found by cgroup_pidlist_find(), because it gets bumped by the calling pidlist_array_load() function while still protected by the list's mutex. Signed-off-by: Dave Anderson Reviewed-by: Li Zefan Acked-by: Ben Blum Cc: Paul Menage Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0249f4be9b5c..1fbcc748044a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2468,7 +2468,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, /* make sure l doesn't vanish out from under us */ down_write(&l->mutex); mutex_unlock(&cgrp->pidlist_mutex); - l->use_count++; return l; } } -- cgit v1.2.3 From b45c6e76bc2c72f6426c14bed64fdcbc9bf37cb0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 8 Jan 2010 14:42:52 -0800 Subject: kernel/signal.c: fix kernel information leak with print-fatal-signals=1 When print-fatal-signals is enabled it's possible to dump any memory reachable by the kernel to the log by simply jumping to that address from user space. Or crash the system if there's some hardware with read side effects. The fatal signals handler will dump 16 bytes at the execution address, which is fully controlled by ring 3. In addition when something jumps to a unmapped address there will be up to 16 additional useless page faults, which might be potentially slow (and at least is not very efficient) Fortunately this option is off by default and only there on i386. But fix it by checking for kernel addresses and also stopping when there's a page fault. Signed-off-by: Andi Kleen Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d09692b40376..934ae5e687b9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -979,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) for (i = 0; i < 16; i++) { unsigned char insn; - __get_user(insn, (unsigned char *)(regs->ip + i)); + if (get_user(insn, (unsigned char *)(regs->ip + i))) + break; printk("%02x ", insn); } } -- cgit v1.2.3 From 7485d0d3758e8e6491a5c9468114e74dc050785d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 5 Jan 2010 16:32:43 +0900 Subject: futexes: Remove rw parameter from get_futex_key() Currently, futexes have two problem: A) The current futex code doesn't handle private file mappings properly. get_futex_key() uses PageAnon() to distinguish file and anon, which can cause the following bad scenario: 1) thread-A call futex(private-mapping, FUTEX_WAIT), it sleeps on file mapping object. 2) thread-B writes a variable and it makes it cow. 3) thread-B calls futex(private-mapping, FUTEX_WAKE), it wakes up blocked thread on the anonymous page. (but it's nothing) B) Current futex code doesn't handle zero page properly. Read mode get_user_pages() can return zero page, but current futex code doesn't handle it at all. Then, zero page makes infinite loop internally. The solution is to use write mode get_user_page() always for page lookup. It prevents the lookup of both file page of private mappings and zero page. Performance concerns: Probaly very little, because glibc always initialize variables for futex before to call futex(). It means glibc users never see the overhead of this patch. Compatibility concerns: This patch has few compatibility issues. After this patch, FUTEX_WAIT require writable access to futex variables (read-only mappings makes EFAULT). But practically it's not a problem, glibc always initalizes variables for futexes explicitly - nobody uses read-only mappings. Reported-by: Hugh Dickins Signed-off-by: KOSAKI Motohiro Acked-by: Peter Zijlstra Acked-by: Darren Hart Cc: Cc: Linus Torvalds Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: Ulrich Drepper LKML-Reference: <20100105162633.45A2.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8e3c3ffe1b9a..d9b3a2228f9d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, - * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock. */ static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) return -EFAULT; key->private.mm = mm; key->private.address = address; @@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: - err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); + err = get_user_pages_fast(address, 1, 1, &page); if (err < 0) return err; @@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1175,11 +1173,10 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, - requeue_pi ? VERIFY_WRITE : VERIFY_READ); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, */ retry: q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); + ret = get_futex_key(uaddr, fshared, &q->key); if (unlikely(ret != 0)) return ret; @@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.requeue_pi_key = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out; @@ -2023,7 +2020,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, rt_waiter.task = NULL; key2 = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; -- cgit v1.2.3 From 751e9983ee276cb150e8812b1d995f6035a63878 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:02 +0800 Subject: ftrace: Fix MATCH_END_ONLY function filter For '*foo' pattern, we should allow any string ending with 'foo', but ftrace filter incorrectly disallows strings like bar_foo_foo: # echo '*io' > set_ftrace_filter # cat set_ftrace_filter | grep 'req_bio_endio' # cat available_filter_functions | grep 'req_bio_endio' req_bio_endio Signed-off-by: Li Zefan LKML-Reference: <4B4E870E.6060607@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7968762c8167..1e6640f80454 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1690,7 +1690,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; - char *ptr; + int slen; switch (type) { case MATCH_FULL: @@ -1706,8 +1706,8 @@ static int ftrace_match(char *str, char *regex, int len, int type) matched = 1; break; case MATCH_END_ONLY: - ptr = strstr(str, regex); - if (ptr && (ptr[len] == 0)) + slen = strlen(str); + if (slen >= len && memcmp(str + slen - len, regex, len) == 0) matched = 1; break; } -- cgit v1.2.3 From 285caad415f459f336247932b4db95a571357a02 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:21 +0800 Subject: tracing/filters: Fix MATCH_FRONT_ONLY filter matching MATCH_FRONT_ONLY actually is a full matching: # ./perf record -R -f -a -e lock:lock_acquire \ --filter 'name ~rcu_*' sleep 1 # ./perf trace (no output) We should pass the length of the pattern string to strncmp(). Signed-off-by: Li Zefan LKML-Reference: <4B4E8721.5090301@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 50504cb228de..11c3973e6552 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -261,7 +261,7 @@ static int regex_match_full(char *str, struct regex *r, int len) static int regex_match_front(char *str, struct regex *r, int len) { - if (strncmp(str, r->pattern, len) == 0) + if (strncmp(str, r->pattern, r->len) == 0) return 1; return 0; } -- cgit v1.2.3 From a3291c14ecf0a995e30d993b7f2cae031de98727 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:53:41 +0800 Subject: tracing/filters: Fix MATCH_END_ONLY filter matching For '*foo' pattern, we should allow any string ending with 'foo', but event filtering incorrectly disallows strings like bar_foo_foo: Signed-off-by: Li Zefan LKML-Reference: <4B4E8735.6070604@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 11c3973e6552..49e44dd17851 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -275,9 +275,10 @@ static int regex_match_middle(char *str, struct regex *r, int len) static int regex_match_end(char *str, struct regex *r, int len) { - char *ptr = strstr(str, r->pattern); + int strlen = len - 1; - if (ptr && (ptr[r->len] == 0)) + if (strlen >= r->len && + memcmp(str + strlen - r->len, r->pattern, r->len) == 0) return 1; return 0; } -- cgit v1.2.3 From b2af211f284eb1bef19fbb85fc8ef551bb1e7460 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:11 +0800 Subject: tracing/filters: Fix MATCH_MIDDLE_ONLY filter matching The @str might not be NULL-terminated if it's of type DYN_STRING or STATIC_STRING, so we should use strnstr() instead of strstr(). Signed-off-by: Li Zefan LKML-Reference: <4B4E8753.2000102@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 49e44dd17851..f364b085397e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -268,7 +268,7 @@ static int regex_match_front(char *str, struct regex *r, int len) static int regex_match_middle(char *str, struct regex *r, int len) { - if (strstr(str, r->pattern)) + if (strnstr(str, r->pattern, len)) return 1; return 0; } -- cgit v1.2.3 From 16da27a8bc7a0d050686d1b2e9efb53fab9ed226 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:27 +0800 Subject: tracing/filters: Fix MATCH_FULL filter matching for PTR_STRING MATCH_FULL matching for PTR_STRING is not working correctly: # echo 'func == vt' > events/bkl/lock_kernel/filter # echo 1 > events/bkl/lock_kernel/enable ... # cat trace Xorg-1484 [000] 1973.392586: lock_kernel: ... func=vt_ioctl() gpm-1402 [001] 1974.027740: lock_kernel: ... func=vt_ioctl() We should pass to regex.match(..., len) the length (including '\0') of the source string instead of the length of the pattern string. Signed-off-by: Li Zefan LKML-Reference: <4B4E8763.5070707@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f364b085397e..60c2a4efad4a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, { char **addr = (char **)(event + pred->offset); int cmp, match; + int len = strlen(*addr) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); + cmp = pred->regex.match(*addr, &pred->regex, len); match = cmp ^ pred->not; @@ -782,10 +783,8 @@ static int filter_add_pred(struct filter_parse_state *ps, pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; - else { + else fn = filter_pred_pchar; - pred->regex.field_len = strlen(pred->regex.pattern); - } } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); -- cgit v1.2.3 From d1303dd1d6b220cab375f24fa91a5640e54e169e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 14 Jan 2010 10:54:40 +0800 Subject: tracing/filters: Add comment for match callbacks We should be clear on 2 things: - the length parameter of a match callback includes tailing '\0'. - the string to be searched might not be NULL-terminated. Signed-off-by: Li Zefan LKML-Reference: <4B4E8770.7000608@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 60c2a4efad4a..e42af9aad69f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } -/* Basic regex callbacks */ +/* + * regex_match_foo - Basic regex callbacks + * + * @str: the string to be searched + * @r: the regex structure containing the pattern string + * @len: the length of the string to be searched (including '\0') + * + * Note: + * - @str might not be NULL-terminated if it's of type DYN_STRING + * or STATIC_STRING + */ + static int regex_match_full(char *str, struct regex *r, int len) { if (strncmp(str, r->pattern, len) == 0) -- cgit v1.2.3 From 8ecc2951534af10e04ddb5e5ff5c6d217b79f5c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:12 -0800 Subject: kfifo: use void * pointers for user buffers The pointers to user buffers are currently unsigned char *, which requires a lot of casting in the caller for any non-char typed buffers. Use void * instead. Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index e92d519f93b1..ab615e695052 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -28,7 +28,7 @@ #include #include -static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, +static void _kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { fifo->buffer = buffer; @@ -44,7 +44,7 @@ static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, * @size: the size of the internal buffer, this have to be a power of 2. * */ -void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) +void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); @@ -235,7 +235,7 @@ EXPORT_SYMBOL(__kfifo_in_n); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, +unsigned int kfifo_in(struct kfifo *fifo, const void *from, unsigned int len) { len = min(kfifo_avail(fifo), len); @@ -277,7 +277,7 @@ EXPORT_SYMBOL(__kfifo_out_n); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) { len = min(kfifo_len(fifo), len); -- cgit v1.2.3 From 64ce1037c5434b1d036cd99ecaee6e00496bc2e9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:15 -0800 Subject: kfifo: sanitize *_user error handling Right now for kfifo_*_user it's not easily possible to distingush between a user copy failing and the FIFO not containing enough data. The problem is that both conditions are multiplexed into the same return code. Avoid this by moving the "copy length" into a separate output parameter and only return 0/-EFAULT in the main return value. I didn't fully adapt the weird "record" variants, those seem to be unused anyways and were rather messy (should they be just removed?) I would appreciate some double checking if I did all the conversions correctly. Signed-off-by: Andi Kleen Cc: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 76 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index ab615e695052..b50bb622e8b0 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -159,8 +159,9 @@ static inline void __kfifo_out_data(struct kfifo *fifo, memcpy(to + l, fifo->buffer, len - l); } -static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off) +static inline int __kfifo_from_user_data(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned *lenout) { unsigned int l; int ret; @@ -177,16 +178,20 @@ static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); ret = copy_from_user(fifo->buffer + off, from, l); - - if (unlikely(ret)) - return ret + len - l; + if (unlikely(ret)) { + *lenout = ret; + return -EFAULT; + } + *lenout = l; /* then put the rest (if any) at the beginning of the buffer */ - return copy_from_user(fifo->buffer, from + l, len - l); + ret = copy_from_user(fifo->buffer, from + l, len - l); + *lenout += ret ? ret : len - l; + return ret ? -EFAULT : 0; } -static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int off) +static inline int __kfifo_to_user_data(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int off, unsigned *lenout) { unsigned int l; int ret; @@ -203,12 +208,21 @@ static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - off); ret = copy_to_user(to, fifo->buffer + off, l); - - if (unlikely(ret)) - return ret + len - l; + *lenout = l; + if (unlikely(ret)) { + *lenout -= ret; + return -EFAULT; + } /* then get the rest (if any) from the beginning of the buffer */ - return copy_to_user(to + l, fifo->buffer, len - l); + len -= l; + ret = copy_to_user(to + l, fifo->buffer, len); + if (unlikely(ret)) { + *lenout += len - ret; + return -EFAULT; + } + *lenout += len; + return 0; } unsigned int __kfifo_in_n(struct kfifo *fifo, @@ -299,10 +313,13 @@ EXPORT_SYMBOL(__kfifo_out_generic); unsigned int __kfifo_from_user_n(struct kfifo *fifo, const void __user *from, unsigned int len, unsigned int recsize) { + unsigned total; + if (kfifo_avail(fifo) < len + recsize) return len + 1; - return __kfifo_from_user_data(fifo, from, len, recsize); + __kfifo_from_user_data(fifo, from, len, recsize, &total); + return total; } EXPORT_SYMBOL(__kfifo_from_user_n); @@ -313,18 +330,21 @@ EXPORT_SYMBOL(__kfifo_from_user_n); * @len: the length of the data to be added. * * This function copies at most @len bytes from the @from into the - * FIFO depending and returns the number of copied bytes. + * FIFO depending and returns -EFAULT/0. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len) +int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned *total) { + int ret; len = min(kfifo_avail(fifo), len); - len -= __kfifo_from_user_data(fifo, from, len, 0); + ret = __kfifo_from_user_data(fifo, from, len, 0, total); + if (ret) + return ret; __kfifo_add_in(fifo, len); - return len; + return 0; } EXPORT_SYMBOL(kfifo_from_user); @@ -339,17 +359,17 @@ unsigned int __kfifo_to_user_n(struct kfifo *fifo, void __user *to, unsigned int len, unsigned int reclen, unsigned int recsize) { - unsigned int ret; + unsigned int ret, total; if (kfifo_len(fifo) < reclen + recsize) return len; - ret = __kfifo_to_user_data(fifo, to, reclen, recsize); + ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); if (likely(ret == 0)) __kfifo_add_out(fifo, reclen + recsize); - return ret; + return total; } EXPORT_SYMBOL(__kfifo_to_user_n); @@ -358,20 +378,22 @@ EXPORT_SYMBOL(__kfifo_to_user_n); * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. + @ @lenout: pointer to output variable with copied data * * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. + * @to buffer and 0 or -EFAULT. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len) +int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned *lenout) { + int ret; len = min(kfifo_len(fifo), len); - len -= __kfifo_to_user_data(fifo, to, len, 0); - __kfifo_add_out(fifo, len); - return len; + ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); + __kfifo_add_out(fifo, *lenout); + return ret; } EXPORT_SYMBOL(kfifo_to_user); -- cgit v1.2.3 From a5b9e2c1063046421ce01dcf5ddd7ec12567f3e1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:16 -0800 Subject: kfifo: add kfifo_out_peek In some upcoming code it's useful to peek into a FIFO without permanentely removing data. This patch implements a new kfifo_out_peek() to do this. Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index b50bb622e8b0..7384f120be87 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -302,6 +302,27 @@ unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) } EXPORT_SYMBOL(kfifo_out); +/** + * kfifo_out_peek - copy some data from the FIFO, but do not remove it + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * @offset: offset into the fifo + * + * This function copies at most @len bytes at @offset from the FIFO + * into the @to buffer and returns the number of copied bytes. + * The data is not removed from the FIFO. + */ +unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, + unsigned offset) +{ + len = min(kfifo_len(fifo), len + offset); + + __kfifo_out_data(fifo, to, len, offset); + return len; +} +EXPORT_SYMBOL(kfifo_out_peek); + unsigned int __kfifo_out_generic(struct kfifo *fifo, void *to, unsigned int len, unsigned int recsize, unsigned int *total) -- cgit v1.2.3 From 5dab600e6a153ceb64832f608069e6c08185411a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Jan 2010 17:01:17 -0800 Subject: kfifo: document everywhere that size has to be power of two On my first try using them I missed that the fifos need to be power of two, resulting in a runtime bug. Document that requirement everywhere (and fix one grammar bug) Signed-off-by: Andi Kleen Acked-by: Stefani Seibold Cc: Roland Dreier Cc: Dmitry Torokhov Cc: Andy Walls Cc: Vikram Dhillon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 7384f120be87..32c5c15d750d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -41,7 +41,7 @@ static void _kfifo_init(struct kfifo *fifo, void *buffer, * kfifo_init - initialize a FIFO using a preallocated buffer * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this have to be a power of 2. + * @size: the size of the internal buffer, this has to be a power of 2. * */ void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) -- cgit v1.2.3 From af2422c42c0ff42b8b93dbb3a5fe65250fb65c40 Mon Sep 17 00:00:00 2001 From: David John Date: Fri, 15 Jan 2010 17:01:23 -0800 Subject: smp_call_function_any(): pass the node value to cpumask_of_node() The change in acpi_cpufreq to use smp_call_function_any causes a warning when it is called since the function erroneously passes the cpu id to cpumask_of_node rather than the node that the cpu is on. Fix this. cpumask_of_node(3): node > nr_node_ids(1) Pid: 1, comm: swapper Not tainted 2.6.33-rc3-00097-g2c1f189 #223 Call Trace: [] cpumask_of_node+0x23/0x58 [] smp_call_function_any+0x65/0xfa [] ? do_drv_read+0x0/0x2f [] get_cur_val+0xb0/0x102 [] get_cur_freq_on_cpu+0x74/0xc5 [] acpi_cpufreq_cpu_init+0x417/0x515 [] ? __down_write+0xb/0xd [] cpufreq_add_dev+0x278/0x922 Signed-off-by: David John Cc: Suresh Siddha Cc: Rusty Russell Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index de735a6637d0..f10408422444 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -347,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask, goto call; /* Try for same node. */ - nodemask = cpumask_of_node(cpu); + nodemask = cpumask_of_node(cpu_to_node(cpu)); for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; cpu = cpumask_next_and(cpu, nodemask, mask)) { if (cpu_online(cpu)) -- cgit v1.2.3 From 50b926e439620c469565e8be0f28be78f5fca1ce Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 4 Jan 2010 14:44:56 +0100 Subject: sched: Fix vmark regression on big machines SD_PREFER_SIBLING is set at the CPU domain level if power saving isn't enabled, leading to many cache misses on large machines as we traverse looking for an idle shared cache to wake to. Change the enabler of select_idle_sibling() to SD_SHARE_PKG_RESOURCES, and enable same at the sibling domain level. Reported-by: Lin Ming Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1262612696.15495.15.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 42ac3c9f66f6..8fe7ee81c552 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1508,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. */ - if (tmp->flags & SD_PREFER_SIBLING) + if (tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { -- cgit v1.2.3 From 6d558c3ac9b6508d26fd5cadccce51fc9d726b1c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 11 Jan 2010 14:21:25 +0800 Subject: sched: Reassign prev and switch_count when reacquire_kernel_lock() fail Assume A->B schedule is processing, if B have acquired BKL before and it need reschedule this time. Then on B's context, it will go to need_resched_nonpreemptible for reschedule. But at this time, prev and switch_count are related to A. It's wrong and will lead to incorrect scheduler statistics. Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <2674af741001102238w7b0ddcadref00d345e2181d11@mail.gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c535cc4f6428..4508fe7048be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5530,8 +5530,11 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) + if (unlikely(reacquire_kernel_lock(current) < 0)) { + prev = rq->curr; + switch_count = &prev->nivcsw; goto need_resched_nonpreemptible; + } preempt_enable_no_resched(); if (need_resched()) -- cgit v1.2.3 From fe432200abb0d64f409895168d9ad8fbb9d8e6c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2010 09:08:26 +0100 Subject: perf: Fix perf_event_do_pending() fallback callsite Paul questioned the context in which we should call perf_event_do_pending(). After looking at that I found that it should be called from IRQ context these days, however the fallback call-site is placed in softirq context. Ammend this by placing the callback in the IRQ timer path. Reported-by: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1263374859.4244.192.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/timer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 15533b792397..c61a7949387f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1198,6 +1198,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); + perf_event_do_pending(); scheduler_tick(); run_posix_cpu_timers(p); } @@ -1209,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_event_do_pending(); - hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v1.2.3 From 22e190851f8709c48baf00ed9ce6144cdc54d025 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2010 09:12:32 +0100 Subject: perf: Honour event state for aux stream data Anton reported that perf record kept receiving events even after calling ioctl(PERF_EVENT_IOC_DISABLE). It turns out that FORK,COMM and MMAP events didn't respect the disabled state and kept flowing in. Reported-by: Anton Blanchard Signed-off-by: Peter Zijlstra Tested-by: Anton Blanchard LKML-Reference: <1263459187.4244.265.camel@laptop> CC: stable@kernel.org Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 603c0d8b5df1..d27746bd3a06 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3268,6 +3268,9 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -3377,6 +3380,9 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -3494,6 +3500,9 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; -- cgit v1.2.3