From e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 14 Feb 2011 11:20:01 +0200 Subject: perf: Add cgroup support This kernel patch adds the ability to filter monitoring based on container groups (cgroups). This is for use in per-cpu mode only. The cgroup to monitor is passed as a file descriptor in the pid argument to the syscall. The file descriptor must be opened to the cgroup name in the cgroup filesystem. For instance, if the cgroup name is foo and cgroupfs is mounted in /cgroup, then the file descriptor is opened to /cgroup/foo. Cgroup mode is activated by passing PERF_FLAG_PID_CGROUP in the flags argument to the syscall. For instance to measure in cgroup foo on CPU1 assuming cgroupfs is mounted under /cgroup: struct perf_event_attr attr; int cgroup_fd, fd; cgroup_fd = open("/cgroup/foo", O_RDONLY); fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP); close(cgroup_fd); Signed-off-by: Stephane Eranian [ added perf_cgroup_{exit,attach} ] Signed-off-by: Peter Zijlstra LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'include/linux/perf_event.h') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dda5b0a3ff60..38c8b2554842 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -464,6 +464,7 @@ enum perf_callchain_context { #define PERF_FLAG_FD_NO_GROUP (1U << 0) #define PERF_FLAG_FD_OUTPUT (1U << 1) +#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ #ifdef __KERNEL__ /* @@ -471,6 +472,7 @@ enum perf_callchain_context { */ #ifdef CONFIG_PERF_EVENTS +# include # include # include #endif @@ -716,6 +718,22 @@ struct swevent_hlist { #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 +#ifdef CONFIG_CGROUP_PERF +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info *info; /* timing info, one per cpu */ +}; +#endif + /** * struct perf_event - performance event kernel representation: */ @@ -832,6 +850,11 @@ struct perf_event { struct event_filter *filter; #endif +#ifdef CONFIG_CGROUP_PERF + struct perf_cgroup *cgrp; /* cgroup event is attach to */ + int cgrp_defer_enabled; +#endif + #endif /* CONFIG_PERF_EVENTS */ }; @@ -886,6 +909,7 @@ struct perf_event_context { u64 generation; int pin_count; struct rcu_head rcu_head; + int nr_cgroups; /* cgroup events present */ }; /* @@ -905,6 +929,9 @@ struct perf_cpu_context { struct list_head rotation_list; int jiffies_interval; struct pmu *active_pmu; +#ifdef CONFIG_CGROUP_PERF + struct perf_cgroup *cgrp; +#endif }; struct perf_output_handle { @@ -1040,11 +1067,11 @@ have_event: __perf_sw_event(event_id, nr, nmi, regs, addr); } -extern atomic_t perf_task_events; +extern atomic_t perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *task) { - COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); + COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task)); } static inline @@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); + COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next)); } extern void perf_event_mmap(struct vm_area_struct *vma); -- cgit v1.2.3 From 163ec4354a5135c6c38c3f4a9b46a31900ebdf48 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2011 11:22:34 +0100 Subject: perf: Optimize throttling code By pre-computing the maximum number of samples per tick we can avoid a multiplication and a conditional since MAX_INTERRUPTS > max_samples_per_tick. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/perf_event.h') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 38c8b2554842..8ceb5a6fd9c9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1110,6 +1110,10 @@ extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; +extern int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + static inline bool perf_paranoid_tracepoint_raw(void) { return sysctl_perf_event_paranoid > -1; -- cgit v1.2.3 From a7e3ed1e470116c9d12c2f778431a481a6be8ab6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 3 Mar 2011 10:34:47 +0800 Subject: perf: Add support for supplementary event registers Change logs against Andi's original version: - Extends perf_event_attr:config to config{,1,2} (Peter Zijlstra) - Fixed a major event scheduling issue. There cannot be a ref++ on an event that has already done ref++ once and without calling put_constraint() in between. (Stephane Eranian) - Use thread_cpumask for percore allocation. (Lin Ming) - Use MSR names in the extra reg lists. (Lin Ming) - Remove redundant "c = NULL" in intel_percore_constraints - Fix comment of perf_event_attr::config1 Intel Nehalem/Westmere have a special OFFCORE_RESPONSE event that can be used to monitor any offcore accesses from a core. This is a very useful event for various tunings, and it's also needed to implement the generic LLC-* events correctly. Unfortunately this event requires programming a mask in a separate register. And worse this separate register is per core, not per CPU thread. This patch: - Teaches perf_events that OFFCORE_RESPONSE needs extra parameters. The extra parameters are passed by user space in the perf_event_attr::config1 field. - Adds support to the Intel perf_event core to schedule per core resources. This adds fairly generic infrastructure that can be also used for other per core resources. The basic code has is patterned after the similar AMD northbridge constraints code. Thanks to Stephane Eranian who pointed out some problems in the original version and suggested improvements. Signed-off-by: Andi Kleen Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra LKML-Reference: <1299119690-13991-2-git-send-email-ming.m.lin@intel.com> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux/perf_event.h') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8ceb5a6fd9c9..614615b8d42b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -225,8 +225,14 @@ struct perf_event_attr { }; __u32 bp_type; - __u64 bp_addr; - __u64 bp_len; + union { + __u64 bp_addr; + __u64 config1; /* extension of config */ + }; + union { + __u64 bp_len; + __u64 config2; /* extension of config1 */ + }; }; /* @@ -541,6 +547,9 @@ struct hw_perf_event { unsigned long event_base; int idx; int last_cpu; + unsigned int extra_reg; + u64 extra_config; + int extra_alloc; }; struct { /* software */ struct hrtimer hrtimer; -- cgit v1.2.3