diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-03-14 12:57:17 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-03-14 12:57:17 -0700 |
commit | 75013c6c52d80b2255ba273eedac013d58754b02 (patch) | |
tree | 5fd3a34b9f98f3de25a5d192beb1033dd4e4497b | |
parent | 836d7f0572ca42ac85d649235680479740743ac6 (diff) | |
parent | c8e2fe13d1d1f3a02842b7b909d4e4846a4b6a2c (diff) |
Merge tag 'perf_urgent_for_v5.12-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf fixes from Borislav Petkov:
- Make sure PMU internal buffers are flushed for per-CPU events too and
properly handle PID/TID for large PEBS.
- Handle the case properly when there's no PMU and therefore return an
empty list of perf MSRs for VMX to switch instead of reading random
garbage from the stack.
* tag 'perf_urgent_for_v5.12-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/perf: Use RET0 as default for guest_get_msrs to handle "no PMU" case
perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR
perf/core: Flush PMU internal buffers for per-CPU events
-rw-r--r-- | arch/x86/events/core.c | 15 | ||||
-rw-r--r-- | arch/x86/events/intel/core.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 2 | ||||
-rw-r--r-- | include/linux/perf_event.h | 2 | ||||
-rw-r--r-- | kernel/events/core.c | 42 |
5 files changed, 51 insertions, 15 deletions
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6ddeed3cd2ac..18df17129695 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -81,7 +81,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); -DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); +/* + * This one is magic, it will get called even when PMU init fails (because + * there is no PMU), in which case it should simply return NULL. + */ +DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -1944,13 +1948,6 @@ static void _x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } -static inline struct perf_guest_switch_msr * -perf_guest_get_msrs_nop(int *nr) -{ - *nr = 0; - return NULL; -} - static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2025,7 +2022,7 @@ static int __init init_hw_perf_events(void) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) - x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop; + x86_pmu.guest_get_msrs = (void *)&__static_call_return0; x86_pmu_static_call_update(); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5bac48d5c18e..7bbb5bb98d8c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event) if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up earlier in this path, so don't account twice diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 50810d471462..32cf8287d4a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6580,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) int i, nr_msrs; struct perf_guest_switch_msr *msrs; + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs); - if (!msrs) return; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fab42cfbd350..3f7f89ea5e51 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -606,6 +606,7 @@ struct swevent_hlist { #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 +#define PERF_ATTACH_SCHED_CB 0x20 struct perf_cgroup; struct perf_buffer; @@ -872,6 +873,7 @@ struct perf_cpu_context { struct list_head cgrp_cpuctx_entry; #endif + struct list_head sched_cb_entry; int sched_cb_usage; int online; diff --git a/kernel/events/core.c b/kernel/events/core.c index 0aeca5f3c0ac..03db40f6cba9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -3461,11 +3462,16 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - --cpuctx->sched_cb_usage; + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } @@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - cpuctx->sched_cb_usage++; + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + + this_cpu_inc(perf_sched_cb_usages); } /* @@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + + if (prev == next) + return; + + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + /* will be handled in perf_event_context_sched_in/out */ + if (cpuctx->task_ctx) + continue; + + __perf_pmu_sched_task(cpuctx, sched_in); + } +} + static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); @@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); + + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); @@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); @@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void) #ifdef CONFIG_CGROUP_PERF INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); #endif + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } |