Merge tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

 - Implement per-PMU context rescheduling to significantly improve
   single-PMU performance, and related cleanups/fixes (Peter Zijlstra
   and Namhyung Kim)

 - Fix ancient bug resulting in a lot of events being dropped
   erroneously at higher sampling frequencies (Luo Gengkun)

 - uprobes enhancements:

     - Implement RCU-protected hot path optimizations for better
       performance:

         "For baseline vs SRCU, peak througput increased from 3.7 M/s
          (million uprobe triggerings per second) up to about 8 M/s. For
          uretprobes it's a bit more modest with bump from 2.4 M/s to
          5 M/s.

          For SRCU vs RCU Tasks Trace, peak throughput for uprobes
          increases further from 8 M/s to 10.3 M/s (+28%!), and for
          uretprobes from 5.3 M/s to 5.8 M/s (+11%), as we have more
          work to do on uretprobes side.

          Even single-thread (no contention) performance is slightly
          better: 3.276 M/s to 3.396 M/s (+3.5%) for uprobes, and 2.055
          M/s to 2.174 M/s (+5.8%) for uretprobes."

          (Andrii Nakryiko et al)

     - Document mmap_lock, don't abuse get_user_pages_remote() (Oleg
       Nesterov)

     - Cleanups & fixes to prepare for future work (a rough sketch of
       the resulting registration API appears after this list):
        - Remove uprobe_register_refctr()
        - Simplify error handling for alloc_uprobe()
        - Make uprobe_register() return struct uprobe *
        - Fold __uprobe_unregister() into uprobe_unregister()
        - Shift put_uprobe() from delete_uprobe() to uprobe_unregister()
        - BPF: Fix use-after-free in bpf_uprobe_multi_link_attach()
          (Oleg Nesterov)

 - New feature & ABI extension: allow events to use PERF_SAMPLE_READ
   with inheritance, enabling sample-based profiling of a group of
   counters over a hierarchy of processes or threads (Ben Gainey);
   a usage sketch appears after this list

 - Intel uncore & power events updates:

      - Add Arrow Lake and Lunar Lake support
      - Add PERF_EV_CAP_READ_SCOPE
      - Clean up and enhance cpumask and hotplug support
        (Kan Liang)

      - Add LNL uncore iMC freerunning support
      - Use D0:F0 as a default device
        (Zhenyu Wang)

 - Intel PT: fix AUX snapshot handling race (Adrian Hunter)

 - Misc fixes and cleanups (James Clark, Jiri Olsa, Oleg Nesterov and
   Peter Zijlstra)
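
As a point of reference for the uprobe cleanups listed above, here is a rough
sketch of what a consumer of the reworked registration API looks like. The
shape follows the commit subjects (uprobe_register() returning struct uprobe *,
uprobe_register_refctr() removed, uprobe_unregister() split into a nosync/sync
pair); the exact signatures and the handler prototype are assumptions, not
verified against the final tree:

	/*
	 * Hedged sketch only: signatures approximate the reworked API
	 * described by the commit subjects above.
	 */
	#include <linux/uprobes.h>
	#include <linux/err.h>

	static int my_handler(struct uprobe_consumer *uc, struct pt_regs *regs)
	{
		/* runs when the probed instruction is hit */
		return 0;
	}

	static struct uprobe_consumer my_consumer = {
		.handler = my_handler,
	};

	static struct uprobe *my_uprobe;

	static int my_attach(struct inode *inode, loff_t offset)
	{
		/* ref_ctr_offset 0: no SDT semaphore to update */
		my_uprobe = uprobe_register(inode, offset, 0, &my_consumer);
		return IS_ERR(my_uprobe) ? PTR_ERR(my_uprobe) : 0;
	}

	static void my_detach(void)
	{
		/* unregister is now a nosync + sync pair */
		uprobe_unregister_nosync(my_uprobe, &my_consumer);
		uprobe_unregister_sync();
	}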

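As a usage sketch of the PERF_SAMPLE_READ + inherit combination mentioned
above, a sampling group leader might now be opened roughly as below. This is
illustrative only: the event and period choices are arbitrary, the kernel may
still impose additional constraints on the attr combination, and error
handling is elided:

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <string.h>
	#include <unistd.h>

	/*
	 * Sampling group leader whose samples carry the group's counter
	 * values (PERF_SAMPLE_READ), with inherit=1 so the events follow
	 * child processes/threads -- the combination this series enables.
	 */
	static int open_sampling_leader(pid_t pid)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period = 1000000;
		attr.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_READ;
		attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
		attr.inherit = 1;	/* previously rejected with PERF_SAMPLE_READ */
		attr.disabled = 1;

		/* group leader: group_fd = -1 */
		return syscall(SYS_perf_event_open, &attr, pid, -1, -1, 0);
	}
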
* tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (40 commits)
  dmaengine: idxd: Clean up cpumask and hotplug for perfmon
  iommu/vt-d: Clean up cpumask and hotplug for perfmon
  perf/x86/intel/cstate: Clean up cpumask and hotplug
  perf: Add PERF_EV_CAP_READ_SCOPE
  perf: Generic hotplug support for a PMU with a scope
  uprobes: perform lockless SRCU-protected uprobes_tree lookup
  rbtree: provide rb_find_rcu() / rb_find_add_rcu()
  perf/uprobe: split uprobe_unregister()
  uprobes: travers uprobe's consumer list locklessly under SRCU protection
  uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks
  uprobes: protected uprobe lifetime with SRCU
  uprobes: revamp uprobe refcounting and lifetime management
  bpf: Fix use-after-free in bpf_uprobe_multi_link_attach()
  perf/core: Fix small negative period being ignored
  perf: Really fix event_function_call() locking
  perf: Optimize __pmu_ctx_sched_out()
  perf: Add context time freeze
  perf: Fix event_function_call() locking
  perf: Extract a few helpers
  perf: Optimize context reschedule for single PMU cases
  ...
torvalds committed Sep 18, 2024
2 parents 941c122 + 5e645f3 commit 9f0c253
Showing 21 changed files with 1,142 additions and 853 deletions.
63 changes: 63 additions & 0 deletions arch/x86/events/core.c
@@ -41,6 +41,8 @@
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>
#include <asm/uprobes.h>
#include <asm/ibt.h>

#include "perf_event.h"

@@ -2816,6 +2818,46 @@ static unsigned long get_segment_base(unsigned int segment)
	return get_desc_base(desc);
}

#ifdef CONFIG_UPROBES
/*
 * Heuristic-based check if uprobe is installed at the function entry.
 *
 * Under assumption of user code being compiled with frame pointers,
 * `push %rbp/%ebp` is a good indicator that we indeed are.
 *
 * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
 * If we get this wrong, captured stack trace might have one extra bogus
 * entry, but the rest of stack trace will still be meaningful.
 */
static bool is_uprobe_at_func_entry(struct pt_regs *regs)
{
	struct arch_uprobe *auprobe;

	if (!current->utask)
		return false;

	auprobe = current->utask->auprobe;
	if (!auprobe)
		return false;

	/* push %rbp/%ebp */
	if (auprobe->insn[0] == 0x55)
		return true;

	/* endbr64 (64-bit only) */
	if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn))
		return true;

	return false;
}

#else
static bool is_uprobe_at_func_entry(struct pt_regs *regs)
{
	return false;
}
#endif /* CONFIG_UPROBES */

#ifdef CONFIG_IA32_EMULATION

#include <linux/compat.h>
@@ -2827,6 +2869,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
	unsigned long ss_base, cs_base;
	struct stack_frame_ia32 frame;
	const struct stack_frame_ia32 __user *fp;
	u32 ret_addr;

	if (user_64bit_mode(regs))
		return 0;
@@ -2836,6 +2879,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent

	fp = compat_ptr(ss_base + regs->bp);
	pagefault_disable();

	/* see perf_callchain_user() below for why we do this */
	if (is_uprobe_at_func_entry(regs) &&
	    !get_user(ret_addr, (const u32 __user *)regs->sp))
		perf_callchain_store(entry, ret_addr);

	while (entry->nr < entry->max_stack) {
		if (!valid_user_frame(fp, sizeof(frame)))
			break;
@@ -2864,6 +2913,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
{
	struct stack_frame frame;
	const struct stack_frame __user *fp;
	unsigned long ret_addr;

	if (perf_guest_state()) {
		/* TODO: We don't support guest os callchain now */
@@ -2887,6 +2937,19 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
		return;

	pagefault_disable();

	/*
	 * If we are called from uprobe handler, and we are indeed at the very
	 * entry to user function (which is normally a `push %rbp` instruction,
	 * under assumption of application being compiled with frame pointers),
	 * we should read return address from *regs->sp before proceeding
	 * to follow frame pointers, otherwise we'll skip immediate caller
	 * as %rbp is not yet setup.
	 */
	if (is_uprobe_at_func_entry(regs) &&
	    !get_user(ret_addr, (const unsigned long __user *)regs->sp))
		perf_callchain_store(entry, ret_addr);

	while (entry->nr < entry->max_stack) {
		if (!valid_user_frame(fp, sizeof(frame)))
			break;
3 changes: 0 additions & 3 deletions arch/x86/events/intel/bts.c
@@ -557,9 +557,6 @@ static int bts_event_init(struct perf_event *event)
	 * disabled, so disallow intel_bts driver for unprivileged
	 * users on paranoid systems since it provides trace data
	 * to the user in a zero-copy fashion.
	 *
	 * Note that the default paranoia setting permits unprivileged
	 * users to profile the kernel.
	 */
	if (event->attr.exclude_kernel) {
		ret = perf_allow_kernel(&event->attr);
142 changes: 5 additions & 137 deletions arch/x86/events/intel/cstate.c
@@ -128,10 +128,6 @@ static ssize_t __cstate_##_var##_show(struct device *dev, \
static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)

static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf);

/* Model -> events mapping */
struct cstate_model {
unsigned long core_events;
@@ -206,22 +202,9 @@ static struct attribute_group cstate_format_attr_group = {
.attrs = cstate_format_attrs,
};

static cpumask_t cstate_core_cpu_mask;
static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);

static struct attribute *cstate_cpumask_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};

static struct attribute_group cpumask_attr_group = {
.attrs = cstate_cpumask_attrs,
};

static const struct attribute_group *cstate_attr_groups[] = {
&cstate_events_attr_group,
&cstate_format_attr_group,
&cpumask_attr_group,
NULL,
};

@@ -269,8 +252,6 @@ static struct perf_msr pkg_msr[] = {
[PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
};

static cpumask_t cstate_pkg_cpu_mask;

/* cstate_module PMU */
static struct pmu cstate_module_pmu;
static bool has_cstate_module;
@@ -291,28 +272,9 @@ static struct perf_msr module_msr[] = {
[PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
};

static cpumask_t cstate_module_cpu_mask;

static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct pmu *pmu = dev_get_drvdata(dev);

if (pmu == &cstate_core_pmu)
return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
else if (pmu == &cstate_pkg_pmu)
return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
else if (pmu == &cstate_module_pmu)
return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask);
else
return 0;
}

static int cstate_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config;
int cpu;

if (event->attr.type != event->pmu->type)
return -ENOENT;
@@ -331,37 +293,24 @@ static int cstate_pmu_event_init(struct perf_event *event)
if (!(core_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = core_msr[cfg].msr;
cpu = cpumask_any_and(&cstate_core_cpu_mask,
topology_sibling_cpumask(event->cpu));
} else if (event->pmu == &cstate_pkg_pmu) {
if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
return -EINVAL;
cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
if (!(pkg_msr_mask & (1 << cfg)))
return -EINVAL;

event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

event->hw.event_base = pkg_msr[cfg].msr;
cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
topology_die_cpumask(event->cpu));
} else if (event->pmu == &cstate_module_pmu) {
if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
return -EINVAL;
cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_MODULE_EVENT_MAX);
if (!(module_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = module_msr[cfg].msr;
cpu = cpumask_any_and(&cstate_module_cpu_mask,
topology_cluster_cpumask(event->cpu));
} else {
return -ENOENT;
}

if (cpu >= nr_cpu_ids)
return -ENODEV;

event->cpu = cpu;
event->hw.config = cfg;
event->hw.idx = -1;
return 0;
@@ -412,84 +361,6 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode)
return 0;
}

/*
* Check if exiting cpu is the designated reader. If so migrate the
* events when there is a valid target available
*/
static int cstate_cpu_exit(unsigned int cpu)
{
unsigned int target;

if (has_cstate_core &&
cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) {

target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
/* Migrate events if there is a valid target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &cstate_core_cpu_mask);
perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
}
}

if (has_cstate_pkg &&
cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) {

target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
/* Migrate events if there is a valid target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
}
}

if (has_cstate_module &&
cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) {

target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu);
/* Migrate events if there is a valid target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &cstate_module_cpu_mask);
perf_pmu_migrate_context(&cstate_module_pmu, cpu, target);
}
}
return 0;
}

static int cstate_cpu_init(unsigned int cpu)
{
unsigned int target;

/*
* If this is the first online thread of that core, set it in
* the core cpu mask as the designated reader.
*/
target = cpumask_any_and(&cstate_core_cpu_mask,
topology_sibling_cpumask(cpu));

if (has_cstate_core && target >= nr_cpu_ids)
cpumask_set_cpu(cpu, &cstate_core_cpu_mask);

/*
* If this is the first online thread of that package, set it
* in the package cpu mask as the designated reader.
*/
target = cpumask_any_and(&cstate_pkg_cpu_mask,
topology_die_cpumask(cpu));
if (has_cstate_pkg && target >= nr_cpu_ids)
cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);

/*
* If this is the first online thread of that cluster, set it
* in the cluster cpu mask as the designated reader.
*/
target = cpumask_any_and(&cstate_module_cpu_mask,
topology_cluster_cpumask(cpu));
if (has_cstate_module && target >= nr_cpu_ids)
cpumask_set_cpu(cpu, &cstate_module_cpu_mask);

return 0;
}

static const struct attribute_group *core_attr_update[] = {
&group_cstate_core_c1,
&group_cstate_core_c3,
@@ -526,6 +397,7 @@ static struct pmu cstate_core_pmu = {
	.stop = cstate_pmu_event_stop,
	.read = cstate_pmu_event_update,
	.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
	.scope = PERF_PMU_SCOPE_CORE,
	.module = THIS_MODULE,
};

@@ -541,6 +413,7 @@ static struct pmu cstate_pkg_pmu = {
	.stop = cstate_pmu_event_stop,
	.read = cstate_pmu_event_update,
	.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
	.scope = PERF_PMU_SCOPE_PKG,
	.module = THIS_MODULE,
};

@@ -556,6 +429,7 @@ static struct pmu cstate_module_pmu = {
	.stop = cstate_pmu_event_stop,
	.read = cstate_pmu_event_update,
	.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
	.scope = PERF_PMU_SCOPE_CLUSTER,
	.module = THIS_MODULE,
};

@@ -810,9 +684,6 @@ static int __init cstate_probe(const struct cstate_model *cm)

static inline void cstate_cleanup(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE);
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING);

	if (has_cstate_core)
		perf_pmu_unregister(&cstate_core_pmu);

@@ -827,11 +698,6 @@ static int __init cstate_init(void)
{
	int err;

	cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING,
			  "perf/x86/cstate:starting", cstate_cpu_init, NULL);
	cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE,
			  "perf/x86/cstate:online", NULL, cstate_cpu_exit);

	if (has_cstate_core) {
		err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
		if (err) {
@@ -844,6 +710,8 @@ static int __init cstate_init(void)

	if (has_cstate_pkg) {
		if (topology_max_dies_per_package() > 1) {
			/* CLX-AP is multi-die and the cstate is die-scope */
			cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE;
			err = perf_pmu_register(&cstate_pkg_pmu,
						"cstate_die", -1);
		} else {
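
The cstate.c changes above illustrate the new PMU "scope" mechanism added by
this series ("perf: Generic hotplug support for a PMU with a scope"): instead
of each driver maintaining its own cpumask sysfs attribute and CPU hotplug
callbacks to pick a designated reader CPU, the driver declares the topology
scope of its PMU and the perf core handles reader selection and event
migration generically. A minimal sketch of the pattern follows; only .scope
and the PERF_PMU_SCOPE_* constants are taken from the diff above, everything
else (names, stub callbacks) is illustrative:

	#include <linux/module.h>
	#include <linux/perf_event.h>

	/* Illustrative no-op callbacks; a real driver programs hardware here. */
	static int  my_pmu_event_init(struct perf_event *event) { return -ENOENT; }
	static int  my_pmu_event_add(struct perf_event *event, int flags) { return 0; }
	static void my_pmu_event_del(struct perf_event *event, int flags) { }
	static void my_pmu_event_start(struct perf_event *event, int flags) { }
	static void my_pmu_event_stop(struct perf_event *event, int flags) { }
	static void my_pmu_event_read(struct perf_event *event) { }

	static struct pmu my_pkg_pmu = {
		.task_ctx_nr  = perf_invalid_context,
		.event_init   = my_pmu_event_init,
		.add          = my_pmu_event_add,
		.del          = my_pmu_event_del,
		.start        = my_pmu_event_start,
		.stop         = my_pmu_event_stop,
		.read         = my_pmu_event_read,
		.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
		/*
		 * Declaring a scope replaces the driver-private cpumask
		 * attribute and hotplug callbacks removed from cstate.c:
		 * the core picks one reader CPU per package and migrates
		 * events on hotplug.
		 */
		.scope        = PERF_PMU_SCOPE_PKG,
		.module       = THIS_MODULE,
	};

	static int __init my_pmu_init(void)
	{
		return perf_pmu_register(&my_pkg_pmu, "my_pkg_pmu", -1);
	}
	module_init(my_pmu_init);
	MODULE_LICENSE("GPL");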