perf stat: Refactor retry/skip/fatal error handling

For the sake of Intel topdown events, commit 9eac5612da ("perf
stat: Don't skip failing group events") changed 'perf stat' error
handling so that more errors were fatal and events were no longer
reported as "<not supported>". The change in behavior outside of
topdown events was unintentional.

The notion of "fatal" error handling was introduced in commit
e0e6a6ca3a ("perf stat: Factor out open error handling") and
refined in commits like commit cb5ef60067 ("perf stat: Error out
unsupported group leader immediately") to be an approach for avoiding
later assertion failures in the code base.

This change fixes those issues and removes the notion of a fatal error
on an event. If all events fail to open, a fatal error still occurs
with the previous fatal error message. This best matches the notion of
supported events, allowing some errors not to stop 'perf stat' while
still letting the truly fatal no-event case terminate the tool early.
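
In abridged form (the full versions are in the builtin-stat.c hunks
below), each failed open now either retries or skips that event, and
only the nothing-opened case remains fatal:

	while (true) {
		if (create_perf_stat_counter(counter, &stat_config,
					     evlist_cpu_itr.cpu_map_idx) == 0)
			break;
		open_err = errno;
		if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
			break;	/* COUNTER_SKIP: leave the event unsupported */
		counter->supported = true;
	}
	...
	if (!has_supported_counters) {
		/* Nothing opened at all: the one remaining fatal case. */
		ui__error("No supported events found.\n%s\n", msg);
		err = -1;
		goto err_out;
	}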

The evsel->errored flag is only used in the stat code and always just
means !evsel->supported, although a comment describes it as sticky.
Remove it: force all evsels to be supported in evsel__init and clear
supported when evsel__open fails. When an open is retried, supported
is set to true again. This simplifies the notion of whether an evsel
is broken.
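
The resulting lifecycle of the evsel->supported flag, condensed from
the hunks below:

	/* evsel__init(): every event starts out supported. */
	evsel->supported = true;

	/* evsel__open_cpu(): any failed open clears the flag. */
	out:
		if (err)
			evsel->supported = false;

	/* __run_perf_stat(): set it again before an open is retried. */
	counter->supported = true;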

In get_group_fd(), fail to get a group fd when the evsel isn't
supported. If the leader isn't supported, it is also expected that
there is no group fd, as the leader will have been skipped. Therefore
change the BUG_ON() test to check supported rather than skippable.
This corrects the assertion failures that were the reason for the
previous fatal error handling.
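
Condensed from the util/evsel.c hunk below, the group fd lookup now
keys off supported rather than skippable:

	static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread)
	{
		...
		if (!evsel->supported || evsel__is_group_leader(evsel))
			return -1;
		...
		fd = FD(leader, cpu_map_idx, thread);
		BUG_ON(fd == -1 && leader->supported);	/* was: !leader->skippable */
		...
	}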

Fixes: 9eac5612da ("perf stat: Don't skip failing group events")
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Link: https://lore.kernel.org/r/20251002220727.1889799-2-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 4 files changed, 77 insertions(+), 91 deletions(-)

--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c

@@ -1408,8 +1408,6 @@ static int record__open(struct record *rec)
 			ui__error("%s\n", msg);
 			goto out;
 		}
-
-		pos->supported = true;
 	}
 
 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {

--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c

@@ -610,22 +610,13 @@ static int dispatch_events(bool forks, int timeout, int interval, int *times)
 enum counter_recovery {
 	COUNTER_SKIP,
 	COUNTER_RETRY,
-	COUNTER_FATAL,
 };
 
 static enum counter_recovery stat_handle_error(struct evsel *counter, int err)
 {
 	char msg[BUFSIZ];
 
-	if (counter->skippable) {
-		if (verbose > 0) {
-			ui__warning("skipping event %s that kernel failed to open .\n",
-				    evsel__name(counter));
-		}
-		counter->supported = false;
-		counter->errored = true;
-		return COUNTER_SKIP;
-	}
+	assert(!counter->supported);
 
 	/*
 	 * PPC returns ENXIO for HW counters until 2.6.37
@@ -636,19 +627,16 @@ static enum counter_recovery stat_handle_error(struct evsel *counter, int err)
 			ui__warning("%s event is not supported by the kernel.\n",
 				    evsel__name(counter));
 		}
-		counter->supported = false;
-		/*
-		 * errored is a sticky flag that means one of the counter's
-		 * cpu event had a problem and needs to be reexamined.
-		 */
-		counter->errored = true;
-	} else if (evsel__fallback(counter, &target, err, msg, sizeof(msg))) {
+		return COUNTER_SKIP;
+	}
+	if (evsel__fallback(counter, &target, err, msg, sizeof(msg))) {
 		if (verbose > 0)
 			ui__warning("%s\n", msg);
-		counter->supported = true;
 		return COUNTER_RETRY;
-	} else if (target__has_per_thread(&target) && err != EOPNOTSUPP &&
-		   evsel_list->core.threads &&
-		   evsel_list->core.threads->err_thread != -1) {
+	}
+	if (target__has_per_thread(&target) && err != EOPNOTSUPP &&
+	    evsel_list->core.threads && evsel_list->core.threads->err_thread != -1) {
 		/*
 		 * For global --per-thread case, skip current
 		 * error thread.
@@ -656,24 +644,17 @@ static enum counter_recovery stat_handle_error(struct evsel *counter, int err)
 		if (!thread_map__remove(evsel_list->core.threads,
 					evsel_list->core.threads->err_thread)) {
 			evsel_list->core.threads->err_thread = -1;
-			counter->supported = true;
 			return COUNTER_RETRY;
 		}
-	} else if (err == EOPNOTSUPP) {
-		if (verbose > 0) {
-			ui__warning("%s event is not supported by the kernel.\n",
-				    evsel__name(counter));
-		}
-		counter->supported = false;
-		counter->errored = true;
 	}
 
-	evsel__open_strerror(counter, &target, err, msg, sizeof(msg));
-	ui__error("%s\n", msg);
-
-	if (child_pid != -1)
-		kill(child_pid, SIGTERM);
-	return COUNTER_FATAL;
+	if (verbose > 0) {
+		ui__warning(err == EOPNOTSUPP
+			    ? "%s event is not supported by the kernel.\n"
+			    : "skipping event %s that kernel failed to open.\n",
+			    evsel__name(counter));
+	}
+	return COUNTER_SKIP;
 }
 
 static int create_perf_stat_counter(struct evsel *evsel,
@@ -746,8 +727,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
 	struct evlist_cpu_iterator evlist_cpu_itr;
 	struct affinity saved_affinity, *affinity = NULL;
-	int err;
-	bool second_pass = false;
+	int err, open_err = 0;
+	bool second_pass = false, has_supported_counters;
 
 	if (forks) {
 		if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe, workload_exec_failed_signal) < 0) {
@@ -787,14 +768,17 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		if (target.use_bpf)
 			break;
 
-		if (counter->reset_group || counter->errored)
+		if (counter->reset_group || !counter->supported)
 			continue;
 		if (evsel__is_bperf(counter))
 			continue;
 
-try_again:
-		if (create_perf_stat_counter(counter, &stat_config,
-					     evlist_cpu_itr.cpu_map_idx) < 0) {
+		while (true) {
+			if (create_perf_stat_counter(counter, &stat_config,
+						     evlist_cpu_itr.cpu_map_idx) == 0)
+				break;
+
+			open_err = errno;
 			/*
 			 * Weak group failed. We cannot just undo this here
 			 * because earlier CPUs might be in group mode, and the kernel
@@ -802,29 +786,19 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			 * it to later.
 			 * Don't close here because we're in the wrong affinity.
 			 */
-			if ((errno == EINVAL || errno == EBADF) &&
+			if ((open_err == EINVAL || open_err == EBADF) &&
 			    evsel__leader(counter) != counter &&
 			    counter->weak_group) {
 				evlist__reset_weak_group(evsel_list, counter, false);
 				assert(counter->reset_group);
-				counter->supported = true;
 				second_pass = true;
-				continue;
-			}
-
-			switch (stat_handle_error(counter, errno)) {
-			case COUNTER_FATAL:
-				err = -1;
-				goto err_out;
-			case COUNTER_RETRY:
-				goto try_again;
-			case COUNTER_SKIP:
-				continue;
-			default:
 				break;
 			}
+
+			if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
+				break;
+			counter->supported = true;
 		}
-		counter->supported = true;
 	}
 
 	if (second_pass) {
@@ -837,7 +811,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
 			counter = evlist_cpu_itr.evsel;
 
-			if (!counter->reset_group && !counter->errored)
+			if (!counter->reset_group && counter->supported)
 				continue;
 
 			perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx);
@@ -848,34 +822,29 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			if (!counter->reset_group)
 				continue;
 
-try_again_reset:
-			pr_debug2("reopening weak %s\n", evsel__name(counter));
-			if (create_perf_stat_counter(counter, &stat_config,
-						     evlist_cpu_itr.cpu_map_idx) < 0) {
-				switch (stat_handle_error(counter, errno)) {
-				case COUNTER_FATAL:
-					err = -1;
-					goto err_out;
-				case COUNTER_RETRY:
-					goto try_again_reset;
-				case COUNTER_SKIP:
-					continue;
-				default:
-					break;
-				}
+			while (true) {
+				pr_debug2("reopening weak %s\n", evsel__name(counter));
+				if (create_perf_stat_counter(counter, &stat_config,
+							     evlist_cpu_itr.cpu_map_idx) == 0)
+					break;
+				open_err = errno;
+				if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
+					break;
+				counter->supported = true;
 			}
-			counter->supported = true;
 		}
 	}
 
 	affinity__cleanup(affinity);
 	affinity = NULL;
 
+	has_supported_counters = false;
 	evlist__for_each_entry(evsel_list, counter) {
 		if (!counter->supported) {
 			perf_evsel__free_fd(&counter->core);
 			continue;
 		}
+		has_supported_counters = true;
 
 		l = strlen(counter->unit);
 		if (l > stat_config.unit_width)
@@ -887,6 +856,16 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			goto err_out;
 		}
 	}
 
+	if (!has_supported_counters) {
+		evsel__open_strerror(evlist__first(evsel_list), &target, open_err,
+				     msg, sizeof(msg));
+		ui__error("No supported events found.\n%s\n", msg);
+		if (child_pid != -1)
+			kill(child_pid, SIGTERM);
+		err = -1;
+		goto err_out;
+	}
+
 	if (evlist__apply_filters(evsel_list, &counter, &target)) {
 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",

--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c

@@ -407,6 +407,7 @@ void evsel__init(struct evsel *evsel,
 	evsel->collect_stat = false;
 	evsel->group_pmu_name = NULL;
 	evsel->skippable = false;
+	evsel->supported = true;
 	evsel->alternate_hw_config = PERF_COUNT_HW_MAX;
 	evsel->script_output_type = -1; // FIXME: OUTPUT_TYPE_UNSET, see builtin-script.c
 }
@@ -1941,7 +1942,7 @@ static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread)
 	struct evsel *leader = evsel__leader(evsel);
 	int fd;
 
-	if (evsel__is_group_leader(evsel))
+	if (!evsel->supported || evsel__is_group_leader(evsel))
 		return -1;
 
 	/*
@@ -1955,7 +1956,7 @@ static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread)
 		return -1;
 
 	fd = FD(leader, cpu_map_idx, thread);
-	BUG_ON(fd == -1 && !leader->skippable);
+	BUG_ON(fd == -1 && leader->supported);
 
 	/*
 	 * When the leader has been skipped, return -2 to distinguish from no
@@ -2573,12 +2574,14 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
 	enum rlimit_action set_rlimit = NO_CHANGE;
 	struct perf_cpu cpu;
 
-	if (evsel__is_retire_lat(evsel))
-		return evsel__tpebs_open(evsel);
+	if (evsel__is_retire_lat(evsel)) {
+		err = evsel__tpebs_open(evsel);
+		goto out;
+	}
 
 	err = __evsel__prepare_open(evsel, cpus, threads);
 	if (err)
-		return err;
+		goto out;
 
 	if (cpus == NULL)
 		cpus = empty_cpu_map;
@@ -2598,19 +2601,22 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
 	display_attr(&evsel->core.attr);
 
 	if (evsel__is_tool(evsel)) {
-		return evsel__tool_pmu_open(evsel, threads,
-					    start_cpu_map_idx,
-					    end_cpu_map_idx);
+		err = evsel__tool_pmu_open(evsel, threads,
+					   start_cpu_map_idx,
+					   end_cpu_map_idx);
+		goto out;
 	}
 	if (evsel__is_hwmon(evsel)) {
-		return evsel__hwmon_pmu_open(evsel, threads,
-					     start_cpu_map_idx,
-					     end_cpu_map_idx);
+		err = evsel__hwmon_pmu_open(evsel, threads,
+					    start_cpu_map_idx,
+					    end_cpu_map_idx);
+		goto out;
 	}
 	if (evsel__is_drm(evsel)) {
-		return evsel__drm_pmu_open(evsel, threads,
-					   start_cpu_map_idx,
-					   end_cpu_map_idx);
+		err = evsel__drm_pmu_open(evsel, threads,
+					  start_cpu_map_idx,
+					  end_cpu_map_idx);
+		goto out;
 	}
 
 	for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) {
@@ -2689,7 +2695,8 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
 		}
 	}
 
-	return 0;
+	err = 0;
+	goto out;
 
 try_fallback:
 	if (evsel__ignore_missing_thread(evsel, perf_cpu_map__nr(cpus),
@@ -2728,6 +2735,9 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
 		thread = nthreads;
 	} while (--idx >= 0);
 	errno = old_errno;
+
+out:
+	if (err)
+		evsel->supported = false;
 	return err;
 }

--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h

@@ -121,7 +121,6 @@ struct evsel {
 	bool			forced_leader;
 	bool			cmdline_group_boundary;
 	bool			reset_group;
-	bool			errored;
 	bool			needs_auxtrace_mmap;
 	bool			default_metricgroup; /* A member of the Default metricgroup */
 	bool			needs_uniquify;