mirror of https://github.com/torvalds/linux.git
perf/x86/intel: Process arch-PEBS records or record fragments
A significant difference from adaptive PEBS is that an arch-PEBS record supports fragments: a record can be split into several independent fragments, each carrying its own arch-PEBS header. This patch defines the architectural PEBS record layout structures and adds helpers to process arch-PEBS records or fragments. Only the legacy PEBS groups (basic, GPR, XMM and LBR) are supported in this patch; capture of the newly added YMM/ZMM/OPMASK vector registers will be supported in the future. Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://patch.msgid.link/20251029102136.61364-9-dapeng1.mi@linux.intel.com
This commit is contained in:
parent
167cde7dc9
commit
d21954c8a0
|
|
@ -3215,6 +3215,19 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
|
|||
status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Arch PEBS sets bit 54 in the global status register
|
||||
*/
|
||||
if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT,
|
||||
(unsigned long *)&status)) {
|
||||
handled++;
|
||||
static_call(x86_pmu_drain_pebs)(regs, &data);
|
||||
|
||||
if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] &&
|
||||
is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS]))
|
||||
status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Intel PT
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -2270,6 +2270,117 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
|||
format_group);
|
||||
}
|
||||
|
||||
static inline bool arch_pebs_record_continued(struct arch_pebs_header *header)
|
||||
{
|
||||
/* Continue bit or null PEBS record indicates fragment follows. */
|
||||
return header->cont || !(header->format & GENMASK_ULL(63, 16));
|
||||
}
|
||||
|
||||
/*
 * Translate one arch-PEBS record -- including all of its fragments --
 * into perf sample data for @event.
 *
 * @event:  the perf event this record is attributed to
 * @iregs:  interrupt-time registers, basis for the sample registers
 * @__pebs: pointer to the record's first fragment in the PEBS buffer
 * @data:   perf_sample_data to fill in
 * @regs:   scratch pt_regs, embedded in a struct x86_perf_regs
 *
 * Each fragment begins with its own struct arch_pebs_header; groups
 * flagged in the header are parsed sequentially in the order basic,
 * aux (meminfo), GPR, XMM, LBR.  Fragments are chained via the "again"
 * loop until a fragment without the continue/null indication is seen.
 * YMM/ZMM/OPMASK groups are not consumed here yet.
 */
static void setup_arch_pebs_sample_data(struct perf_event *event,
					struct pt_regs *iregs,
					void *__pebs,
					struct perf_sample_data *data,
					struct pt_regs *regs)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	u64 sample_type = event->attr.sample_type;
	struct arch_pebs_header *header = NULL;
	struct arch_pebs_aux *meminfo = NULL;
	struct arch_pebs_gprs *gprs = NULL;
	struct x86_perf_regs *perf_regs;
	void *next_record;
	void *at = __pebs;

	if (at == NULL)
		return;

	/* Reset XMM state; set again only if an XMM group is present. */
	perf_regs = container_of(regs, struct x86_perf_regs, regs);
	perf_regs->xmm_regs = NULL;

	__setup_perf_sample_data(event, iregs, data);

	*regs = *iregs;

again:
	/* Each fragment starts with its own header. */
	header = at;
	next_record = at + sizeof(struct arch_pebs_header);
	if (header->basic) {
		struct arch_pebs_basic *basic = next_record;
		u16 retire = 0;

		next_record = basic + 1;

		/* Retire latency is only meaningful when flagged valid. */
		if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
			retire = basic->valid ? basic->retire : 0;
		__setup_pebs_basic_group(event, regs, data, sample_type,
					 basic->ip, basic->tsc, retire);
	}

	/*
	 * The record for MEMINFO is in front of GP
	 * But PERF_SAMPLE_TRANSACTION needs gprs->ax.
	 * Save the pointer here but process later.
	 */
	if (header->aux) {
		meminfo = next_record;
		next_record = meminfo + 1;
	}

	if (header->gpr) {
		gprs = next_record;
		next_record = gprs + 1;

		/*
		 * arch_pebs_gprs is layout-compatible with pebs_gprs for
		 * the fields the legacy helper consumes.
		 */
		__setup_pebs_gpr_group(event, regs,
				       (struct pebs_gprs *)gprs,
				       sample_type);
	}

	/* Deferred meminfo processing, now that gprs->ax is available. */
	if (header->aux) {
		u64 ax = gprs ? gprs->ax : 0;

		__setup_pebs_meminfo_group(event, data, sample_type,
					   meminfo->cache_latency,
					   meminfo->instr_latency,
					   meminfo->address, meminfo->aux,
					   meminfo->tsx_tuning, ax);
	}

	if (header->xmm) {
		struct pebs_xmm *xmm;

		/* Skip the xstate header that precedes the XMM payload. */
		next_record += sizeof(struct arch_pebs_xer_header);

		xmm = next_record;
		perf_regs->xmm_regs = xmm->xmm;
		next_record = xmm + 1;
	}

	if (header->lbr) {
		struct arch_pebs_lbr_header *lbr_header = next_record;
		struct lbr_entry *lbr;
		int num_lbr;

		next_record = lbr_header + 1;
		lbr = next_record;

		/*
		 * The 2-bit lbr field either encodes a fixed multiple of
		 * ARCH_PEBS_BASE_LBR_ENTRIES or, for ARCH_PEBS_LBR_NUM_VAR,
		 * defers the entry count to the group header's depth field.
		 */
		num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ?
			  lbr_header->depth :
			  header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES;
		next_record += num_lbr * sizeof(struct lbr_entry);

		if (has_branch_stack(event)) {
			intel_pmu_store_pebs_lbrs(lbr);
			intel_pmu_lbr_save_brstack(data, cpuc, event);
		}
	}

	/* Parse subsequent fragments, if any. */
	if (arch_pebs_record_continued(header)) {
		at = at + header->size;
		goto again;
	}
}
|
||||
|
||||
static inline void *
|
||||
get_next_pebs_record_by_bit(void *base, void *top, int bit)
|
||||
{
|
||||
|
|
@ -2753,6 +2864,78 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
|
|||
setup_pebs_adaptive_sample_data);
|
||||
}
|
||||
|
||||
/*
 * Drain the arch-PEBS buffer: walk every record between the buffer base
 * and the hardware write pointer (read from MSR_IA32_PEBS_INDEX), hand
 * the records to the common PEBS handling helpers, and reset the write
 * pointer and full indication.
 *
 * @iregs: interrupt-time registers, or NULL outside PMI context (a
 *         dummy pt_regs is substituted in that case)
 * @data:  scratch perf_sample_data used for every record
 */
static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs,
				      struct perf_sample_data *data)
{
	/* Per-counter record counts and last-record pointers. */
	short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
	void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	union arch_pebs_index index;
	struct x86_perf_regs perf_regs;
	struct pt_regs *regs = &perf_regs.regs;
	void *base, *at, *top;
	u64 mask;

	rdmsrq(MSR_IA32_PEBS_INDEX, index.whole);

	/* Empty buffer: still fold in counter deltas, nothing to drain. */
	if (unlikely(!index.wr)) {
		intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
		return;
	}

	/* wr is the write offset in (1 << ARCH_PEBS_INDEX_WR_SHIFT)-byte units. */
	base = cpuc->ds_pebs_vaddr;
	top = (void *)((u64)cpuc->ds_pebs_vaddr +
		       (index.wr << ARCH_PEBS_INDEX_WR_SHIFT));

	/* Rewind the buffer and clear the full indication. */
	index.wr = 0;
	index.full = 0;
	wrmsrq(MSR_IA32_PEBS_INDEX, index.whole);

	/* Only counters that are both arch-PEBS capable and PEBS enabled. */
	mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled;

	if (!iregs)
		iregs = &dummy_iregs;

	/* Process all but the last event for each counter. */
	for (at = base; at < top;) {
		struct arch_pebs_header *header;
		struct arch_pebs_basic *basic;
		u64 pebs_status;

		header = at;

		/* A zero-sized header would make the walk loop forever. */
		if (WARN_ON_ONCE(!header->size))
			break;

		/* 1st fragment or single record must have basic group */
		if (!header->basic) {
			at += header->size;
			continue;
		}

		basic = at + sizeof(struct arch_pebs_header);
		pebs_status = mask & basic->applicable_counters;
		__intel_pmu_handle_pebs_record(iregs, regs, data, at,
					       pebs_status, counts, last,
					       setup_arch_pebs_sample_data);

		/* Skip non-last fragments */
		while (arch_pebs_record_continued(header)) {
			if (!header->size)
				break;
			at += header->size;
			header = at;
		}

		/* Skip last fragment or the single record */
		at += header->size;
	}

	/* The final record per counter feeds the counter-update path. */
	__intel_pmu_handle_last_pebs_record(iregs, regs, data, mask,
					    counts, last,
					    setup_arch_pebs_sample_data);
}
|
||||
|
||||
static void __init intel_arch_pebs_init(void)
|
||||
{
|
||||
/*
|
||||
|
|
@ -2762,6 +2945,7 @@ static void __init intel_arch_pebs_init(void)
|
|||
*/
|
||||
x86_pmu.arch_pebs = 1;
|
||||
x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
|
||||
x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs;
|
||||
x86_pmu.pebs_capable = ~0ULL;
|
||||
|
||||
x86_pmu.pebs_enable = __intel_pmu_pebs_enable;
|
||||
|
|
|
|||
|
|
@ -327,6 +327,12 @@
|
|||
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
|
||||
PERF_CAP_PEBS_TIMING_INFO)
|
||||
|
||||
/* Arch PEBS */
#define MSR_IA32_PEBS_BASE		0x000003f4	/* PEBS buffer base address */
#define MSR_IA32_PEBS_INDEX		0x000003f5	/* see union arch_pebs_index */
#define ARCH_PEBS_OFFSET_MASK		0x7fffff	/* 23-bit buffer offset mask */
#define ARCH_PEBS_INDEX_WR_SHIFT	4	/* wr field unit: 16 bytes */
|
||||
|
||||
#define MSR_IA32_RTIT_CTL 0x00000570
|
||||
#define RTIT_CTL_TRACEEN BIT(0)
|
||||
#define RTIT_CTL_CYCLEACC BIT(1)
|
||||
|
|
|
|||
|
|
@ -437,6 +437,8 @@ static inline bool is_topdown_idx(int idx)
|
|||
#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT)
|
||||
#define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55
|
||||
#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
|
||||
#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54
|
||||
#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT)
|
||||
#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48
|
||||
|
||||
#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48)
|
||||
|
|
@ -507,6 +509,100 @@ struct pebs_cntr_header {
|
|||
|
||||
#define INTEL_CNTR_METRICS 0x3
|
||||
|
||||
/*
|
||||
* Arch PEBS
|
||||
*/
|
||||
/*
 * Software image of MSR_IA32_PEBS_INDEX.  "wr" is the current buffer
 * write offset in (1 << ARCH_PEBS_INDEX_WR_SHIFT)-byte units; "full"
 * is cleared by the drain path together with "wr".
 */
union arch_pebs_index {
	struct {
		u64 rsvd:4,
		    wr:23,	/* write offset, see ARCH_PEBS_INDEX_WR_SHIFT */
		    rsvd2:4,
		    full:1,	/* buffer-full indication */
		    en:1,	/* enable bit -- presumably arch-PEBS enable; confirm against SDM */
		    rsvd3:3,
		    thresh:23,	/* PMI threshold offset -- TODO confirm units */
		    rsvd4:5;
	};
	u64 whole;
};
|
||||
|
||||
/*
 * Header of an arch-PEBS record or record fragment.  The low 16 bits
 * carry the total size in bytes of this fragment (header included);
 * the remaining bits flag which data groups follow the header.
 */
struct arch_pebs_header {
	union {
		u64 format;
		struct {
			u64 size:16,	/* Record size */
			    rsvd:14,
			    mode:1,	/* 64BIT_MODE */
			    cont:1,	/* another fragment follows */
			    rsvd2:3,
			    cntr:5,	/* counter group -- not parsed here yet */
			    lbr:2,	/* LBR group, see ARCH_PEBS_LBR_* encodings */
			    rsvd3:7,
			    xmm:1,	/* XMM register group present */
			    ymmh:1,	/* YMM-high group -- not consumed yet */
			    rsvd4:2,
			    opmask:1,	/* OPMASK group -- not consumed yet */
			    zmmh:1,	/* ZMM-high group -- not consumed yet */
			    h16zmm:1,	/* high-16 ZMM group -- not consumed yet */
			    rsvd5:5,
			    gpr:1,	/* GPR group present */
			    aux:1,	/* auxiliary (meminfo) group present */
			    basic:1;	/* basic group present */
		};
	};
	u64 rsvd6;
};
|
||||
|
||||
/*
 * Basic group: first group of a record's first fragment (the drain
 * path requires header->basic on the 1st fragment/single record).
 */
struct arch_pebs_basic {
	u64 ip;				/* sampled instruction pointer */
	u64 applicable_counters;	/* bitmap of counters this record applies to */
	u64 tsc;			/* timestamp of the record */
	u64 retire :16, /* Retire Latency */
	    valid :1,			/* retire latency field is valid */
	    rsvd :47;
	u64 rsvd2;
	u64 rsvd3;
};
|
||||
|
||||
/*
 * Auxiliary (memory info) group; fed verbatim into
 * __setup_pebs_meminfo_group() for data-source/latency samples.
 */
struct arch_pebs_aux {
	u64 address;		/* sampled memory address */
	u64 rsvd;
	u64 rsvd2;
	u64 rsvd3;
	u64 rsvd4;
	u64 aux;		/* auxiliary info word (data-source details -- see meminfo helper) */
	u64 instr_latency :16,	/* instruction latency */
	    pad2 :16,
	    cache_latency :16,	/* cache access latency */
	    pad3 :16;
	u64 tsx_tuning;		/* TSX tuning information */
};
|
||||
|
||||
/*
 * GPR group.  Layout-compatible with struct pebs_gprs for the leading
 * fields (the sample path casts to pebs_gprs), with ssp appended.
 */
struct arch_pebs_gprs {
	u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di;
	u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp;
	u64 rsvd;
};
|
||||
|
||||
/*
 * Header preceding an extended-register payload (e.g. the XMM group);
 * the sample parser skips over it before reading the registers.
 */
struct arch_pebs_xer_header {
	u64 xstate;	/* presumably an XSTATE component indication -- TODO confirm */
	u64 rsvd;
};
|
||||
|
||||
/*
 * Encodings of the 2-bit "lbr" header field: 0 means no LBR group,
 * 1/2 mean a fixed count of (value * ARCH_PEBS_BASE_LBR_ENTRIES)
 * entries, and ARCH_PEBS_LBR_NUM_VAR defers the entry count to
 * arch_pebs_lbr_header.depth.
 */
#define ARCH_PEBS_LBR_NAN		0x0
#define ARCH_PEBS_LBR_NUM_8		0x1
#define ARCH_PEBS_LBR_NUM_16		0x2
#define ARCH_PEBS_LBR_NUM_VAR		0x3
#define ARCH_PEBS_BASE_LBR_ENTRIES	8
/* LBR group header, followed by "depth" struct lbr_entry records. */
struct arch_pebs_lbr_header {
	u64 rsvd;
	u64 ctl;	/* NOTE(review): looks like an LBR control image -- confirm */
	u64 depth;	/* number of lbr_entry records that follow */
	u64 ler_from;	/* ler_* presumably mirror the LER (last event record) MSRs -- verify */
	u64 ler_to;
	u64 ler_info;
};
|
||||
|
||||
/*
|
||||
* AMD Extended Performance Monitoring and Debug cpuid feature detection
|
||||
*/
|
||||
|
|
|
|||
Loading…
Reference in New Issue