From a9f16872642203761e0b6fa7c25ca4e286ab5083 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:15 -0400 Subject: [PATCH 01/42] tracing: Make trace_user_fault_read() exposed to rest of tracing The write to the trace_marker file is a critical section where it cannot take locks nor allocate memory. To read from user space, it allocates a per CPU buffer when the trace_marker file is opened, and then when the write system call is performed, it uses the following method to read from user space: preempt_disable(); buffer = per_cpu_ptr(cpu_buffers, cpu); do { cnt = nr_context_switches_cpu(); migrate_disable(); preempt_enable(); ret = copy_from_user(buffer, ptr, len); preempt_disable(); migrate_enable(); } while (!ret && cnt != nr_context_switches_cpu()); if (!ret) ring_buffer_write(buffer); preempt_enable(); It records the number of context switches for the current CPU, enables preemption, copies from user space, disable preemption and then checks if the number of context switches changed. If it did not, then the buffer is valid, otherwise the buffer may have been corrupted and the read from user space must be tried again. The system call trace events are now faultable and have the same restrictions as the trace_marker write. For system calls to read the user space buffer (for example to read the file of the openat system call), it needs the same logic. Instead of copying the code over to the system call trace events, make the code generic to allow the system call trace events to use the same code. The following API is added internally to the tracing sub system (these are only exposed within the tracing subsystem and not to be used outside of it): trace_user_fault_init() - initializes a trace_user_buf_info descriptor that will allocate the per CPU buffers to copy from user space into. trace_user_fault_destroy() - used to free the allocations made by trace_user_fault_init(). trace_user_fault_get() - update the ref count of the info descriptor to allow more than one user to use the same descriptor. trace_user_fault_put() - decrement the ref count. trace_user_fault_read() - performs the above action to read user space into the per CPU buffer. The preempt_disable() is expected before calling this function and preemption must remain disabled while the buffer returned is in use. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231147.096570057@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 256 ++++++++++++++++++++++++++++++++----------- kernel/trace/trace.h | 17 +++ 2 files changed, 208 insertions(+), 65 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1e527cf2aae..50832411c5c0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7223,52 +7223,43 @@ struct trace_user_buf { char *buf; }; -struct trace_user_buf_info { - struct trace_user_buf __percpu *tbuf; - int ref; -}; - - static DEFINE_MUTEX(trace_user_buffer_mutex); static struct trace_user_buf_info *trace_user_buffer; -static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo) +/** + * trace_user_fault_destroy - free up allocated memory of a trace user buffer + * @tinfo: The descriptor to free up + * + * Frees any data allocated in the trace info dsecriptor. 
+ */ +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo) { char *buf; int cpu; + if (!tinfo || !tinfo->tbuf) + return; + for_each_possible_cpu(cpu) { buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf; kfree(buf); } free_percpu(tinfo->tbuf); - kfree(tinfo); } -static int trace_user_fault_buffer_enable(void) +static int user_fault_buffer_enable(struct trace_user_buf_info *tinfo, size_t size) { - struct trace_user_buf_info *tinfo; char *buf; int cpu; - guard(mutex)(&trace_user_buffer_mutex); - - if (trace_user_buffer) { - trace_user_buffer->ref++; - return 0; - } - - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - return -ENOMEM; + lockdep_assert_held(&trace_user_buffer_mutex); tinfo->tbuf = alloc_percpu(struct trace_user_buf); - if (!tinfo->tbuf) { - kfree(tinfo); + if (!tinfo->tbuf) return -ENOMEM; - } tinfo->ref = 1; + tinfo->size = size; /* Clear each buffer in case of error */ for_each_possible_cpu(cpu) { @@ -7276,42 +7267,165 @@ static int trace_user_fault_buffer_enable(void) } for_each_possible_cpu(cpu) { - buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL, + buf = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); - if (!buf) { - trace_user_fault_buffer_free(tinfo); + if (!buf) return -ENOMEM; - } per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf; } - trace_user_buffer = tinfo; - return 0; } -static void trace_user_fault_buffer_disable(void) +/* For internal use. Free and reinitialize */ +static void user_buffer_free(struct trace_user_buf_info **tinfo) { - struct trace_user_buf_info *tinfo; + lockdep_assert_held(&trace_user_buffer_mutex); + + trace_user_fault_destroy(*tinfo); + kfree(*tinfo); + *tinfo = NULL; +} + +/* For internal use. Initialize and allocate */ +static int user_buffer_init(struct trace_user_buf_info **tinfo, size_t size) +{ + bool alloc = false; + int ret; + + lockdep_assert_held(&trace_user_buffer_mutex); + + if (!*tinfo) { + alloc = true; + *tinfo = kzalloc(sizeof(**tinfo), GFP_KERNEL); + if (!*tinfo) + return -ENOMEM; + } + + ret = user_fault_buffer_enable(*tinfo, size); + if (ret < 0 && alloc) + user_buffer_free(tinfo); + + return ret; +} + +/* For internal use, derefrence and free if necessary */ +static void user_buffer_put(struct trace_user_buf_info **tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!*tinfo || !(*tinfo)->ref)) + return; + + if (--(*tinfo)->ref) + return; + + user_buffer_free(tinfo); +} + +/** + * trace_user_fault_init - Allocated or reference a per CPU buffer + * @tinfo: A pointer to the trace buffer descriptor + * @size: The size to allocate each per CPU buffer + * + * Create a per CPU buffer that can be used to copy from user space + * in a task context. When calling trace_user_fault_read(), preemption + * must be disabled, and it will enable preemption and copy user + * space data to the buffer. If any schedule switches occur, it will + * retry until it succeeds without a schedule switch knowing the buffer + * is still valid. + * + * Returns 0 on success, negative on failure. 
+ */ +int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size) +{ + int ret; + + if (!tinfo) + return -EINVAL; guard(mutex)(&trace_user_buffer_mutex); - tinfo = trace_user_buffer; + ret = user_buffer_init(&tinfo, size); + if (ret < 0) + trace_user_fault_destroy(tinfo); - if (WARN_ON_ONCE(!tinfo)) - return; - - if (--tinfo->ref) - return; - - trace_user_fault_buffer_free(tinfo); - trace_user_buffer = NULL; + return ret; } -/* Must be called with preemption disabled */ -static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, - const char __user *ptr, size_t size, - size_t *read_size) +/** + * trace_user_fault_get - up the ref count for the user buffer + * @tinfo: A pointer to a pointer to the trace buffer descriptor + * + * Ups the ref count of the trace buffer. + * + * Returns the new ref count. + */ +int trace_user_fault_get(struct trace_user_buf_info *tinfo) +{ + if (!tinfo) + return -1; + + guard(mutex)(&trace_user_buffer_mutex); + + tinfo->ref++; + return tinfo->ref; +} + +/** + * trace_user_fault_put - dereference a per cpu trace buffer + * @tinfo: The @tinfo that was passed to trace_user_fault_get() + * + * Decrement the ref count of @tinfo. + * + * Returns the new refcount (negative on error). + */ +int trace_user_fault_put(struct trace_user_buf_info *tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!tinfo || !tinfo->ref)) + return -1; + + --tinfo->ref; + return tinfo->ref; +} + +/** + * trace_user_fault_read - Read user space into a per CPU buffer + * @tinfo: The @tinfo allocated by trace_user_fault_get() + * @ptr: The user space pointer to read + * @size: The size of user space to read. + * @copy_func: Optional function to use to copy from user space + * @data: Data to pass to copy_func if it was supplied + * + * Preemption must be disabled when this is called, and must not + * be enabled while using the returned buffer. + * This does the copying from user space into a per CPU buffer. + * + * The @size must not be greater than the size passed in to + * trace_user_fault_init(). + * + * If @copy_func is NULL, trace_user_fault_read() will use copy_from_user(), + * otherwise it will call @copy_func. It will call @copy_func with: + * + * buffer: the per CPU buffer of the @tinfo. + * ptr: The pointer @ptr to user space to read + * size: The @size of the ptr to read + * data: The @data parameter + * + * It is expected that @copy_func will return 0 on success and non zero + * if there was a fault. + * + * Returns a pointer to the buffer with the content read from @ptr. + * Preemption must remain disabled while the caller accesses the + * buffer returned by this function. + * Returns NULL if there was a fault, or the size passed in is + * greater than the size passed to trace_user_fault_init(). + */ +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data) { int cpu = smp_processor_id(); char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; @@ -7319,9 +7433,14 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, int trys = 0; int ret; - if (size > TRACE_MARKER_MAX_SIZE) - size = TRACE_MARKER_MAX_SIZE; - *read_size = 0; + lockdep_assert_preemption_disabled(); + + /* + * It's up to the caller to not try to copy more than it said + * it would. + */ + if (size > tinfo->size) + return NULL; /* * This acts similar to a seqcount. 
The per CPU context switches are @@ -7361,7 +7480,14 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, */ preempt_enable_notrace(); - ret = __copy_from_user(buffer, ptr, size); + /* Make sure preemption is enabled here */ + lockdep_assert_preemption_enabled(); + + if (copy_func) { + ret = copy_func(buffer, ptr, size, data); + } else { + ret = __copy_from_user(buffer, ptr, size); + } preempt_disable_notrace(); migrate_enable(); @@ -7378,7 +7504,6 @@ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, */ } while (nr_context_switches_cpu(cpu) != cnt); - *read_size = size; return buffer; } @@ -7389,7 +7514,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct trace_array *tr = filp->private_data; ssize_t written = -ENODEV; unsigned long ip; - size_t size; char *buf; if (tracing_disabled) @@ -7407,13 +7531,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, /* Must have preemption disabled while having access to the buffer */ guard(preempt_notrace)(); - buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); if (!buf) return -EFAULT; - if (cnt > size) - cnt = size; - /* The selftests expect this function to be the IP address */ ip = _THIS_IP_; @@ -7473,7 +7594,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; ssize_t written = -ENODEV; - size_t size; char *buf; if (tracing_disabled) @@ -7486,17 +7606,17 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, if (cnt < sizeof(unsigned int)) return -EINVAL; + /* raw write is all or nothing */ + if (cnt > TRACE_MARKER_MAX_SIZE) + return -EINVAL; + /* Must have preemption disabled while having access to the buffer */ guard(preempt_notrace)(); - buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); if (!buf) return -EFAULT; - /* raw write is all or nothing */ - if (cnt > size) - return -EINVAL; - /* The global trace_marker_raw can go to multiple instances */ if (tr == &global_trace) { guard(rcu)(); @@ -7516,20 +7636,26 @@ static int tracing_mark_open(struct inode *inode, struct file *filp) { int ret; - ret = trace_user_fault_buffer_enable(); - if (ret < 0) - return ret; + scoped_guard(mutex, &trace_user_buffer_mutex) { + if (!trace_user_buffer) { + ret = user_buffer_init(&trace_user_buffer, TRACE_MARKER_MAX_SIZE); + if (ret < 0) + return ret; + } else { + trace_user_buffer->ref++; + } + } stream_open(inode, filp); ret = tracing_open_generic_tr(inode, filp); if (ret < 0) - trace_user_fault_buffer_disable(); + user_buffer_put(&trace_user_buffer); return ret; } static int tracing_mark_release(struct inode *inode, struct file *file) { - trace_user_fault_buffer_disable(); + user_buffer_put(&trace_user_buffer); return tracing_release_generic_tr(inode, file); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 85eabb454bee..8439fe3058cc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1531,6 +1531,23 @@ void trace_buffered_event_enable(void); void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); +struct trace_user_buf; +struct trace_user_buf_info { + struct trace_user_buf __percpu *tbuf; + size_t size; + int ref; +}; + +typedef int (*trace_user_buf_copy)(char *dst, const char __user *src, + size_t size, void *data); +int trace_user_fault_init(struct trace_user_buf_info *tinfo, 
size_t size); +int trace_user_fault_get(struct trace_user_buf_info *tinfo); +int trace_user_fault_put(struct trace_user_buf_info *tinfo); +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo); +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data); + static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) From a544d9a66bdf20eb25cc40f99e1d09c825b71b26 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:16 -0400 Subject: [PATCH 02/42] tracing: Have syscall trace events read user space string As of commit 654ced4a1377 ("tracing: Introduce tracepoint_is_faultable()") system call trace events allow faulting in user space memory. Have some of the system call trace events take advantage of this. Use the trace_user_fault_read() logic to read the user space buffer from user space and instead of just saving the pointer to the buffer in the system call event, also save the string that is passed in. The syscall event has its nb_args shorten from an int to a short (where even u8 is plenty big enough) and the freed two bytes are used for "user_mask". The new "user_mask" field is used to store the index of the "args" field array that has the address to read from user space. This value is set to 0 if the system call event does not need to read user space for a field. This mask can be used to know if the event may fault or not. Only one bit set in user_mask is supported at this time. This allows the output to look like this: sys_access(filename: 0x7f8c55368470 "/etc/ld.so.preload", mode: 4) sys_execve(filename: 0x564ebcf5a6b8 "/usr/bin/emacs", argv: 0x7fff357c0300, envp: 0x564ebc4a4820) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231147.261867956@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/trace/syscall.h | 4 +- kernel/trace/trace_syscalls.c | 438 ++++++++++++++++++++++++++++++++-- 2 files changed, 421 insertions(+), 21 deletions(-) diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 8e193f3a33b3..85f21ca15a41 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -16,6 +16,7 @@ * @name: name of the syscall * @syscall_nr: number of the syscall * @nb_args: number of parameters it takes + * @user_mask: mask of @args that will read user space * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) * @enter_fields: list of fields for syscall_enter trace event @@ -25,7 +26,8 @@ struct syscall_metadata { const char *name; int syscall_nr; - int nb_args; + short nb_args; + short user_mask; const char **types; const char **args; struct list_head enter_fields; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0f932b22f9ec..528ac90eda5d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -123,6 +124,9 @@ const char *get_syscall_name(int syscall) return entry->name; } +/* Added to user strings when max limit is reached */ +#define EXTRA "..." 
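/*
 * [Illustrative sketch, not part of the patch] Roughly how a
 * tracing-internal user consumes the trace_user_fault_*() API added
 * by the previous patch. The example_* names and the 256-byte size
 * are hypothetical; the calls, the preemption rules, and the
 * put-then-destroy pattern mirror what tracing_mark_open() /
 * tracing_mark_write() and the syscall code below do.
 */
static struct trace_user_buf_info example_tinfo;

static int example_open(void)
{
	/* Allocates the per CPU buffers and sets the ref count to 1 */
	return trace_user_fault_init(&example_tinfo, 256);
}

static void example_trace(const char __user *uptr, size_t len)
{
	char *buf;

	/* The returned buffer is per CPU: preemption must stay disabled */
	guard(preempt_notrace)();

	/* A NULL copy_func means plain copy_from_user() semantics */
	buf = trace_user_fault_read(&example_tinfo, uptr, len, NULL, NULL);
	if (!buf)
		return;	/* faulted, or len > the size passed to init */

	/* ... record buf into the ring buffer while preemption is off ... */
}

static void example_close(void)
{
	/* Free the buffers only when the last reference is dropped */
	if (!trace_user_fault_put(&example_tinfo))
		trace_user_fault_destroy(&example_tinfo);
}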
+ static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -132,7 +136,9 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, syscall; + int i, syscall, val; + unsigned char *ptr; + int len; trace = (typeof(trace))ent; syscall = trace->nr; @@ -167,6 +173,19 @@ print_syscall_enter(struct trace_iterator *iter, int flags, else trace_seq_printf(s, "%s: 0x%lx", entry->args[i], trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + /* This arg points to a user space string */ + ptr = (void *)trace->args + sizeof(long) * entry->nb_args; + val = *(int *)ptr; + + /* The value is a dynamic string (len << 16 | offset) */ + ptr = (void *)ent + (val & 0xffff); + len = val >> 16; + + trace_seq_printf(s, " \"%.*s\"", len, ptr); } trace_seq_putc(s, ')'); @@ -223,15 +242,27 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", - entry->args[i], sizeof(unsigned long), - i == entry->nb_args - 1 ? "" : ", "); + if (i) + pos += snprintf(buf + pos, LEN_OR_ZERO, ", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx", + entry->args[i], sizeof(unsigned long)); + + if (!(BIT(i) & entry->user_mask)) + continue; + + /* Add the format for the user space string */ + pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { pos += snprintf(buf + pos, LEN_OR_ZERO, ", ((unsigned long)(REC->%s))", entry->args[i]); + if (!(BIT(i) & entry->user_mask)) + continue; + /* The user space string for arg has name ___val */ + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", + entry->args[i]); } #undef LEN_OR_ZERO @@ -277,8 +308,12 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; + unsigned long mask; + char *arg; int offset = offsetof(typeof(trace), args); + int idx; int ret = 0; + int len; int i; for (i = 0; i < meta->nb_args; i++) { @@ -291,9 +326,148 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) offset += sizeof(unsigned long); } + if (ret || !meta->user_mask) + return ret; + + mask = meta->user_mask; + idx = ffs(mask) - 1; + + /* + * User space strings are faulted into a temporary buffer and then + * added as a dynamic string to the end of the event. + * The user space string name for the arg pointer is "___val". 
+ */ + len = strlen(meta->args[idx]) + sizeof("___val"); + arg = kmalloc(len, GFP_KERNEL); + if (WARN_ON_ONCE(!arg)) { + meta->user_mask = 0; + return -ENOMEM; + } + + snprintf(arg, len, "__%s_val", meta->args[idx]); + + ret = trace_define_field(call, "__data_loc char[]", + arg, offset, sizeof(int), 0, + FILTER_OTHER); + if (ret) + kfree(arg); return ret; } +#define SYSCALL_FAULT_BUF_SZ 512 + +/* Use the tracing per CPU buffer infrastructure to copy from user space */ +struct syscall_user_buffer { + struct trace_user_buf_info buf; + struct rcu_head rcu; +}; + +static struct syscall_user_buffer *syscall_buffer; + +static int syscall_fault_buffer_enable(void) +{ + struct syscall_user_buffer *sbuf; + int ret; + + lockdep_assert_held(&syscall_trace_lock); + + if (syscall_buffer) { + trace_user_fault_get(&syscall_buffer->buf); + return 0; + } + + sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL); + if (!sbuf) + return -ENOMEM; + + ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ); + if (ret < 0) { + kfree(sbuf); + return ret; + } + + WRITE_ONCE(syscall_buffer, sbuf); + + return 0; +} + +static void rcu_free_syscall_buffer(struct rcu_head *rcu) +{ + struct syscall_user_buffer *sbuf = + container_of(rcu, struct syscall_user_buffer, rcu); + + trace_user_fault_destroy(&sbuf->buf); + kfree(sbuf); +} + + +static void syscall_fault_buffer_disable(void) +{ + struct syscall_user_buffer *sbuf = syscall_buffer; + + lockdep_assert_held(&syscall_trace_lock); + + if (trace_user_fault_put(&sbuf->buf)) + return; + + WRITE_ONCE(syscall_buffer, NULL); + call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer); +} + +static int syscall_copy_user(char *buf, const char __user *ptr, + size_t size, void *data) +{ + unsigned long *ret_size = data; + int ret; + + ret = strncpy_from_user(buf, ptr, size); + if (ret < 0) + return 1; + *ret_size = ret; + return 0; +} + +static char *sys_fault_user(struct syscall_metadata *sys_data, + struct syscall_user_buffer *sbuf, + unsigned long *args, unsigned int *data_size) +{ + unsigned long size = SYSCALL_FAULT_BUF_SZ - 1; + unsigned long mask = sys_data->user_mask; + int idx = ffs(mask) - 1; + char *ptr; + char *buf; + + /* Get the pointer to user space memory to read */ + ptr = (char *)args[idx]; + *data_size = 0; + + buf = trace_user_fault_read(&sbuf->buf, ptr, size, + syscall_copy_user, &size); + if (!buf) + return NULL; + + /* Replace any non-printable characters with '.' */ + for (int i = 0; i < size; i++) { + if (!isprint(buf[i])) + buf[i] = '.'; + } + + /* + * If the text was truncated due to our max limit, add "..." to + * the string. + */ + if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { + strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), + EXTRA, sizeof(EXTRA)); + size = SYSCALL_FAULT_BUF_SZ; + } else { + buf[size++] = '\0'; + } + + *data_size = size; + return buf; +} + static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -302,15 +476,17 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct trace_event_buffer fbuffer; unsigned long args[6]; + char *user_ptr; + int user_size = 0; int syscall_nr; - int size; + int size = 0; + bool mayfault; /* * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. 
*/ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -327,7 +503,32 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!sys_data) return; - size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + guard(preempt_notrace)(); + + syscall_get_arguments(current, regs, args); + + if (mayfault) { + struct syscall_user_buffer *sbuf; + + /* If the syscall_buffer is NULL, tracing is being shutdown */ + sbuf = READ_ONCE(syscall_buffer); + if (!sbuf) + return; + + user_ptr = sys_fault_user(sys_data, sbuf, args, &user_size); + /* + * user_size is the amount of data to append. + * Need to add 4 for the meta field that points to + * the user memory at the end of the event and also + * stores its size. + */ + size = 4 + user_size; + } + + size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) @@ -335,9 +536,36 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, args); + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) { + void *ptr; + int val; + + /* + * Set the pointer to point to the meta data of the event + * that has information about the stored user space memory. + */ + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; + + /* + * The meta data will store the offset of the user data from + * the beginning of the event. + */ + val = (ptr - (void *)entry) + 4; + + /* Store the offset and the size into the meta data */ + *(int *)ptr = val | (user_size << 16); + + /* Nothing to do if the user space was empty or faulted */ + if (user_size) { + /* Now store the user space data into the event */ + ptr += 4; + memcpy(ptr, user_ptr, user_size); + } + } + trace_event_buffer_commit(&fbuffer); } @@ -386,39 +614,50 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) static int reg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int ret = 0; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; - mutex_lock(&syscall_trace_lock); - if (!tr->sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter, tr); - if (!ret) { - WRITE_ONCE(tr->enter_syscall_files[num], file); - tr->sys_refcount_enter++; + guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + if (ret < 0) + return ret; } - mutex_unlock(&syscall_trace_lock); - return ret; + if (!tr->sys_refcount_enter) { + ret = register_trace_sys_enter(ftrace_syscall_enter, tr); + if (ret < 0) { + if (sys_data->user_mask) + syscall_fault_buffer_disable(); + return ret; + } + } + WRITE_ONCE(tr->enter_syscall_files[num], file); + tr->sys_refcount_enter++; + return 0; } static void unreg_event_syscall_enter(struct trace_event_file *file, struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; struct trace_array *tr = file->tr; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = 
sys_data->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); tr->sys_refcount_enter--; WRITE_ONCE(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); - mutex_unlock(&syscall_trace_lock); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int reg_event_syscall_exit(struct trace_event_file *file, @@ -459,6 +698,163 @@ static void unreg_event_syscall_exit(struct trace_event_file *file, mutex_unlock(&syscall_trace_lock); } +/* + * For system calls that reference user space memory that can + * be recorded into the event, set the system call meta data's user_mask + * to the "args" index that points to the user space memory to retrieve. + */ +static void check_faultable_syscall(struct trace_event_call *call, int nr) +{ + struct syscall_metadata *sys_data = call->data; + + /* Only work on entry */ + if (sys_data->enter_event != call) + return; + + switch (nr) { + /* user arg at position 0 */ +#ifdef __NR_access + case __NR_access: +#endif + case __NR_acct: + case __NR_add_key: /* Just _type. TODO add _description */ + case __NR_chdir: +#ifdef __NR_chown + case __NR_chown: +#endif +#ifdef __NR_chmod + case __NR_chmod: +#endif + case __NR_chroot: +#ifdef __NR_creat + case __NR_creat: +#endif + case __NR_delete_module: + case __NR_execve: + case __NR_fsopen: + case __NR_getxattr: /* Just pathname, TODO add name */ +#ifdef __NR_lchown + case __NR_lchown: +#endif + case __NR_lgetxattr: /* Just pathname, TODO add name */ + case __NR_lremovexattr: /* Just pathname, TODO add name */ +#ifdef __NR_link + case __NR_link: /* Just oldname. TODO add newname */ +#endif + case __NR_listxattr: /* Just pathname, TODO add list */ + case __NR_llistxattr: /* Just pathname, TODO add list */ + case __NR_lsetxattr: /* Just pathname, TODO add list */ +#ifdef __NR_open + case __NR_open: +#endif + case __NR_memfd_create: + case __NR_mount: /* Just dev_name, TODO add dir_name and type */ +#ifdef __NR_mkdir + case __NR_mkdir: +#endif +#ifdef __NR_mknod + case __NR_mknod: +#endif + case __NR_mq_open: + case __NR_mq_unlink: + case __NR_pivot_root: /* Just new_root, TODO add old_root */ +#ifdef __NR_readlink + case __NR_readlink: +#endif + case __NR_removexattr: /* Just pathname, TODO add name */ +#ifdef __NR_rename + case __NR_rename: /* Just oldname. TODO add newname */ +#endif + case __NR_request_key: /* Just _type. TODO add _description */ +#ifdef __NR_rmdir + case __NR_rmdir: +#endif + case __NR_setxattr: /* Just pathname, TODO add list */ + case __NR_shmdt: +#ifdef __NR_statfs + case __NR_statfs: +#endif + case __NR_swapon: + case __NR_swapoff: +#ifdef __NR_symlink + case __NR_symlink: /* Just oldname. 
TODO add newname */ +#endif +#ifdef __NR_truncate + case __NR_truncate: +#endif +#ifdef __NR_unlink + case __NR_unlink: +#endif + case __NR_umount2: +#ifdef __NR_utime + case __NR_utime: +#endif +#ifdef __NR_utimes + case __NR_utimes: +#endif + sys_data->user_mask = BIT(0); + break; + /* user arg at position 1 */ + case __NR_execveat: + case __NR_faccessat: + case __NR_faccessat2: + case __NR_finit_module: + case __NR_fchmodat: + case __NR_fchmodat2: + case __NR_fchownat: + case __NR_fgetxattr: + case __NR_flistxattr: + case __NR_fsetxattr: + case __NR_fspick: + case __NR_fremovexattr: +#ifdef __NR_futimesat + case __NR_futimesat: +#endif + case __NR_getxattrat: /* Just pathname, TODO add name */ + case __NR_inotify_add_watch: + case __NR_linkat: /* Just oldname. TODO add newname */ + case __NR_listxattrat: /* Just pathname, TODO add list */ + case __NR_mkdirat: + case __NR_mknodat: + case __NR_mount_setattr: + case __NR_move_mount: /* Just from_pathname, TODO add to_pathname */ + case __NR_name_to_handle_at: +#ifdef __NR_newfstatat + case __NR_newfstatat: +#endif + case __NR_openat: + case __NR_openat2: + case __NR_open_tree: + case __NR_open_tree_attr: + case __NR_readlinkat: +#ifdef __NR_renameat + case __NR_renameat: /* Just oldname. TODO add newname */ +#endif + case __NR_renameat2: /* Just oldname. TODO add newname */ + case __NR_removexattrat: /* Just pathname, TODO add name */ + case __NR_quotactl: + case __NR_setxattrat: /* Just pathname, TODO add list */ + case __NR_syslog: + case __NR_symlinkat: /* Just oldname. TODO add newname */ + case __NR_statx: + case __NR_unlinkat: + case __NR_utimensat: + sys_data->user_mask = BIT(1); + break; + /* user arg at position 2 */ + case __NR_init_module: + case __NR_fsconfig: + sys_data->user_mask = BIT(2); + break; + /* user arg at position 4 */ + case __NR_fanotify_mark: + sys_data->user_mask = BIT(4); + break; + default: + sys_data->user_mask = 0; + } +} + static int __init init_syscall_trace(struct trace_event_call *call) { int id; @@ -471,6 +867,8 @@ static int __init init_syscall_trace(struct trace_event_call *call) return -ENOSYS; } + check_faultable_syscall(call, num); + if (set_syscall_print_fmt(call) < 0) return -ENOMEM; From bd1b80fba71a54b1369967e52d249877f1a2b86d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:17 -0400 Subject: [PATCH 03/42] perf: tracing: Simplify perf_sysenter_enable/disable() with guards Use guard(mutex)(&syscall_trace_lock) for perf_sysenter_enable() and perf_sysenter_disable() as well as for the perf_sysexit_enable() and perf_sysexit_disable(). This will make it easier to update these functions with other code that has early exit handling. 
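With guard(mutex)(), the lock is bound to the enclosing scope and is
released automatically on every return path, so the explicit
mutex_unlock() calls, and the if/else structure needed to reach them,
go away. A condensed sketch of the resulting shape, abridged from the
diff below:

	guard(mutex)(&syscall_trace_lock);
	if (!sys_perf_refcount_enter) {
		int ret = register_trace_sys_enter(perf_syscall_enter, NULL);
		if (ret)
			return ret;	/* the guard drops the mutex here */
	}
	set_bit(num, enabled_perf_enter_syscalls);
	sys_perf_refcount_enter++;
	return 0;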
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231147.429583335@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_syscalls.c | 48 ++++++++++++++++------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 528ac90eda5d..42d066d8c0ab 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1049,22 +1049,21 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) static int perf_sysenter_enable(struct trace_event_call *call) { - int ret = 0; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_enter) - ret = register_trace_sys_enter(perf_syscall_enter, NULL); - if (ret) { - pr_info("event trace: Could not activate syscall entry trace point"); - } else { - set_bit(num, enabled_perf_enter_syscalls); - sys_perf_refcount_enter++; + guard(mutex)(&syscall_trace_lock); + if (!sys_perf_refcount_enter) { + int ret = register_trace_sys_enter(perf_syscall_enter, NULL); + if (ret) { + pr_info("event trace: Could not activate syscall entry trace point"); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; + return 0; } static void perf_sysenter_disable(struct trace_event_call *call) @@ -1073,12 +1072,11 @@ static void perf_sysenter_disable(struct trace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); sys_perf_refcount_enter--; clear_bit(num, enabled_perf_enter_syscalls); if (!sys_perf_refcount_enter) unregister_trace_sys_enter(perf_syscall_enter, NULL); - mutex_unlock(&syscall_trace_lock); } static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, @@ -1155,22 +1153,21 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) static int perf_sysexit_enable(struct trace_event_call *call) { - int ret = 0; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_exit) - ret = register_trace_sys_exit(perf_syscall_exit, NULL); - if (ret) { - pr_info("event trace: Could not activate syscall exit trace point"); - } else { - set_bit(num, enabled_perf_exit_syscalls); - sys_perf_refcount_exit++; + guard(mutex)(&syscall_trace_lock); + if (!sys_perf_refcount_exit) { + int ret = register_trace_sys_exit(perf_syscall_exit, NULL); + if (ret) { + pr_info("event trace: Could not activate syscall exit trace point"); + return ret; + } } - mutex_unlock(&syscall_trace_lock); - return ret; + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; + return 0; } static void perf_sysexit_disable(struct trace_event_call *call) @@ -1179,12 +1176,11 @@ static void perf_sysexit_disable(struct trace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; - mutex_lock(&syscall_trace_lock); + guard(mutex)(&syscall_trace_lock); sys_perf_refcount_exit--; clear_bit(num, enabled_perf_exit_syscalls); if (!sys_perf_refcount_exit) unregister_trace_sys_exit(perf_syscall_exit, NULL); - 
mutex_unlock(&syscall_trace_lock); } #endif /* CONFIG_PERF_EVENTS */ From 2e82e256df1961ecb031fbd7ee28e95a5dc87003 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:18 -0400 Subject: [PATCH 04/42] perf: tracing: Have perf system calls read user space Allow some of the system call events to read user space buffers. Instead of just showing the pointer into user space, allow perf events to also record the content of those pointers. For example: # perf record -e syscalls:sys_enter_openat ls /usr/bin [..] # perf script ls 1024 [005] 52.902721: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbae321c "/etc/ld.so.cache", flags: 0x00080000, mode: 0x00000000 ls 1024 [005] 52.902899: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaae140 "/lib/x86_64-linux-gnu/libselinux.so.1", flags: 0x00080000, mode: 0x00000000 ls 1024 [005] 52.903471: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaae690 "/lib/x86_64-linux-gnu/libcap.so.2", flags: 0x00080000, mode: 0x00000000 ls 1024 [005] 52.903946: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaaebe0 "/lib/x86_64-linux-gnu/libc.so.6", flags: 0x00080000, mode: 0x00000000 ls 1024 [005] 52.904629: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaaf110 "/lib/x86_64-linux-gnu/libpcre2-8.so.0", flags: 0x00080000, mode: 0x00000000 ls 1024 [005] 52.906985: syscalls:sys_enter_openat: dfd: 0xffffffffffffff9c, filename: 0x7fc1dba92904 "/proc/filesystems", flags: 0x00080000, mode: 0x00000000 ls 1024 [005] 52.907323: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dba19490 "/usr/lib/locale/locale-archive", flags: 0x00080000, mode: 0x00000000 ls 1024 [005] 52.907746: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x556fb888dcd0 "/usr/bin", flags: 0x00090800, mode: 0x00000000 Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231147.593925979@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_syscalls.c | 136 ++++++++++++++++++++++------------ 1 file changed, 90 insertions(+), 46 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 42d066d8c0ab..ed9332f8bdf8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -468,6 +468,58 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, return buf; } +static int +syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, + char **buffer, int *size, int *user_size) +{ + struct syscall_user_buffer *sbuf; + + /* If the syscall_buffer is NULL, tracing is being shutdown */ + sbuf = READ_ONCE(syscall_buffer); + if (!sbuf) + return -1; + + *buffer = sys_fault_user(sys_data, sbuf, args, user_size); + /* + * user_size is the amount of data to append. + * Need to add 4 for the meta field that points to + * the user memory at the end of the event and also + * stores its size. + */ + *size = 4 + *user_size; + return 0; +} + +static void syscall_put_data(struct syscall_metadata *sys_data, + struct syscall_trace_enter *entry, + char *buffer, int size) +{ + void *ptr; + int val; + + /* + * Set the pointer to point to the meta data of the event + * that has information about the stored user space memory. 
+ */ + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; + + /* + * The meta data will store the offset of the user data from + * the beginning of the event. + */ + val = (ptr - (void *)entry) + 4; + + /* Store the offset and the size into the meta data */ + *(int *)ptr = val | (size << 16); + + /* Nothing to do if the user space was empty or faulted */ + if (size) { + /* Now store the user space data into the event */ + ptr += 4; + memcpy(ptr, buffer, size); + } +} + static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -511,21 +563,9 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, args); if (mayfault) { - struct syscall_user_buffer *sbuf; - - /* If the syscall_buffer is NULL, tracing is being shutdown */ - sbuf = READ_ONCE(syscall_buffer); - if (!sbuf) + if (syscall_get_data(sys_data, args, &user_ptr, + &size, &user_size) < 0) return; - - user_ptr = sys_fault_user(sys_data, sbuf, args, &user_size); - /* - * user_size is the amount of data to append. - * Need to add 4 for the meta field that points to - * the user memory at the end of the event and also - * stores its size. - */ - size = 4 + user_size; } size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; @@ -539,32 +579,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); - if (mayfault) { - void *ptr; - int val; - - /* - * Set the pointer to point to the meta data of the event - * that has information about the stored user space memory. - */ - ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; - - /* - * The meta data will store the offset of the user data from - * the beginning of the event. 
- */ - val = (ptr - (void *)entry) + 4; - - /* Store the offset and the size into the meta data */ - *(int *)ptr = val | (user_size << 16); - - /* Nothing to do if the user space was empty or faulted */ - if (user_size) { - /* Now store the user space data into the event */ - ptr += 4; - memcpy(ptr, user_ptr, user_size); - } - } + if (mayfault) + syscall_put_data(sys_data, entry, user_ptr, user_size); trace_event_buffer_commit(&fbuffer); } @@ -996,9 +1012,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; + bool mayfault; + char *user_ptr; int syscall_nr; + int user_size; int rctx; - int size; + int size = 0; /* * Syscall probe called with preemption enabled, but the ring @@ -1017,13 +1036,24 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + syscall_get_arguments(current, regs, args); + + /* Check if this syscall event faults in user space memory */ + mayfault = sys_data->user_mask != 0; + + if (mayfault) { + if (syscall_get_data(sys_data, args, &user_ptr, + &size, &user_size) < 0) + return; + } + head = this_cpu_ptr(sys_data->enter_event->perf_events); valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ - size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1032,9 +1062,11 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; rec->nr = syscall_nr; - syscall_get_arguments(current, regs, args); memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); + if (mayfault) + syscall_put_data(sys_data, rec, user_ptr, user_size); + if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || hlist_empty(head)) { @@ -1049,15 +1081,24 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) static int perf_sysenter_enable(struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; int num; + int ret; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; guard(mutex)(&syscall_trace_lock); + if (sys_data->user_mask) { + ret = syscall_fault_buffer_enable(); + if (ret < 0) + return ret; + } if (!sys_perf_refcount_enter) { - int ret = register_trace_sys_enter(perf_syscall_enter, NULL); + ret = register_trace_sys_enter(perf_syscall_enter, NULL); if (ret) { pr_info("event trace: Could not activate syscall entry trace point"); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); return ret; } } @@ -1068,15 +1109,18 @@ static int perf_sysenter_enable(struct trace_event_call *call) static void perf_sysenter_disable(struct trace_event_call *call) { + struct syscall_metadata *sys_data = call->data; int num; - num = ((struct syscall_metadata *)call->data)->syscall_nr; + num = sys_data->syscall_nr; guard(mutex)(&syscall_trace_lock); sys_perf_refcount_enter--; clear_bit(num, enabled_perf_enter_syscalls); if (!sys_perf_refcount_enter) unregister_trace_sys_enter(perf_syscall_enter, NULL); + if (sys_data->user_mask) + syscall_fault_buffer_disable(); } static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, From b4f7624cfc9422209b844793521c60edb289fb69 Mon Sep 17 00:00:00 2001 From: Steven 
Rostedt Date: Tue, 28 Oct 2025 19:11:19 -0400 Subject: [PATCH 05/42] tracing: Have system call events record user array data For system call events that have a length field, add a "user_arg_size" parameter to the system call meta data that denotes the index of the args array that holds the size of arg that the user_mask field has a bit set for. The "user_mask" has a bit set that denotes the arg that points to an array in the user space address space and if a system call event has the user_mask field set and the user_arg_size set, it will then record the content of that address into the trace event, up to the size defined by SYSCALL_FAULT_BUF_SZ - 1. This allows the output to look like: sys_write(fd: 0xa, buf: 0x5646978d13c0 (01:00:05:00:00:00:00:00:01:87:55:89:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00), count: 0x20) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231147.763528474@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/trace/syscall.h | 4 +- kernel/trace/trace_syscalls.c | 121 ++++++++++++++++++++++++---------- 2 files changed, 90 insertions(+), 35 deletions(-) diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 85f21ca15a41..9413c139da66 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -16,6 +16,7 @@ * @name: name of the syscall * @syscall_nr: number of the syscall * @nb_args: number of parameters it takes + * @user_arg_size: holds @arg that has size of the user space to read * @user_mask: mask of @args that will read user space * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) @@ -26,7 +27,8 @@ struct syscall_metadata { const char *name; int syscall_nr; - short nb_args; + u8 nb_args; + s8 user_arg_size; short user_mask; const char **types; const char **args; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ed9332f8bdf8..3f3cdfc9958e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -124,7 +124,7 @@ const char *get_syscall_name(int syscall) return entry->name; } -/* Added to user strings when max limit is reached */ +/* Added to user strings or arrays when max limit is reached */ #define EXTRA "..." 
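/*
 * [Illustrative aside, not part of the patch] Layout of a sys_enter
 * event that carries user space data, as built by syscall_put_data()
 * and decoded by print_syscall_enter():
 *
 *	struct trace_entry ent;
 *	int		nr;			// syscall number
 *	unsigned long	args[nb_args];		// raw syscall arguments
 *	int		meta;			// (len << 16) | offset
 *	char		data[len];		// copied user space bytes
 *
 * "offset" is the byte offset of data[] from the start of the event
 * (the meta word's own offset plus 4) and "len" is the number of
 * bytes copied, so the decode is:
 *
 *	val = *(int *)((void *)trace->args + sizeof(long) * nb_args);
 *	ptr = (void *)ent + (val & 0xffff);	// -> data[]
 *	len = val >> 16;			// bytes copied
 */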
static enum print_line_t @@ -136,9 +136,8 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, syscall, val; + int i, syscall, val, len; unsigned char *ptr; - int len; trace = (typeof(trace))ent; syscall = trace->nr; @@ -185,7 +184,23 @@ print_syscall_enter(struct trace_iterator *iter, int flags, ptr = (void *)ent + (val & 0xffff); len = val >> 16; - trace_seq_printf(s, " \"%.*s\"", len, ptr); + if (entry->user_arg_size < 0) { + trace_seq_printf(s, " \"%.*s\"", len, ptr); + continue; + } + + val = trace->args[entry->user_arg_size]; + + trace_seq_puts(s, " ("); + for (int x = 0; x < len; x++, ptr++) { + if (x) + trace_seq_putc(s, ':'); + trace_seq_printf(s, "%02x", *ptr); + } + if (len < val) + trace_seq_printf(s, ", %s", EXTRA); + + trace_seq_putc(s, ')'); } trace_seq_putc(s, ')'); @@ -250,8 +265,11 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) if (!(BIT(i) & entry->user_mask)) continue; - /* Add the format for the user space string */ - pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); + /* Add the format for the user space string or array */ + if (entry->user_arg_size < 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)"); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); @@ -260,9 +278,14 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) ", ((unsigned long)(REC->%s))", entry->args[i]); if (!(BIT(i) & entry->user_mask)) continue; - /* The user space string for arg has name ___val */ - pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", - entry->args[i]); + /* The user space data for arg has name ___val */ + if (entry->user_arg_size < 0) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", + entry->args[i]); + } else { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)", + entry->args[i]); + } } #undef LEN_OR_ZERO @@ -333,9 +356,9 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) idx = ffs(mask) - 1; /* - * User space strings are faulted into a temporary buffer and then - * added as a dynamic string to the end of the event. - * The user space string name for the arg pointer is "___val". + * User space data is faulted into a temporary buffer and then + * added as a dynamic string or array to the end of the event. + * The user space data name for the arg pointer is "___val". */ len = strlen(meta->args[idx]) + sizeof("___val"); arg = kmalloc(len, GFP_KERNEL); @@ -431,9 +454,11 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, struct syscall_user_buffer *sbuf, unsigned long *args, unsigned int *data_size) { + trace_user_buf_copy syscall_copy = syscall_copy_user; unsigned long size = SYSCALL_FAULT_BUF_SZ - 1; unsigned long mask = sys_data->user_mask; int idx = ffs(mask) - 1; + bool array = false; char *ptr; char *buf; @@ -441,27 +466,43 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, ptr = (char *)args[idx]; *data_size = 0; + /* + * If this system call event has a size argument, use + * it to define how much of user space memory to read, + * and read it as an array and not a string. 
+ */ + if (sys_data->user_arg_size >= 0) { + array = true; + size = args[sys_data->user_arg_size]; + if (size > SYSCALL_FAULT_BUF_SZ - 1) + size = SYSCALL_FAULT_BUF_SZ - 1; + /* use normal copy_from_user() */ + syscall_copy = NULL; + } + buf = trace_user_fault_read(&sbuf->buf, ptr, size, - syscall_copy_user, &size); + syscall_copy, &size); if (!buf) return NULL; - /* Replace any non-printable characters with '.' */ - for (int i = 0; i < size; i++) { - if (!isprint(buf[i])) - buf[i] = '.'; - } + /* For strings, replace any non-printable characters with '.' */ + if (!array) { + for (int i = 0; i < size; i++) { + if (!isprint(buf[i])) + buf[i] = '.'; + } - /* - * If the text was truncated due to our max limit, add "..." to - * the string. - */ - if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { - strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), - EXTRA, sizeof(EXTRA)); - size = SYSCALL_FAULT_BUF_SZ; - } else { - buf[size++] = '\0'; + /* + * If the text was truncated due to our max limit, add "..." to + * the string. + */ + if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { + strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), + EXTRA, sizeof(EXTRA)); + size = SYSCALL_FAULT_BUF_SZ; + } else { + buf[size++] = '\0'; + } } *data_size = size; @@ -492,7 +533,7 @@ syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, static void syscall_put_data(struct syscall_metadata *sys_data, struct syscall_trace_enter *entry, - char *buffer, int size) + char *buffer, int size, int user_size) { void *ptr; int val; @@ -510,13 +551,16 @@ static void syscall_put_data(struct syscall_metadata *sys_data, val = (ptr - (void *)entry) + 4; /* Store the offset and the size into the meta data */ - *(int *)ptr = val | (size << 16); + *(int *)ptr = val | (user_size << 16); + + if (WARN_ON_ONCE((ptr - (void *)entry + user_size) > size)) + user_size = 0; /* Nothing to do if the user space was empty or faulted */ - if (size) { + if (user_size) { /* Now store the user space data into the event */ ptr += 4; - memcpy(ptr, buffer, size); + memcpy(ptr, buffer, user_size); } } @@ -580,7 +624,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); if (mayfault) - syscall_put_data(sys_data, entry, user_ptr, user_size); + syscall_put_data(sys_data, entry, user_ptr, size, user_size); trace_event_buffer_commit(&fbuffer); } @@ -727,7 +771,16 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) if (sys_data->enter_event != call) return; + sys_data->user_arg_size = -1; + switch (nr) { + /* user arg 1 with size arg at 2 */ + case __NR_write: + case __NR_mq_timedsend: + case __NR_pwrite64: + sys_data->user_mask = BIT(1); + sys_data->user_arg_size = 2; + break; /* user arg at position 0 */ #ifdef __NR_access case __NR_access: @@ -1065,7 +1118,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); if (mayfault) - syscall_put_data(sys_data, rec, user_ptr, user_size); + syscall_put_data(sys_data, rec, user_ptr, size, user_size); if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || From 011ea0501daaba36c06910fd383cf7428ea45844 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:20 -0400 Subject: [PATCH 06/42] tracing: Display some syscall arrays as strings Some of the system calls that read a fixed length of memory from the user space address are 
not arrays but strings. Take a bit away from the nb_args field in the
syscall metadata to use as a flag to denote that the argument sized by
the system call's user_arg_size is a string. nb_args should never be
more than 6, so 7 bits is plenty to hold that number. Add a
user_arg_is_str flag that, when set, causes the data read from the user
space address to be displayed as a string and not as an array.

This will allow the output to look like this:

 sys_sethostname(name: 0x5584310eb2a0 "debian", len: 6)

Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Namhyung Kim
Cc: Takaya Saeki
Cc: Tom Zanussi
Cc: Thomas Gleixner
Cc: Ian Rogers
Cc: Douglas Raillard
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Adrian Hunter
Cc: Ingo Molnar
Link: https://lore.kernel.org/20251028231147.930550359@kernel.org
Signed-off-by: Steven Rostedt (Google)
---
 include/trace/syscall.h       |  4 +++-
 kernel/trace/trace_syscalls.c | 22 +++++++++++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 9413c139da66..0dd7f2b33431 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -16,6 +16,7 @@
 * @name: name of the syscall
 * @syscall_nr: number of the syscall
 * @nb_args: number of parameters it takes
+ * @user_arg_is_str: set if the arg for @user_arg_size is a string
 * @user_arg_size: holds @arg that has size of the user space to read
 * @user_mask: mask of @args that will read user space
 * @types: list of types as strings
@@ -27,7 +28,8 @@
 struct syscall_metadata {
 	const char	*name;
 	int		syscall_nr;
-	u8		nb_args;
+	u8		nb_args:7;
+	u8		user_arg_is_str:1;
 	s8		user_arg_size;
 	short		user_mask;
 	const char	**types;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 3f3cdfc9958e..b8e9774a8abd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -184,7 +184,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		ptr = (void *)ent + (val & 0xffff);
 		len = val >> 16;

-		if (entry->user_arg_size < 0) {
+		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
 			trace_seq_printf(s, " \"%.*s\"", len, ptr);
 			continue;
 		}
@@ -249,6 +249,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
 static int __init
 __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 {
+	bool is_string = entry->user_arg_is_str;
 	int i;
 	int pos = 0;

@@ -266,7 +267,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 			continue;

 		/* Add the format for the user space string or array */
-		if (entry->user_arg_size < 0)
+		if (entry->user_arg_size < 0 || is_string)
 			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
 		else
 			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
@@ -279,7 +280,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 		if (!(BIT(i) & entry->user_mask))
 			continue;
 		/* The user space data for arg has name ___val */
-		if (entry->user_arg_size < 0) {
+		if (entry->user_arg_size < 0 || is_string) {
 			pos += snprintf(buf + pos, LEN_OR_ZERO,
 					", __get_str(__%s_val)", entry->args[i]);
 		} else {
@@ -781,6 +782,21 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr)
 		sys_data->user_mask = BIT(1);
 		sys_data->user_arg_size = 2;
 		break;
+	/* user arg 0 with size arg at 1 as string */
+	case __NR_setdomainname:
+	case __NR_sethostname:
+		sys_data->user_mask = BIT(0);
+		sys_data->user_arg_size = 1;
+		sys_data->user_arg_is_str = 1;
+		break;
+#ifdef __NR_kexec_file_load
+	/* user arg 4 with size arg at 3 as string */
+	case __NR_kexec_file_load:
+		sys_data->user_mask = BIT(4);
+		sys_data->user_arg_size = 3;
+		sys_data->user_arg_is_str = 1;
+		break;
+#endif
 	/* user arg at position 0 */
#ifdef __NR_access
 	case __NR_access:

From baa031b7bd2ce7502339174a42974321859ecd6a Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 28 Oct 2025 19:11:21 -0400
Subject: [PATCH 07/42] tracing: Allow syscall trace events to read more than
 one user parameter

Allow more than one field of a syscall trace event to read user space.
Build on top of the user_mask by allowing more than one bit to be set,
where each bit corresponds to an entry in the @args array of the
syscall metadata. Each argument in the @args array that is to be read
will have a dynamic array/string field associated with it.

Note that reading multiple fields from user space is not supported if
the user_arg_size field is set in the syscall metadata. That field can
only be used when a single field is read from user space, as it is the
index of the syscall argument that holds the size of the data to read
from user space. It becomes ambiguous if the system call reads more
than one field. Currently this is not an issue. If a syscall event
happens to mark two fields to be read from user space and also sets the
user_arg_size field, it will trigger a warning at boot and the
user_arg_size field will be cleared.

The per CPU buffer that is used to read the user space addresses is now
broken up into 3 sections, each of 168 bytes. 168 is the largest
8-byte-aligned size that fits three times into the 512 byte buffer. The
max amount copied into the ring buffer from user space is now only 128
bytes, which is plenty. When reading user space, it still reads 167
(168-1) bytes and uses the remainder to know whether or not the "..."
marker should be appended to the end.
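As a rough sketch of that sizing (an editor's illustration, not code
from the patch; the assertion strings are made up), the constants
introduced below relate as follows:

  /* 512-byte per-CPU buffer carved into three 8-byte-aligned slots */
  #define SYSCALL_FAULT_BUF_SZ	512
  #define SYSCALL_FAULT_ARG_SZ	168	/* 512 / 3 = 170, rounded down to a multiple of 8 */
  #define SYSCALL_FAULT_USER_MAX	128	/* the most that reaches the ring buffer */

  /* The three slots must fit in the buffer, and the slack above
   * USER_MAX must hold the "..." marker (EXTRA) with its nul byte. */
  _Static_assert(3 * SYSCALL_FAULT_ARG_SZ <= SYSCALL_FAULT_BUF_SZ, "slots");
  _Static_assert(SYSCALL_FAULT_USER_MAX + sizeof("...") < SYSCALL_FAULT_ARG_SZ, "EXTRA");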
This will allow the event to look like this: sys_renameat2(olddfd: 0xffffff9c, oldname: 0x7ffe02facdff "/tmp/x", newdfd: 0xffffff9c, newname: 0x7ffe02face06 "/tmp/y", flags: 1) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231148.095789277@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_syscalls.c | 339 +++++++++++++++++++++++----------- 1 file changed, 230 insertions(+), 109 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b8e9774a8abd..3eafe1b8f53e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -138,6 +138,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct syscall_metadata *entry; int i, syscall, val, len; unsigned char *ptr; + int offset = 0; trace = (typeof(trace))ent; syscall = trace->nr; @@ -177,12 +178,13 @@ print_syscall_enter(struct trace_iterator *iter, int flags, continue; /* This arg points to a user space string */ - ptr = (void *)trace->args + sizeof(long) * entry->nb_args; + ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset; val = *(int *)ptr; /* The value is a dynamic string (len << 16 | offset) */ ptr = (void *)ent + (val & 0xffff); len = val >> 16; + offset += 4; if (entry->user_arg_size < 0 || entry->user_arg_is_str) { trace_seq_printf(s, " \"%.*s\"", len, ptr); @@ -335,7 +337,6 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) unsigned long mask; char *arg; int offset = offsetof(typeof(trace), args); - int idx; int ret = 0; int len; int i; @@ -354,31 +355,56 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) return ret; mask = meta->user_mask; - idx = ffs(mask) - 1; - /* - * User space data is faulted into a temporary buffer and then - * added as a dynamic string or array to the end of the event. - * The user space data name for the arg pointer is "___val". - */ - len = strlen(meta->args[idx]) + sizeof("___val"); - arg = kmalloc(len, GFP_KERNEL); - if (WARN_ON_ONCE(!arg)) { - meta->user_mask = 0; - return -ENOMEM; + while (mask) { + int idx = ffs(mask) - 1; + mask &= ~BIT(idx); + + /* + * User space data is faulted into a temporary buffer and then + * added as a dynamic string or array to the end of the event. + * The user space data name for the arg pointer is + * "___val". + */ + len = strlen(meta->args[idx]) + sizeof("___val"); + arg = kmalloc(len, GFP_KERNEL); + if (WARN_ON_ONCE(!arg)) { + meta->user_mask = 0; + return -ENOMEM; + } + + snprintf(arg, len, "__%s_val", meta->args[idx]); + + ret = trace_define_field(call, "__data_loc char[]", + arg, offset, sizeof(int), 0, + FILTER_OTHER); + if (ret) { + kfree(arg); + break; + } + offset += 4; } - - snprintf(arg, len, "__%s_val", meta->args[idx]); - - ret = trace_define_field(call, "__data_loc char[]", - arg, offset, sizeof(int), 0, - FILTER_OTHER); - if (ret) - kfree(arg); return ret; } +/* + * Create a per CPU temporary buffer to copy user space pointers into. + * + * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use + * to copy memory from user space addresses into. + * + * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space. + * + * SYSCALL_FAULT_USER_MAX is the amount to copy into the ring buffer. 
+ * It's slightly smaller than SYSCALL_FAULT_ARG_SZ to know if it + * needs to append the EXTRA or not. + * + * This only allows up to 3 args from system calls. + */ #define SYSCALL_FAULT_BUF_SZ 512 +#define SYSCALL_FAULT_ARG_SZ 168 +#define SYSCALL_FAULT_USER_MAX 128 +#define SYSCALL_FAULT_MAX_CNT 3 /* Use the tracing per CPU buffer infrastructure to copy from user space */ struct syscall_user_buffer { @@ -438,34 +464,58 @@ static void syscall_fault_buffer_disable(void) call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer); } +struct syscall_args { + char *ptr_array[SYSCALL_FAULT_MAX_CNT]; + int read[SYSCALL_FAULT_MAX_CNT]; + int uargs; +}; + static int syscall_copy_user(char *buf, const char __user *ptr, size_t size, void *data) { - unsigned long *ret_size = data; + struct syscall_args *args = data; int ret; - ret = strncpy_from_user(buf, ptr, size); - if (ret < 0) - return 1; - *ret_size = ret; + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + ptr = (char __user *)args->ptr_array[i]; + ret = strncpy_from_user(buf, ptr, size); + args->read[i] = ret; + } + return 0; +} + +static int syscall_copy_user_array(char *buf, const char __user *ptr, + size_t size, void *data) +{ + struct syscall_args *args = data; + int ret; + + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + ptr = (char __user *)args->ptr_array[i]; + ret = __copy_from_user(buf, ptr, size); + args->read[i] = ret ? -1 : size; + } return 0; } static char *sys_fault_user(struct syscall_metadata *sys_data, struct syscall_user_buffer *sbuf, - unsigned long *args, unsigned int *data_size) + unsigned long *args, + unsigned int data_size[SYSCALL_FAULT_MAX_CNT]) { trace_user_buf_copy syscall_copy = syscall_copy_user; - unsigned long size = SYSCALL_FAULT_BUF_SZ - 1; unsigned long mask = sys_data->user_mask; - int idx = ffs(mask) - 1; + unsigned long size = SYSCALL_FAULT_ARG_SZ - 1; + struct syscall_args sargs; bool array = false; - char *ptr; + char *buffer; char *buf; + int ret; + int i = 0; - /* Get the pointer to user space memory to read */ - ptr = (char *)args[idx]; - *data_size = 0; + /* The extra is appended to the user data in the buffer */ + BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >= + SYSCALL_FAULT_ARG_SZ); /* * If this system call event has a size argument, use @@ -475,67 +525,103 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, if (sys_data->user_arg_size >= 0) { array = true; size = args[sys_data->user_arg_size]; - if (size > SYSCALL_FAULT_BUF_SZ - 1) - size = SYSCALL_FAULT_BUF_SZ - 1; - /* use normal copy_from_user() */ - syscall_copy = NULL; + if (size > SYSCALL_FAULT_ARG_SZ - 1) + size = SYSCALL_FAULT_ARG_SZ - 1; + syscall_copy = syscall_copy_user_array; } - buf = trace_user_fault_read(&sbuf->buf, ptr, size, - syscall_copy, &size); - if (!buf) + while (mask) { + int idx = ffs(mask) - 1; + mask &= ~BIT(idx); + + if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT)) + break; + + /* Get the pointer to user space memory to read */ + sargs.ptr_array[i++] = (char *)args[idx]; + } + + sargs.uargs = i; + + /* Clear the values that are not used */ + for (; i < SYSCALL_FAULT_MAX_CNT; i++) { + data_size[i] = -1; /* Denotes no pointer */ + } + + buffer = trace_user_fault_read(&sbuf->buf, NULL, size, + syscall_copy, &sargs); + if (!buffer) return NULL; - /* For strings, replace any non-printable characters with '.' 
*/ - if (!array) { - for (int i = 0; i < size; i++) { - if (!isprint(buf[i])) - buf[i] = '.'; - } + buf = buffer; + for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { - /* - * If the text was truncated due to our max limit, add "..." to - * the string. - */ - if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { - strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), - EXTRA, sizeof(EXTRA)); - size = SYSCALL_FAULT_BUF_SZ; + ret = sargs.read[i]; + if (ret < 0) + continue; + buf[ret] = '\0'; + + /* For strings, replace any non-printable characters with '.' */ + if (!array) { + for (int x = 0; x < ret; x++) { + if (!isprint(buf[x])) + buf[x] = '.'; + } + + /* + * If the text was truncated due to our max limit, + * add "..." to the string. + */ + if (ret > SYSCALL_FAULT_USER_MAX) { + strscpy(buf + SYSCALL_FAULT_USER_MAX, EXTRA, + sizeof(EXTRA)); + ret = SYSCALL_FAULT_USER_MAX + sizeof(EXTRA); + } else { + buf[ret++] = '\0'; + } } else { - buf[size++] = '\0'; + ret = min(ret, SYSCALL_FAULT_USER_MAX); } + data_size[i] = ret; } - *data_size = size; - return buf; + return buffer; } static int syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, - char **buffer, int *size, int *user_size) + char **buffer, int *size, int *user_sizes, int *uargs) { struct syscall_user_buffer *sbuf; + int i; /* If the syscall_buffer is NULL, tracing is being shutdown */ sbuf = READ_ONCE(syscall_buffer); if (!sbuf) return -1; - *buffer = sys_fault_user(sys_data, sbuf, args, user_size); + *buffer = sys_fault_user(sys_data, sbuf, args, user_sizes); /* * user_size is the amount of data to append. * Need to add 4 for the meta field that points to * the user memory at the end of the event and also * stores its size. */ - *size = 4 + *user_size; + for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) { + if (user_sizes[i] < 0) + break; + *size += user_sizes[i] + 4; + } + /* Save the number of user read arguments of this syscall */ + *uargs = i; return 0; } static void syscall_put_data(struct syscall_metadata *sys_data, struct syscall_trace_enter *entry, - char *buffer, int size, int user_size) + char *buffer, int size, int *user_sizes, int uargs) { + char *buf = buffer; void *ptr; int val; @@ -547,21 +633,30 @@ static void syscall_put_data(struct syscall_metadata *sys_data, /* * The meta data will store the offset of the user data from - * the beginning of the event. + * the beginning of the event. That is after the static arguments + * and the meta data fields. 
*/ - val = (ptr - (void *)entry) + 4; + val = (ptr - (void *)entry) + 4 * uargs; - /* Store the offset and the size into the meta data */ - *(int *)ptr = val | (user_size << 16); + for (int i = 0; i < uargs; i++) { - if (WARN_ON_ONCE((ptr - (void *)entry + user_size) > size)) - user_size = 0; + if (i) + val += user_sizes[i - 1]; - /* Nothing to do if the user space was empty or faulted */ - if (user_size) { - /* Now store the user space data into the event */ + /* Store the offset and the size into the meta data */ + *(int *)ptr = val | (user_sizes[i] << 16); + + /* Skip the meta data */ ptr += 4; - memcpy(ptr, buffer, user_size); + } + + for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { + /* Nothing to do if the user space was empty or faulted */ + if (!user_sizes[i]) + continue; + + memcpy(ptr, buf, user_sizes[i]); + ptr += user_sizes[i]; } } @@ -574,9 +669,10 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct trace_event_buffer fbuffer; unsigned long args[6]; char *user_ptr; - int user_size = 0; + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; int syscall_nr; int size = 0; + int uargs = 0; bool mayfault; /* @@ -609,7 +705,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (mayfault) { if (syscall_get_data(sys_data, args, &user_ptr, - &size, &user_size) < 0) + &size, user_sizes, &uargs) < 0) return; } @@ -625,7 +721,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); if (mayfault) - syscall_put_data(sys_data, entry, user_ptr, size, user_size); + syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs); trace_event_buffer_commit(&fbuffer); } @@ -767,6 +863,7 @@ static void unreg_event_syscall_exit(struct trace_event_file *file, static void check_faultable_syscall(struct trace_event_call *call, int nr) { struct syscall_metadata *sys_data = call->data; + unsigned long mask; /* Only work on entry */ if (sys_data->enter_event != call) @@ -802,7 +899,6 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) case __NR_access: #endif case __NR_acct: - case __NR_add_key: /* Just _type. TODO add _description */ case __NR_chdir: #ifdef __NR_chown case __NR_chown: @@ -817,23 +913,13 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) case __NR_delete_module: case __NR_execve: case __NR_fsopen: - case __NR_getxattr: /* Just pathname, TODO add name */ #ifdef __NR_lchown case __NR_lchown: #endif - case __NR_lgetxattr: /* Just pathname, TODO add name */ - case __NR_lremovexattr: /* Just pathname, TODO add name */ -#ifdef __NR_link - case __NR_link: /* Just oldname. TODO add newname */ -#endif - case __NR_listxattr: /* Just pathname, TODO add list */ - case __NR_llistxattr: /* Just pathname, TODO add list */ - case __NR_lsetxattr: /* Just pathname, TODO add list */ #ifdef __NR_open case __NR_open: #endif case __NR_memfd_create: - case __NR_mount: /* Just dev_name, TODO add dir_name and type */ #ifdef __NR_mkdir case __NR_mkdir: #endif @@ -842,28 +928,18 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) #endif case __NR_mq_open: case __NR_mq_unlink: - case __NR_pivot_root: /* Just new_root, TODO add old_root */ #ifdef __NR_readlink case __NR_readlink: #endif - case __NR_removexattr: /* Just pathname, TODO add name */ -#ifdef __NR_rename - case __NR_rename: /* Just oldname. TODO add newname */ -#endif - case __NR_request_key: /* Just _type. 
TODO add _description */ #ifdef __NR_rmdir case __NR_rmdir: #endif - case __NR_setxattr: /* Just pathname, TODO add list */ case __NR_shmdt: #ifdef __NR_statfs case __NR_statfs: #endif case __NR_swapon: case __NR_swapoff: -#ifdef __NR_symlink - case __NR_symlink: /* Just oldname. TODO add newname */ -#endif #ifdef __NR_truncate case __NR_truncate: #endif @@ -895,14 +971,10 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) #ifdef __NR_futimesat case __NR_futimesat: #endif - case __NR_getxattrat: /* Just pathname, TODO add name */ case __NR_inotify_add_watch: - case __NR_linkat: /* Just oldname. TODO add newname */ - case __NR_listxattrat: /* Just pathname, TODO add list */ case __NR_mkdirat: case __NR_mknodat: case __NR_mount_setattr: - case __NR_move_mount: /* Just from_pathname, TODO add to_pathname */ case __NR_name_to_handle_at: #ifdef __NR_newfstatat case __NR_newfstatat: @@ -912,15 +984,8 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) case __NR_open_tree: case __NR_open_tree_attr: case __NR_readlinkat: -#ifdef __NR_renameat - case __NR_renameat: /* Just oldname. TODO add newname */ -#endif - case __NR_renameat2: /* Just oldname. TODO add newname */ - case __NR_removexattrat: /* Just pathname, TODO add name */ case __NR_quotactl: - case __NR_setxattrat: /* Just pathname, TODO add list */ case __NR_syslog: - case __NR_symlinkat: /* Just oldname. TODO add newname */ case __NR_statx: case __NR_unlinkat: case __NR_utimensat: @@ -935,9 +1000,64 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) case __NR_fanotify_mark: sys_data->user_mask = BIT(4); break; + /* 2 user args, 0 and 1 */ + case __NR_add_key: + case __NR_getxattr: + case __NR_lgetxattr: + case __NR_lremovexattr: +#ifdef __NR_link + case __NR_link: +#endif + case __NR_listxattr: + case __NR_llistxattr: + case __NR_lsetxattr: + case __NR_pivot_root: + case __NR_removexattr: +#ifdef __NR_rename + case __NR_rename: +#endif + case __NR_request_key: + case __NR_setxattr: +#ifdef __NR_symlink + case __NR_symlink: +#endif + sys_data->user_mask = BIT(0) | BIT(1); + break; + /* 2 user args, 0 and 2 */ + case __NR_symlinkat: + sys_data->user_mask = BIT(0) | BIT(2); + break; + /* 2 user args, 1 and 3 */ + case __NR_getxattrat: + case __NR_linkat: + case __NR_listxattrat: + case __NR_move_mount: +#ifdef __NR_renameat + case __NR_renameat: +#endif + case __NR_renameat2: + case __NR_removexattrat: + case __NR_setxattrat: + sys_data->user_mask = BIT(1) | BIT(3); + break; + case __NR_mount: /* Just dev_name and dir_name, TODO add type */ + sys_data->user_mask = BIT(0) | BIT(1) | BIT(2); + break; default: sys_data->user_mask = 0; + return; } + + if (sys_data->user_arg_size < 0) + return; + + /* + * The user_arg_size can only be used when the system call + * is reading only a single address from user space. 
+	 */
+	mask = sys_data->user_mask;
+	if (WARN_ON(mask & (mask - 1)))
+		sys_data->user_arg_size = -1;
 }

 static int __init init_syscall_trace(struct trace_event_call *call)
@@ -1083,10 +1203,11 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	bool valid_prog_array;
 	bool mayfault;
 	char *user_ptr;
+	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
 	int syscall_nr;
-	int user_size;
 	int rctx;
 	int size = 0;
+	int uargs = 0;

 	/*
 	 * Syscall probe called with preemption enabled, but the ring
@@ -1112,7 +1233,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (mayfault) {
 		if (syscall_get_data(sys_data, args, &user_ptr,
-				     &size, &user_size) < 0)
+				     &size, user_sizes, &uargs) < 0)
 			return;
 	}
@@ -1134,7 +1255,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);

 	if (mayfault)
-		syscall_put_data(sys_data, rec, user_ptr, size, user_size);
+		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);

 	if ((valid_prog_array &&
 	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||

From 299ea67e6a2b3d0d4b707f45b8c66d8b4bbbf2c6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 28 Oct 2025 19:11:22 -0400
Subject: [PATCH 08/42] tracing: Add a config and syscall_user_buf_size file
 to limit amount written

When a system call event copies data from a user space address into the
ring buffer, it can copy up to 511 bytes. This can waste precious ring
buffer space if the user isn't interested in the output.

Add a new file "syscall_user_buf_size" that gets initialized to a new
config CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT that defaults to 63. The
config is also used to limit how much perf can read from user space.

Also lower the max down to 165, as the point isn't to record everything
that a system call may be passing through to the kernel; 165 is more
than enough. The reason for 165 is that adding one byte for the nul
terminator, plus possibly appending the "..." string, turns it into 170
bytes. As up to 3 arguments may need to be saved, and 3 * 170 is 510,
this fits nicely into 512 bytes (a power of 2).

Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Namhyung Kim
Cc: Takaya Saeki
Cc: Tom Zanussi
Cc: Thomas Gleixner
Cc: Ian Rogers
Cc: Douglas Raillard
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Adrian Hunter
Cc: Ingo Molnar
Link: https://lore.kernel.org/20251028231148.260068913@kernel.org
Signed-off-by: Steven Rostedt (Google)
---
 Documentation/trace/ftrace.rst |  8 ++++++
 kernel/trace/Kconfig           | 14 +++++++++
 kernel/trace/trace.c           | 52 ++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h           |  3 ++
 kernel/trace/trace_syscalls.c  | 50 ++++++++++++++++++--------------
 5 files changed, 105 insertions(+), 22 deletions(-)

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index aef674df3afd..d1f313a5f4ad 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -366,6 +366,14 @@ of ftrace. Here is a list of some of the key files:
	for each function. The displayed address is the patch-site
	address and can differ from /proc/kallsyms address.

+  syscall_user_buf_size:
+
+	Some system call trace events will record the data from a user
+	space address that one of the parameters points to. The amount of
+	data per event is limited. This file holds the max number of bytes
+	that will be recorded into the ring buffer to hold this data.
+ The max value is currently 165. + dyn_ftrace_total_info: This file is for debugging purposes. The number of functions that diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d2c79da81e4f..99283b2dcfd6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -575,6 +575,20 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. +config TRACE_SYSCALL_BUF_SIZE_DEFAULT + int "System call user read max size" + range 0 165 + default 63 + depends on FTRACE_SYSCALLS + help + Some system call trace events will record the data from a user + space address that one of the parameters point to. The amount of + data per event is limited. That limit is set by this config and + this config also affects how much user space data perf can read. + + For a tracing instance, this size may be changed by writing into + its syscall_user_buf_size file. + config TRACER_SNAPSHOT bool "Create a snapshot trace buffer" select TRACER_MAX_TRACE diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 50832411c5c0..2aee9a3088f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6911,6 +6911,43 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, goto out; } +static ssize_t +tracing_syscall_buf_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + char buf[64]; + int r; + + r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_syscall_buf_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > SYSCALL_FAULT_USER_MAX) + val = SYSCALL_FAULT_USER_MAX; + + tr->syscall_buf_sz = val; + + *ppos += cnt; + + return cnt; +} + static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -8043,6 +8080,14 @@ static const struct file_operations tracing_entries_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_syscall_buf_fops = { + .open = tracing_open_generic_tr, + .read = tracing_syscall_buf_read, + .write = tracing_syscall_buf_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + static const struct file_operations tracing_buffer_meta_fops = { .open = tracing_buffer_meta_open, .read = seq_read, @@ -10145,6 +10190,8 @@ trace_array_create_systems(const char *name, const char *systems, raw_spin_lock_init(&tr->start_lock); + tr->syscall_buf_sz = global_trace.syscall_buf_sz; + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE spin_lock_init(&tr->snapshot_trigger_lock); @@ -10461,6 +10508,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &buffer_subbuf_size_fops); + trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_syscall_buf_fops); + create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE @@ -11386,6 +11436,8 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + global_trace.syscall_buf_sz = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; + INIT_LIST_HEAD(&global_trace.systems); 
INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8439fe3058cc..d5cb4bc6cd2e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -131,6 +131,8 @@ enum trace_type { #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 +#define SYSCALL_FAULT_USER_MAX 165 + /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h @@ -430,6 +432,7 @@ struct trace_array { int function_enabled; #endif int no_filter_buffering_ref; + unsigned int syscall_buf_sz; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 3eafe1b8f53e..a2de6364777a 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -390,21 +390,19 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) /* * Create a per CPU temporary buffer to copy user space pointers into. * + * SYSCALL_FAULT_USER_MAX is the amount to copy from user space. + * (defined in kernel/trace/trace.h) + + * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the + * nul terminating byte and possibly appended EXTRA (4 bytes). + * * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use - * to copy memory from user space addresses into. - * - * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space. - * - * SYSCALL_FAULT_USER_MAX is the amount to copy into the ring buffer. - * It's slightly smaller than SYSCALL_FAULT_ARG_SZ to know if it - * needs to append the EXTRA or not. - * - * This only allows up to 3 args from system calls. + * to copy memory from user space addresses into that will hold + * 3 args as only 3 args are allowed to be copied from system calls. */ -#define SYSCALL_FAULT_BUF_SZ 512 -#define SYSCALL_FAULT_ARG_SZ 168 -#define SYSCALL_FAULT_USER_MAX 128 +#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4) #define SYSCALL_FAULT_MAX_CNT 3 +#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT) /* Use the tracing per CPU buffer infrastructure to copy from user space */ struct syscall_user_buffer { @@ -498,7 +496,8 @@ static int syscall_copy_user_array(char *buf, const char __user *ptr, return 0; } -static char *sys_fault_user(struct syscall_metadata *sys_data, +static char *sys_fault_user(unsigned int buf_size, + struct syscall_metadata *sys_data, struct syscall_user_buffer *sbuf, unsigned long *args, unsigned int data_size[SYSCALL_FAULT_MAX_CNT]) @@ -548,6 +547,10 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, data_size[i] = -1; /* Denotes no pointer */ } + /* A zero size means do not even try */ + if (!buf_size) + return NULL; + buffer = trace_user_fault_read(&sbuf->buf, NULL, size, syscall_copy, &sargs); if (!buffer) @@ -568,19 +571,20 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, buf[x] = '.'; } + size = min(buf_size, SYSCALL_FAULT_USER_MAX); + /* * If the text was truncated due to our max limit, * add "..." to the string. 
*/ - if (ret > SYSCALL_FAULT_USER_MAX) { - strscpy(buf + SYSCALL_FAULT_USER_MAX, EXTRA, - sizeof(EXTRA)); - ret = SYSCALL_FAULT_USER_MAX + sizeof(EXTRA); + if (ret > size) { + strscpy(buf + size, EXTRA, sizeof(EXTRA)); + ret = size + sizeof(EXTRA); } else { buf[ret++] = '\0'; } } else { - ret = min(ret, SYSCALL_FAULT_USER_MAX); + ret = min((unsigned int)ret, buf_size); } data_size[i] = ret; } @@ -590,7 +594,8 @@ static char *sys_fault_user(struct syscall_metadata *sys_data, static int syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, - char **buffer, int *size, int *user_sizes, int *uargs) + char **buffer, int *size, int *user_sizes, int *uargs, + int buf_size) { struct syscall_user_buffer *sbuf; int i; @@ -600,7 +605,7 @@ syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, if (!sbuf) return -1; - *buffer = sys_fault_user(sys_data, sbuf, args, user_sizes); + *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes); /* * user_size is the amount of data to append. * Need to add 4 for the meta field that points to @@ -705,7 +710,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (mayfault) { if (syscall_get_data(sys_data, args, &user_ptr, - &size, user_sizes, &uargs) < 0) + &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0) return; } @@ -1204,6 +1209,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) bool mayfault; char *user_ptr; int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; + int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; int syscall_nr; int rctx; int size = 0; @@ -1233,7 +1239,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (mayfault) { if (syscall_get_data(sys_data, args, &user_ptr, - &size, user_sizes, &uargs) < 0) + &size, user_sizes, &uargs, buf_size) < 0) return; } From e77ad6da90aed0c76cfaff76012b07b9ee4edf44 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:23 -0400 Subject: [PATCH 09/42] tracing: Show printable characters in syscall arrays When displaying the contents of the user space data passed to the kernel, instead of just showing the array values, also print any printable content. Instead of just: bash-1113 [003] ..... 3433.290654: sys_write(fd: 2, buf: 0x555a8deeddb0 (72:6f:6f:74:40:64:65:62:69:61:6e:2d:78:38:36:2d:36:34:3a:7e:23:20), count: 0x16) Display: bash-1113 [003] ..... 3433.290654: sys_write(fd: 2, buf: 0x555a8deeddb0 (72:6f:6f:74:40:64:65:62:69:61:6e:2d:78:38:36:2d:36:34:3a:7e:23:20) "root@debian-x86-64:~# ", count: 0x16) This only affects tracing and does not affect perf, as this only updates the output from the kernel. The output from perf is via user space. This may change by an update to libtraceevent that will then update perf to have this as well. 
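For reference, a stand-alone model of the rendering rule above (an
editor's sketch in plain C using POSIX isascii(), not the kernel code;
the kernel side in the diff below does the same thing with trace_seq
helpers):

  #include <ctype.h>
  #include <stdio.h>

  /* Dump the buffer as colon-separated hex, then append a quoted
   * string only if at least one byte is printable ASCII; bytes that
   * are not printable are shown as '.'. */
  static void show_user_array(const unsigned char *buf, int len)
  {
  	int printable = 0;

  	printf("(");
  	for (int x = 0; x < len; x++) {
  		if (isascii(buf[x]) && isprint(buf[x]))
  			printable = 1;
  		printf(x ? ":%02x" : "%02x", buf[x]);
  	}
  	printf(")");

  	if (!printable)
  		return;

  	printf(" \"");
  	for (int x = 0; x < len; x++)
  		putchar(isascii(buf[x]) && isprint(buf[x]) ? buf[x] : '.');
  	printf("\"");
  }

  int main(void)
  {
  	show_user_array((const unsigned char *)"root@debian:~# ", 15);
  	printf("\n");
  	return 0;
  }

Running it prints the hex dump followed by the quoted string, mirroring
the sys_write example above.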
Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Namhyung Kim
Cc: Takaya Saeki
Cc: Tom Zanussi
Cc: Thomas Gleixner
Cc: Ian Rogers
Cc: Douglas Raillard
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Adrian Hunter
Cc: Ingo Molnar
Link: https://lore.kernel.org/20251028231148.429422865@kernel.org
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_syscalls.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2de6364777a..2d1307f13e13 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -155,6 +155,8 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 	trace_seq_printf(s, "%s(", entry->name);

 	for (i = 0; i < entry->nb_args; i++) {
+		bool printable = false;
+		char *str;

 		if (trace_seq_has_overflowed(s))
 			goto end;
@@ -193,8 +195,11 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		val = trace->args[entry->user_arg_size];

+		str = ptr;
 		trace_seq_puts(s, " (");
 		for (int x = 0; x < len; x++, ptr++) {
+			if (isascii(*ptr) && isprint(*ptr))
+				printable = true;
 			if (x)
 				trace_seq_putc(s, ':');
 			trace_seq_printf(s, "%02x", *ptr);
@@ -203,6 +208,22 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 			trace_seq_printf(s, ", %s", EXTRA);
 		trace_seq_putc(s, ')');
+
+		/* If nothing is printable, don't bother printing anything */
+		if (!printable)
+			continue;
+
+		trace_seq_puts(s, " \"");
+		for (int x = 0; x < len; x++) {
+			if (isascii(str[x]) && isprint(str[x]))
+				trace_seq_putc(s, str[x]);
+			else
+				trace_seq_putc(s, '.');
+		}
+		if (len < val)
+			trace_seq_printf(s, "\"%s", EXTRA);
+		else
+			trace_seq_putc(s, '"');
 	}

 	trace_seq_putc(s, ')');

From 32e0f607ac6a2bb5d144540897535fd01be77586 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 28 Oct 2025 19:11:24 -0400
Subject: [PATCH 10/42] tracing: Add trace_seq_pop() and seq_buf_pop()

In order to allow an interface to remove an added character from the
trace_seq and seq_buf descriptors, add the helper functions
trace_seq_pop() and seq_buf_pop().

Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Namhyung Kim
Cc: Takaya Saeki
Cc: Tom Zanussi
Cc: Thomas Gleixner
Cc: Ian Rogers
Cc: Douglas Raillard
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Adrian Hunter
Cc: Ingo Molnar
Link: https://lore.kernel.org/20251028231148.594898736@kernel.org
Signed-off-by: Steven Rostedt (Google)
---
 include/linux/seq_buf.h   | 17 +++++++++++++++++
 include/linux/trace_seq.h | 13 +++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 52791e070506..9f2839e73f8a 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -149,6 +149,23 @@ static inline void seq_buf_commit(struct seq_buf *s, int num)
 	}
 }

+/**
+ * seq_buf_pop - pop off the last written character
+ * @s: the seq_buf handle
+ *
+ * Removes the last character written to the seq_buf @s.
+ *
+ * Returns the last character or -1 if it is empty.
+ */
+static inline int seq_buf_pop(struct seq_buf *s)
+{
+	if (!s->len)
+		return -1;
+
+	s->len--;
+	return (unsigned int)s->buffer[s->len];
+}
+
 extern __printf(2, 3)
 int seq_buf_printf(struct seq_buf *s, const char *fmt, ...);
 extern __printf(2, 0)
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 557780fe1c77..4a0b8c172d27 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -80,6 +80,19 @@ static inline bool trace_seq_has_overflowed(struct trace_seq *s)
 	return s->full || seq_buf_has_overflowed(&s->seq);
 }

+/**
+ * trace_seq_pop - pop off the last written character
+ * @s: trace sequence descriptor
+ *
+ * Removes the last character written to the trace_seq @s.
+ *
+ * Returns the last character or -1 if it is empty.
+ */
+static inline int trace_seq_pop(struct trace_seq *s)
+{
+	return seq_buf_pop(&s->seq);
+}
+
 /*
  * Currently only defined when tracing is enabled.
  */

From 64b627c8da9ab8ce9700ba8cba844907abd1f55a Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 28 Oct 2025 19:11:25 -0400
Subject: [PATCH 11/42] tracing: Add parsing of flags to the sys_enter_openat
 trace event

Add some logic to give the openat system call trace event a bit more
human-readable information:

 syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7f0053dc121c "/etc/ld.so.cache", flags: O_RDONLY|O_CLOEXEC, mode: 0000

The above is output from "perf script" and now shows the flags used by
the openat system call. Since the output from tracing is formatted in
the kernel, it can also omit the mode field when it is not used (when
flags does not contain O_CREAT|O_TMPFILE):

 touch-1185 [002] ...1. 1291.690154: sys_openat(dfd: 4294967196, filename: 139785545139344 "/usr/lib/locale/locale-archive", flags: O_RDONLY|O_CLOEXEC)
 touch-1185 [002] ...1. 1291.690504: sys_openat(dfd: 18446744073709551516, filename: 140733603151330 "/tmp/x", flags: O_WRONLY|O_CREAT|O_NOCTTY|O_NONBLOCK, mode: 0666)

As system calls have a fixed ABI, their trace events can be extended.
This currently only updates the openat system call, but others may be
extended in the future.

Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Namhyung Kim
Cc: Takaya Saeki
Cc: Tom Zanussi
Cc: Thomas Gleixner
Cc: Ian Rogers
Cc: Douglas Raillard
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Adrian Hunter
Cc: Ingo Molnar
Link: https://lore.kernel.org/20251028231148.763161484@kernel.org
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_syscalls.c | 192 ++++++++++++++++++++++++++++++++--
 1 file changed, 182 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 2d1307f13e13..47d9771e8f7c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -127,6 +127,116 @@ const char *get_syscall_name(int syscall)
 /* Added to user strings or arrays when max limit is reached */
 #define EXTRA "..."
+static void get_dynamic_len_ptr(struct syscall_trace_enter *trace, + struct syscall_metadata *entry, + int *offset_p, int *len_p, unsigned char **ptr_p) +{ + unsigned char *ptr; + int offset = *offset_p; + int val; + + /* This arg points to a user space string */ + ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset; + val = *(int *)ptr; + + /* The value is a dynamic string (len << 16 | offset) */ + ptr = (void *)trace + (val & 0xffff); + *len_p = val >> 16; + offset += 4; + + *ptr_p = ptr; + *offset_p = offset; +} + +static enum print_line_t +sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry, + struct trace_seq *s, struct trace_event *event) +{ + unsigned char *ptr; + int offset = 0; + int bits, len; + bool done = false; + static const struct trace_print_flags __flags[] = + { + { O_TMPFILE, "O_TMPFILE" }, + { O_WRONLY, "O_WRONLY" }, + { O_RDWR, "O_RDWR" }, + { O_CREAT, "O_CREAT" }, + { O_EXCL, "O_EXCL" }, + { O_NOCTTY, "O_NOCTTY" }, + { O_TRUNC, "O_TRUNC" }, + { O_APPEND, "O_APPEND" }, + { O_NONBLOCK, "O_NONBLOCK" }, + { O_DSYNC, "O_DSYNC" }, + { O_DIRECT, "O_DIRECT" }, + { O_LARGEFILE, "O_LARGEFILE" }, + { O_DIRECTORY, "O_DIRECTORY" }, + { O_NOFOLLOW, "O_NOFOLLOW" }, + { O_NOATIME, "O_NOATIME" }, + { O_CLOEXEC, "O_CLOEXEC" }, + { -1, NULL } + }; + + trace_seq_printf(s, "%s(", entry->name); + + for (int i = 0; !done && i < entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + + if (i) + trace_seq_puts(s, ", "); + + switch (i) { + case 2: + bits = trace->args[2]; + + trace_seq_puts(s, "flags: "); + + /* No need to show mode when not creating the file */ + if (!(bits & (O_CREAT|O_TMPFILE))) + done = true; + + if (!(bits & O_ACCMODE)) { + if (!bits) { + trace_seq_puts(s, "O_RDONLY"); + continue; + } + trace_seq_puts(s, "O_RDONLY|"); + } + + trace_print_flags_seq(s, "|", bits, __flags); + /* + * trace_print_flags_seq() adds a '\0' to the + * buffer, but this needs to append more to the seq. 
+ */ + if (!trace_seq_has_overflowed(s)) + trace_seq_pop(s); + + continue; + case 3: + trace_seq_printf(s, "%s: 0%03o", entry->args[i], + (unsigned int)trace->args[i]); + continue; + } + + trace_seq_printf(s, "%s: %lu", entry->args[i], + trace->args[i]); + + if (!(BIT(i) & entry->user_mask)) + continue; + + get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr); + trace_seq_printf(s, " \"%.*s\"", len, ptr); + } + + trace_seq_putc(s, ')'); +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -152,6 +262,15 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; } + switch (entry->syscall_nr) { + case __NR_openat: + if (!tr || !(tr->trace_flags & TRACE_ITER_VERBOSE)) + return sys_enter_openat_print(trace, entry, s, event); + break; + default: + break; + } + trace_seq_printf(s, "%s(", entry->name); for (i = 0; i < entry->nb_args; i++) { @@ -179,14 +298,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, if (!(BIT(i) & entry->user_mask)) continue; - /* This arg points to a user space string */ - ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset; - val = *(int *)ptr; - - /* The value is a dynamic string (len << 16 | offset) */ - ptr = (void *)ent + (val & 0xffff); - len = val >> 16; - offset += 4; + get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr); if (entry->user_arg_size < 0 || entry->user_arg_is_str) { trace_seq_printf(s, " \"%.*s\"", len, ptr); @@ -269,6 +381,62 @@ print_syscall_exit(struct trace_iterator *iter, int flags, .size = sizeof(_type), .align = __alignof__(_type), \ .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER } +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + +static int __init +sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, + "\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\","); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->dfd)),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " ((unsigned long)(REC->filename)),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " __get_str(__filename_val),"); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", "); + pos += snprintf(buf + pos, LEN_OR_ZERO, + " REC->flags ? 
__print_flags(REC->flags, \"|\", ");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" ((unsigned long)(REC->mode))");
+	return pos;
+}
+
 static int __init
 __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 {
@@ -276,8 +444,12 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	int i;
 	int pos = 0;

-	/* When len=0, we just calculate the needed length */
-#define LEN_OR_ZERO (len ? len - pos : 0)
+	switch (entry->syscall_nr) {
+	case __NR_openat:
+		return sys_enter_openat_print_fmt(entry, buf, len);
+	default:
+		break;
+	}

 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
 	for (i = 0; i < entry->nb_args; i++) {

From b6e5d971fc5cf35cb64eceb2b43dbd5e4572d640 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 28 Oct 2025 19:11:26 -0400
Subject: [PATCH 12/42] tracing: Check for printable characters when printing
 field dyn strings

When the "fields" option is enabled, it prints each trace event field
based on its type. But a dynamic array and a dynamic string can both
have a "char *" type. Printing it as a string can cause escape
characters to be printed and mess up the output of the trace.

For dynamic strings, test if there are any non-printable characters,
and if so, print both the string, with the non-printable characters
shown as '.', and the hex values of the array.
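Both dynamic strings and dynamic arrays are located the same way: the
event stores a 32-bit meta word whose low 16 bits are the offset of the
payload from the start of the event record and whose high 16 bits are
its length (the same "val & 0xffff" / "val >> 16" unpacking used
throughout this series). A minimal sketch of the decode, with an
illustrative helper name:

  /* Unpack a __data_loc-style meta word: payload offset from the
   * start of the event record in the low 16 bits, length in the
   * high 16 bits. */
  static inline void dyn_field_decode(unsigned int val,
  				      unsigned int *offset,
  				      unsigned int *len)
  {
  	*offset = val & 0xffff;
  	*len = val >> 16;
  }

print_fields() in the diff below works with such an offset/length pair
when it walks the bytes before deciding whether to mask the string.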
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231148.929243047@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 97db0b0ccf3e..718b255b6fd8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -950,7 +950,9 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c int offset; int len; int ret; + int i; void *pos; + char *str; list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); @@ -977,8 +979,29 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c trace_seq_puts(&iter->seq, ""); break; } - pos = (void *)iter->ent + offset; - trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos); + str = (char *)iter->ent + offset; + /* Check if there's any non printable strings */ + for (i = 0; i < len; i++) { + if (str[i] && !(isascii(str[i]) && isprint(str[i]))) + break; + } + if (i < len) { + for (i = 0; i < len; i++) { + if (isascii(str[i]) && isprint(str[i])) + trace_seq_putc(&iter->seq, str[i]); + else + trace_seq_putc(&iter->seq, '.'); + } + trace_seq_puts(&iter->seq, " ("); + for (i = 0; i < len; i++) { + if (i) + trace_seq_putc(&iter->seq, ':'); + trace_seq_printf(&iter->seq, "%02x", str[i]); + } + trace_seq_putc(&iter->seq, ')'); + } else { + trace_seq_printf(&iter->seq, "%.*s", len, str); + } break; case FILTER_PTR_STRING: if (!iter->fmt_size) From 25bd47a592751eba6ed337e6293dc69f8aa2452f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:27 -0400 Subject: [PATCH 13/42] tracing: Have persistent ring buffer print syscalls normally The persistent ring buffer from a previous boot has to be careful printing events as the print formats of random events can have pointers to strings and such that are not available. Ftrace static events (like the function tracer event) are stable and are printed normally. System call event formats are also stable. Allow them to be printed normally as well: Instead of: <...>-1 [005] ...1. 57.240405: sys_enter_waitid: __syscall_nr=0xf7 (247) which=0x1 (1) upid=0x499 (1177) infop=0x7ffd5294d690 (140725988939408) options=0x5 (5) ru=0x0 (0) <...>-1 [005] ...1. 57.240433: sys_exit_waitid: __syscall_nr=0xf7 (247) ret=0x0 (0) <...>-1 [005] ...1. 57.240437: sys_enter_rt_sigprocmask: __syscall_nr=0xe (14) how=0x2 (2) nset=0x7ffd5294d7c0 (140725988939712) oset=0x0 (0) sigsetsize=0x8 (8) <...>-1 [005] ...1. 57.240438: sys_exit_rt_sigprocmask: __syscall_nr=0xe (14) ret=0x0 (0) <...>-1 [005] ...1. 57.240442: sys_enter_close: __syscall_nr=0x3 (3) fd=0x4 (4) <...>-1 [005] ...1. 57.240463: sys_exit_close: __syscall_nr=0x3 (3) ret=0x0 (0) <...>-1 [005] ...1. 57.240485: sys_enter_openat: __syscall_nr=0x101 (257) dfd=0xffffffffffdfff9c (-2097252) filename=(0xffff8b81639ca01c) flags=0x80000 (524288) mode=0x0 (0) __filename_val=/run/systemd/reboot-param <...>-1 [005] ...1. 57.240555: sys_exit_openat: __syscall_nr=0x101 (257) ret=0xffffffffffdffffe (-2097154) <...>-1 [005] ...1. 
57.240571: sys_enter_openat: __syscall_nr=0x101 (257) dfd=0xffffffffffdfff9c (-2097252) filename=(0xffff8b81639ca01c) flags=0x80000 (524288) mode=0x0 (0) __filename_val=/run/systemd/reboot-param <...>-1 [005] ...1. 57.240620: sys_exit_openat: __syscall_nr=0x101 (257) ret=0xffffffffffdffffe (-2097154) <...>-1 [005] ...1. 57.240629: sys_enter_writev: __syscall_nr=0x14 (20) fd=0x3 (3) vec=0x7ffd5294ce50 (140725988937296) vlen=0x7 (7) <...>-1 [005] ...1. 57.242281: sys_exit_writev: __syscall_nr=0x14 (20) ret=0x24 (36) <...>-1 [005] ...1. 57.242286: sys_enter_reboot: __syscall_nr=0xa9 (169) magic1=0xfee1dead (4276215469) magic2=0x28121969 (672274793) cmd=0x1234567 (19088743) arg=0x0 (0) Have: <...>-1 [000] ...1. 91.446011: sys_waitid(which: 1, upid: 0x4d2, infop: 0x7ffdccdadfd0, options: 5, ru: 0) <...>-1 [000] ...1. 91.446042: sys_waitid -> 0x0 <...>-1 [000] ...1. 91.446045: sys_rt_sigprocmask(how: 2, nset: 0x7ffdccdae100, oset: 0, sigsetsize: 8) <...>-1 [000] ...1. 91.446047: sys_rt_sigprocmask -> 0x0 <...>-1 [000] ...1. 91.446051: sys_close(fd: 4) <...>-1 [000] ...1. 91.446073: sys_close -> 0x0 <...>-1 [000] ...1. 91.446095: sys_openat(dfd: 18446744073709551516, filename: 139732544945794 "/run/systemd/reboot-param", flags: O_RDONLY|O_CLOEXEC) <...>-1 [000] ...1. 91.446165: sys_openat -> 0xfffffffffffffffe <...>-1 [000] ...1. 91.446182: sys_openat(dfd: 18446744073709551516, filename: 139732544945794 "/run/systemd/reboot-param", flags: O_RDONLY|O_CLOEXEC) <...>-1 [000] ...1. 91.446233: sys_openat -> 0xfffffffffffffffe <...>-1 [000] ...1. 91.446242: sys_writev(fd: 3, vec: 0x7ffdccdad790, vlen: 7) <...>-1 [000] ...1. 91.447877: sys_writev -> 0x24 <...>-1 [000] ...1. 91.447883: sys_reboot(magic1: 0xfee1dead, magic2: 0x28121969, cmd: 0x1234567, arg: 0) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231149.097404581@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2aee9a3088f4..a765792d3428 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -4219,6 +4220,22 @@ static void test_cpu_buff_start(struct trace_iterator *iter) iter->cpu); } +#ifdef CONFIG_FTRACE_SYSCALLS +static bool is_syscall_event(struct trace_event *event) +{ + return (event->funcs == &enter_syscall_print_funcs) || + (event->funcs == &exit_syscall_print_funcs); + +} +#define syscall_buf_size CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT +#else +static inline bool is_syscall_event(struct trace_event *event) +{ + return false; +} +#define syscall_buf_size 0 +#endif /* CONFIG_FTRACE_SYSCALLS */ + static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; @@ -4251,10 +4268,12 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) * safe to use if the array has delta offsets * Force printing via the fields. 
*/ - if ((tr->text_delta) && - event->type > __TRACE_LAST_TYPE) + if ((tr->text_delta)) { + /* ftrace and system call events are still OK */ + if ((event->type > __TRACE_LAST_TYPE) && + !is_syscall_event(event)) return print_event_fields(iter, event); - + } return event->funcs->trace(iter, sym_flags, event); } @@ -11436,7 +11455,7 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; - global_trace.syscall_buf_sz = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; + global_trace.syscall_buf_sz = syscall_buf_size; INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); From bbec8e28cac5928c20052c489cb2e345e6bd4271 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 31 Oct 2025 11:46:11 +0900 Subject: [PATCH 14/42] tracing: Allow tracer to add more than 32 options Since enum trace_iterator_flags is 32bit, the max number of the option flags is limited to 32 and it is fully used now. To add a new option, we need to expand it. So replace the TRACE_ITER_##flag with TRACE_ITER(flag) macro which is 64bit bitmask. Link: https://lore.kernel.org/all/176187877103.994619.166076000668757232.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/blktrace.c | 6 +- kernel/trace/trace.c | 151 ++++++++++++++------------- kernel/trace/trace.h | 29 +++-- kernel/trace/trace_events.c | 4 +- kernel/trace/trace_events_synth.c | 2 +- kernel/trace/trace_fprobe.c | 6 +- kernel/trace/trace_functions_graph.c | 18 ++-- kernel/trace/trace_irqsoff.c | 30 +++--- kernel/trace/trace_kdb.c | 2 +- kernel/trace/trace_kprobe.c | 6 +- kernel/trace/trace_output.c | 18 ++-- kernel/trace/trace_output.h | 11 ++ kernel/trace/trace_sched_wakeup.c | 24 ++--- kernel/trace/trace_syscalls.c | 2 +- 14 files changed, 159 insertions(+), 150 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6941145b5058..e21176f396d5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1452,7 +1452,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, t = te_blk_io_trace(iter->ent); what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; - long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); + long_act = !!(tr->trace_flags & TRACE_ITER(VERBOSE)); log_action = classic ? 
&blk_log_action_classic : &blk_log_action; has_cg = t->action & __BLK_TA_CGROUP; @@ -1517,9 +1517,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) - tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + tr->trace_flags &= ~TRACE_ITER(CONTEXT_INFO); else - tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; + tr->trace_flags |= TRACE_ITER(CONTEXT_INFO); } return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1e527cf2aae..14e8703a6a53 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -513,21 +513,21 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ (FUNCTION_DEFAULT_FLAGS | \ - TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ - TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \ - TRACE_ITER_COPY_MARKER) + TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \ + TRACE_ITER(ANNOTATE) | TRACE_ITER(CONTEXT_INFO) | \ + TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \ + TRACE_ITER(IRQ_INFO) | TRACE_ITER(MARKERS) | \ + TRACE_ITER(HASH_PTR) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) /* trace_options that are only supported by global_trace */ -#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ - TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \ + TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD)) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \ - TRACE_ITER_COPY_MARKER) + (TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) /* * The global_trace is the descriptor that holds the top-level tracing @@ -558,9 +558,9 @@ static void update_printk_trace(struct trace_array *tr) if (printk_trace == tr) return; - printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; + printk_trace->trace_flags &= ~TRACE_ITER(TRACE_PRINTK); printk_trace = tr; - tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; + tr->trace_flags |= TRACE_ITER(TRACE_PRINTK); } /* Returns true if the status of tr changed */ @@ -573,7 +573,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled) return false; list_add_rcu(&tr->marker_list, &marker_copies); - tr->trace_flags |= TRACE_ITER_COPY_MARKER; + tr->trace_flags |= TRACE_ITER(COPY_MARKER); return true; } @@ -581,7 +581,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled) return false; list_del_init(&tr->marker_list); - tr->trace_flags &= ~TRACE_ITER_COPY_MARKER; + tr->trace_flags &= ~TRACE_ITER(COPY_MARKER); return true; } @@ -1139,7 +1139,7 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, unsigned int trace_ctx; int alloc; - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; if (unlikely(tracing_selftest_running && tr == &global_trace)) @@ -1205,7 +1205,7 @@ int __trace_bputs(unsigned long ip, const char *str) if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -3078,7 +3078,7 @@ static inline void 
ftrace_trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip, struct pt_regs *regs) { - if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(STACKTRACE))) return; __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); @@ -3139,7 +3139,7 @@ ftrace_trace_userstack(struct trace_array *tr, struct ring_buffer_event *event; struct userstack_entry *entry; - if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(USERSTACKTRACE))) return; /* @@ -3484,7 +3484,7 @@ int trace_array_printk(struct trace_array *tr, if (tr == &global_trace) return 0; - if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; va_start(ap, fmt); @@ -3521,7 +3521,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer, int ret; va_list ap; - if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK)) + if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK))) return 0; va_start(ap, fmt); @@ -3791,7 +3791,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt) if (WARN_ON_ONCE(!fmt)) return fmt; - if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) + if (!iter->tr || iter->tr->trace_flags & TRACE_ITER(HASH_PTR)) return fmt; p = fmt; @@ -4113,7 +4113,7 @@ static void print_event_info(struct array_buffer *buf, struct seq_file *m) static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { - bool tgid = flags & TRACE_ITER_RECORD_TGID; + bool tgid = flags & TRACE_ITER(RECORD_TGID); print_event_info(buf, m); @@ -4124,7 +4124,7 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { - bool tgid = flags & TRACE_ITER_RECORD_TGID; + bool tgid = flags & TRACE_ITER(RECORD_TGID); static const char space[] = " "; int prec = tgid ? 
12 : 2; @@ -4197,7 +4197,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_array *tr = iter->tr; - if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) + if (!(tr->trace_flags & TRACE_ITER(ANNOTATE))) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) @@ -4233,7 +4233,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else @@ -4244,7 +4244,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) return TRACE_TYPE_PARTIAL_LINE; if (event) { - if (tr->trace_flags & TRACE_ITER_FIELDS) + if (tr->trace_flags & TRACE_ITER(FIELDS)) return print_event_fields(iter, event); /* * For TRACE_EVENT() events, the print_fmt is not @@ -4272,7 +4272,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); @@ -4298,7 +4298,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); @@ -4327,7 +4327,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); @@ -4398,27 +4398,27 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) } if (iter->ent->type == TRACE_BPUTS && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bputs_msg_only(iter); if (iter->ent->type == TRACE_BPRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_printk_msg_only(iter); - if (trace_flags & TRACE_ITER_BIN) + if (trace_flags & TRACE_ITER(BIN)) return print_bin_fmt(iter); - if (trace_flags & TRACE_ITER_HEX) + if (trace_flags & TRACE_ITER(HEX)) return print_hex_fmt(iter); - if (trace_flags & TRACE_ITER_RAW) + if (trace_flags & TRACE_ITER(RAW)) return print_raw_fmt(iter); return print_trace_fmt(iter); @@ -4436,7 +4436,7 @@ void trace_latency_header(struct seq_file *m) if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); - if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) + if (!(tr->trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } @@ -4446,7 +4446,7 @@ void trace_default_header(struct seq_file *m) struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(trace_flags & TRACE_ITER(CONTEXT_INFO))) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) { @@ -4454,11 
+4454,11 @@ void trace_default_header(struct seq_file *m) if (trace_empty(iter)) return; print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) { - if (trace_flags & TRACE_ITER_IRQ_INFO) + if (!(trace_flags & TRACE_ITER(VERBOSE))) { + if (trace_flags & TRACE_ITER(IRQ_INFO)) print_func_help_header_irq(iter->array_buffer, m, trace_flags); else @@ -4682,7 +4682,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) * If pause-on-trace is enabled, then stop the trace while * dumping, unless this is the "snapshot" file */ - if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) tracing_stop_tr(tr); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { @@ -4876,7 +4876,7 @@ static int tracing_open(struct inode *inode, struct file *file) iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); - else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + else if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; } @@ -5148,7 +5148,7 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { - if (tr->trace_flags & (1 << i)) + if (tr->trace_flags & (1ULL << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); @@ -5201,20 +5201,20 @@ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) } /* Some tracers require overwrite to stay enabled */ -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set) { - if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + if (tracer->enabled && (mask & TRACE_ITER(OVERWRITE)) && !set) return -1; return 0; } -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled) { - if ((mask == TRACE_ITER_RECORD_TGID) || - (mask == TRACE_ITER_RECORD_CMD) || - (mask == TRACE_ITER_TRACE_PRINTK) || - (mask == TRACE_ITER_COPY_MARKER)) + if ((mask == TRACE_ITER(RECORD_TGID)) || + (mask == TRACE_ITER(RECORD_CMD)) || + (mask == TRACE_ITER(TRACE_PRINTK)) || + (mask == TRACE_ITER(COPY_MARKER))) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ @@ -5226,7 +5226,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; - if (mask == TRACE_ITER_TRACE_PRINTK) { + if (mask == TRACE_ITER(TRACE_PRINTK)) { if (enabled) { update_printk_trace(tr); } else { @@ -5245,7 +5245,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) } } - if (mask == TRACE_ITER_COPY_MARKER) + if (mask == TRACE_ITER(COPY_MARKER)) update_marker_trace(tr, enabled); if (enabled) @@ -5253,33 +5253,33 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) else tr->trace_flags &= ~mask; - if (mask == TRACE_ITER_RECORD_CMD) + if (mask == TRACE_ITER(RECORD_CMD)) trace_event_enable_cmd_record(enabled); - if (mask == TRACE_ITER_RECORD_TGID) { + if (mask == TRACE_ITER(RECORD_TGID)) { if (trace_alloc_tgid_map() < 0) { - tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + tr->trace_flags &= ~TRACE_ITER(RECORD_TGID); return -ENOMEM; } 
trace_event_enable_tgid_record(enabled); } - if (mask == TRACE_ITER_EVENT_FORK) + if (mask == TRACE_ITER(EVENT_FORK)) trace_event_follow_fork(tr, enabled); - if (mask == TRACE_ITER_FUNC_FORK) + if (mask == TRACE_ITER(FUNC_FORK)) ftrace_pid_follow_fork(tr, enabled); - if (mask == TRACE_ITER_OVERWRITE) { + if (mask == TRACE_ITER(OVERWRITE)) { ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif } - if (mask == TRACE_ITER_PRINTK) { + if (mask == TRACE_ITER(PRINTK)) { trace_printk_start_stop_comm(enabled); trace_printk_control(enabled); } @@ -5311,7 +5311,7 @@ int trace_set_options(struct trace_array *tr, char *option) if (ret < 0) ret = set_tracer_option(tr, cmp, neg); else - ret = set_tracer_flag(tr, 1 << ret, !neg); + ret = set_tracer_flag(tr, 1ULL << ret, !neg); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -6532,7 +6532,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -6593,7 +6593,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl if (trace_buffer_iter(iter, iter->cpu_file)) return EPOLLIN | EPOLLRDNORM; - if (tr->trace_flags & TRACE_ITER_BLOCK) + if (tr->trace_flags & TRACE_ITER(BLOCK)) /* * Always select as readable when in blocking mode */ @@ -7145,7 +7145,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; /* disable tracing ? 
*/ - if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) + if (tr->trace_flags & TRACE_ITER(STOP_ON_FREE)) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); @@ -7395,7 +7395,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) return -EINVAL; if ((ssize_t)cnt < 0) @@ -7479,7 +7479,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) return -EINVAL; /* The marker must at least have a tag id */ @@ -9305,7 +9305,7 @@ trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, get_tr_index(tr_index, &tr, &index); - if (tr->trace_flags & (1 << index)) + if (tr->trace_flags & (1ULL << index)) buf = "1\n"; else buf = "0\n"; @@ -9334,7 +9334,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = set_tracer_flag(tr, 1 << index, val); + ret = set_tracer_flag(tr, 1ULL << index, val); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -9498,8 +9498,9 @@ static void create_trace_options_dir(struct trace_array *tr) for (i = 0; trace_options[i]; i++) { if (top_level || - !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) + !((1ULL << i) & TOP_LEVEL_TRACE_FLAGS)) { create_trace_option_core_file(tr, trace_options[i], i); + } } } @@ -9820,7 +9821,7 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size struct trace_scratch *tscratch; unsigned int scratch_size = 0; - rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + rb_flags = tr->trace_flags & TRACE_ITER(OVERWRITE) ? 
RB_FL_OVERWRITE : 0; buf->tr = tr; @@ -10183,7 +10184,7 @@ static int __remove_instance(struct trace_array *tr) /* Disable all the flags that were enabled coming in */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { if ((1 << i) & ZEROED_TRACE_FLAGS) - set_tracer_flag(tr, 1 << i, 0); + set_tracer_flag(tr, 1ULL << i, 0); } if (printk_trace == tr) @@ -10773,10 +10774,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m /* While dumping, do not allow the buffer to be enable */ tracer_tracing_disable(tr); - old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; + old_userobj = tr->trace_flags & TRACE_ITER(SYM_USEROBJ); /* don't look at user memory in panic mode */ - tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ); if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 85eabb454bee..8c99136619bf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -216,7 +216,7 @@ struct array_buffer { int cpu; }; -#define TRACE_FLAGS_MAX_SIZE 32 +#define TRACE_FLAGS_MAX_SIZE 64 struct trace_options { struct tracer *tracer; @@ -390,7 +390,7 @@ struct trace_array { int buffer_percent; unsigned int n_err_log_entries; struct tracer *current_trace; - unsigned int trace_flags; + u64 trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; @@ -631,7 +631,7 @@ struct tracer { u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ int (*flag_changed)(struct trace_array *tr, - u32 mask, int set); + u64 mask, int set); struct tracer *next; struct tracer_flags *flags; int enabled; @@ -1345,11 +1345,11 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, # define FUNCTION_FLAGS \ C(FUNCTION, "function-trace"), \ C(FUNC_FORK, "function-fork"), -# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION +# define FUNCTION_DEFAULT_FLAGS TRACE_ITER(FUNCTION) #else # define FUNCTION_FLAGS # define FUNCTION_DEFAULT_FLAGS 0UL -# define TRACE_ITER_FUNC_FORK 0UL +# define TRACE_ITER_FUNC_FORK_BIT -1 #endif #ifdef CONFIG_STACKTRACE @@ -1391,7 +1391,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(TRACE_PRINTK, "trace_printk_dest"), \ - C(COPY_MARKER, "copy_trace_marker"),\ + C(COPY_MARKER, "copy_trace_marker"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ @@ -1413,20 +1413,17 @@ enum trace_iterator_bits { }; /* - * By redefining C, we can make TRACE_FLAGS a list of masks that - * use the bits as defined above. + * And use TRACE_ITER(flag) to define the bit masks. */ -#undef C -#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) - -enum trace_iterator_flags { TRACE_FLAGS }; +#define TRACE_ITER(flag) \ + (TRACE_ITER_##flag##_BIT < 0 ? 0 : 1ULL << (TRACE_ITER_##flag##_BIT)) /* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. 
*/ #define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + (TRACE_ITER(PRINT_PARENT)|TRACE_ITER(SYM_OFFSET)|TRACE_ITER(SYM_ADDR)) extern struct tracer nop_trace; @@ -1435,7 +1432,7 @@ extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { - if (tr->trace_flags & TRACE_ITER_BRANCH) + if (tr->trace_flags & TRACE_ITER(BRANCH)) return enable_branch_tracing(tr); return 0; } @@ -2064,8 +2061,8 @@ extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); void trace_printk_start_comm(void); -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set); +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled); /* Used from boot time tracer */ extern int trace_set_options(struct trace_array *tr, char *option); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e00da4182deb..9b07ad9eb284 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -845,13 +845,13 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); - if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) { cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } - if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { tgid = true; tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index f24ee61f8884..2f19bbe73d27 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -359,7 +359,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, fmt = synth_field_fmt(se->fields[i]->type); /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER(VERBOSE)) trace_seq_printf(s, "%s ", fmt); snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index ad9d6347b5fa..53e2325800e0 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -631,7 +631,7 @@ print_fentry_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ip, flags)) goto out; trace_seq_putc(s, ')'); @@ -661,12 +661,12 @@ print_fexit_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ret_ip, flags)) goto out; trace_seq_puts(s, " <- "); - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_no_offset(s, field->func, flags)) goto out; trace_seq_putc(s, ')'); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7f4b9a47a71..fe9607edc8f9 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -703,7 +703,7 @@ print_graph_irq(struct 
trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return; - if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); @@ -723,7 +723,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* Latency format */ - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) print_graph_lat_fmt(s, ent); } @@ -777,7 +777,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags) { if (!(flags & TRACE_GRAPH_PRINT_DURATION) || - !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + !(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; /* No real adata, just filling the column with spaces */ @@ -818,7 +818,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e trace_seq_puts(s, " /*"); trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET); + seq_print_ip_sym_offset(s, entry->graph_ent.retaddr, trace_flags); if (comment) trace_seq_puts(s, " */"); @@ -1054,7 +1054,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, /* Interrupt */ print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; /* Absolute time */ @@ -1076,7 +1076,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, } /* Latency format */ - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) print_graph_lat_fmt(s, ent); return; @@ -1495,7 +1495,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) static void __print_graph_headers_flags(struct trace_array *tr, struct seq_file *s, u32 flags) { - int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT; + int lat = tr->trace_flags & TRACE_ITER(LATENCY_FMT); if (lat) print_lat_header(s, flags); @@ -1543,10 +1543,10 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) struct trace_iterator *iter = s->private; struct trace_array *tr = iter->tr; - if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER(CONTEXT_INFO))) return; - if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) { + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 4c45c49b06c8..17673905907c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -63,7 +63,7 @@ irq_trace(void) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int irqsoff_display_graph(struct trace_array *tr, int set); -# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH)) #else static inline int irqsoff_display_graph(struct trace_array *tr, int set) { @@ -485,8 +485,8 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION)))) return 0; if (graph) @@ -515,7 +515,7 @@ static void 
unregister_irqsoff_function(struct trace_array *tr, int graph) static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) { - if (!(mask & TRACE_ITER_FUNCTION)) + if (!(mask & TRACE_ITER(FUNCTION))) return 0; if (set) @@ -536,7 +536,7 @@ static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set } #endif /* CONFIG_FUNCTION_TRACER */ -static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) +static int irqsoff_flag_changed(struct trace_array *tr, u64 mask, int set) { struct tracer *tracer = tr->current_trace; @@ -544,7 +544,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) + if (mask & TRACE_ITER(DISPLAY_GRAPH)) return irqsoff_display_graph(tr, set); #endif @@ -582,10 +582,10 @@ static int __irqsoff_tracer_init(struct trace_array *tr) save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1); /* without pause, we will produce garbage if another latency occurs */ - set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1); + set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), 1); tr->max_latency = 0; irqsoff_trace = tr; @@ -605,15 +605,15 @@ static int __irqsoff_tracer_init(struct trace_array *tr) static void __irqsoff_tracer_reset(struct trace_array *tr) { - int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; - int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; - int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE; + int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT); + int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE); + int pause_flag = save_flags & TRACE_ITER(PAUSE_ON_TRACE); stop_irqsoff_tracer(tr, is_graph(tr)); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); - set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag); + set_tracer_flag(tr, TRACE_ITER(PAUSE_ON_TRACE), pause_flag); ftrace_reset_array_ops(tr); irqsoff_busy = false; diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 896ff78b8349..b30795f34079 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -31,7 +31,7 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ - tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ); kdb_printf("Dumping ftrace buffer:\n"); if (skip_entries) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ee8171b19bee..9953506370a5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1584,7 +1584,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ip, flags)) goto out; trace_seq_putc(s, ')'); @@ -1614,12 +1614,12 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_offset(s, field->ret_ip, 
flags)) goto out; trace_seq_puts(s, " <- "); - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + if (!seq_print_ip_sym_no_offset(s, field->func, flags)) goto out; trace_seq_putc(s, ')'); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 97db0b0ccf3e..a2403d8f7c39 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -420,7 +420,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, } mmap_read_unlock(mm); } - if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + if (ret && ((sym_flags & TRACE_ITER(SYM_ADDR)) || !file)) trace_seq_printf(s, " <" IP_FMT ">", ip); return !trace_seq_has_overflowed(s); } @@ -433,9 +433,9 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER(SYM_OFFSET)); - if (sym_flags & TRACE_ITER_SYM_ADDR) + if (sym_flags & TRACE_ITER(SYM_ADDR)) trace_seq_printf(s, " <" IP_FMT ">", ip); out: @@ -569,7 +569,7 @@ static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { struct trace_array *tr = iter->tr; - unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; + unsigned long verbose = tr->trace_flags & TRACE_ITER(VERBOSE); unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; @@ -636,7 +636,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); - if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) { unsigned int tgid = trace_find_tgid(entry->pid); if (!tgid) @@ -647,7 +647,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "[%03d] ", iter->cpu); - if (tr->trace_flags & TRACE_ITER_IRQ_INFO) + if (tr->trace_flags & TRACE_ITER(IRQ_INFO)) trace_print_lat_fmt(s, entry); trace_print_time(s, iter, iter->ts); @@ -661,7 +661,7 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_entry *entry, *next_entry; struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); + unsigned long verbose = (tr->trace_flags & TRACE_ITER(VERBOSE)); u64 next_ts; next_entry = trace_find_next_entry(iter, NULL, &next_ts); @@ -1127,7 +1127,7 @@ static void print_fn_trace(struct trace_seq *s, unsigned long ip, if (args) print_function_args(s, args, ip); - if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + if ((flags & TRACE_ITER(PRINT_PARENT)) && parent_ip) { trace_seq_puts(s, " <-"); seq_print_ip_sym(s, parent_ip, flags); } @@ -1417,7 +1417,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "\n"); - if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) { + if (tr->trace_flags & TRACE_ITER(SYM_USEROBJ)) { struct task_struct *task; /* * we do the lookup on the thread group leader, diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 2e305364f2a9..99b676733d46 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,17 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +static inline int seq_print_ip_sym_offset(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags) +{ + return seq_print_ip_sym(s, ip, sym_flags | 
TRACE_ITER(SYM_OFFSET)); +} +static inline int seq_print_ip_sym_no_offset(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags) +{ + return seq_print_ip_sym(s, ip, sym_flags & ~TRACE_ITER(SYM_OFFSET)); +} + extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e3f2e4f56faa..8faa73d3bba1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -41,7 +41,7 @@ static void stop_func_tracer(struct trace_array *tr, int graph); static int save_flags; #ifdef CONFIG_FUNCTION_GRAPH_TRACER -# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER(DISPLAY_GRAPH)) #else # define is_graph(tr) false #endif @@ -247,8 +247,8 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + /* 'set' is set if TRACE_ITER(FUNCTION) is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER(FUNCTION)))) return 0; if (graph) @@ -277,7 +277,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) { - if (!(mask & TRACE_ITER_FUNCTION)) + if (!(mask & TRACE_ITER(FUNCTION))) return 0; if (set) @@ -324,7 +324,7 @@ __trace_function(struct trace_array *tr, trace_function(tr, ip, parent_ip, trace_ctx, NULL); } -static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +static int wakeup_flag_changed(struct trace_array *tr, u64 mask, int set) { struct tracer *tracer = tr->current_trace; @@ -332,7 +332,7 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) return 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) + if (mask & TRACE_ITER(DISPLAY_GRAPH)) return wakeup_display_graph(tr, set); #endif @@ -681,8 +681,8 @@ static int __wakeup_tracer_init(struct trace_array *tr) save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), 1); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), 1); tr->max_latency = 0; wakeup_trace = tr; @@ -725,15 +725,15 @@ static int wakeup_dl_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { - int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; - int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + int lat_flag = save_flags & TRACE_ITER(LATENCY_FMT); + int overwrite_flag = save_flags & TRACE_ITER(OVERWRITE); stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER(LATENCY_FMT), lat_flag); + set_tracer_flag(tr, TRACE_ITER(OVERWRITE), overwrite_flag); ftrace_reset_array_ops(tr); wakeup_busy = false; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0f932b22f9ec..e2c679bd7ace 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -157,7 
+157,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		trace_seq_puts(s, ", ");
 
 		/* parameter types */
-		if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
+		if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
 			trace_seq_printf(s, "%s ", entry->types[i]);
 
 		/* parameter values */

From 1149fcf75972f6918aeb05303b1aa1e38e0df6eb Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Fri, 31 Oct 2025 11:46:20 +0900
Subject: [PATCH 15/42] tracing: Add an option to show symbols in _text+offset
 for function profiler

The function profiler shows the hit count of each function using its
symbol name. However, there are some same-name local symbols, which we
cannot distinguish. To solve this issue, introduce an option to show the
symbols in "_text+OFFSET" format. This also avoids exposing the random
shift of KASLR. The functions in modules are shown as "MODNAME+OFFSET"
where the offset is from ".text".

E.g. for kernel text symbols, pass vmlinux and the reported _text+OFFSET
to addr2line to find the actual function and source info;

 $ addr2line -fie vmlinux _text+3078208
 __balance_callbacks
 kernel/sched/core.c:5064

for modules, pass the module file and .text+OFFSET;

 $ addr2line -fie samples/trace_events/trace-events-sample.ko .text+8224
 do_simple_thread_func
 samples/trace_events/trace-events-sample.c:23

Link: https://lore.kernel.org/all/176187878064.994619.8878296550240416558.stgit@devnote2/

Suggested-by: Steven Rostedt (Google)
Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/ftrace.c | 26 +++++++++++++++++++++++++-
 kernel/trace/trace.c  |  5 +++--
 kernel/trace/trace.h  | 11 ++++++++++-
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 42bd2ba68a82..ab601cd9638b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -534,7 +534,9 @@ static int function_stat_headers(struct seq_file *m)
 
 static int function_stat_show(struct seq_file *m, void *v)
 {
+	struct trace_array *tr = trace_get_global_array();
 	struct ftrace_profile *rec = v;
+	const char *refsymbol = NULL;
 	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	static struct trace_seq s;
@@ -554,7 +556,29 @@ static int function_stat_show(struct seq_file *m, void *v)
 		return 0;
 #endif
 
-	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	if (tr->trace_flags & TRACE_ITER(PROF_TEXT_OFFSET)) {
+		unsigned long offset;
+
+		if (core_kernel_text(rec->ip)) {
+			refsymbol = "_text";
+			offset = rec->ip - (unsigned long)_text;
+		} else {
+			struct module *mod;
+
+			guard(rcu)();
+			mod = __module_text_address(rec->ip);
+			if (mod) {
+				refsymbol = mod->name;
+				/* Calculate offset from module's text entry address. */
+				offset = rec->ip - (unsigned long)mod->mem[MOD_TEXT].base;
+			}
+		}
+		if (refsymbol)
+			snprintf(str, sizeof(str), " %s+%#lx", refsymbol, offset);
+	}
+	if (!refsymbol)
+		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
 	seq_printf(m, " %-30.30s %10lu", str, rec->counter);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 14e8703a6a53..e5f186daf007 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -522,7 +522,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
 
 /* trace_options that are only supported by global_trace */
 #define TOP_LEVEL_TRACE_FLAGS	(TRACE_ITER(PRINTK) |		\
-	TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD))
+	TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) |	\
+	TRACE_ITER(PROF_TEXT_OFFSET))
 
 /* trace_flags that are default zero for instances */
 #define ZEROED_TRACE_FLAGS \
@@ -11291,7 +11292,7 @@ __init static int tracer_alloc_buffers(void)
 
 #ifdef CONFIG_FUNCTION_TRACER
 /* Used to set module cached ftrace filtering at boot up */
-__init struct trace_array *trace_get_global_array(void)
+struct trace_array *trace_get_global_array(void)
 {
 	return &global_trace;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8c99136619bf..7d8f4bd9facd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1359,6 +1359,14 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 # define STACK_FLAGS
 #endif
 
+#ifdef CONFIG_FUNCTION_PROFILER
+# define PROFILER_FLAGS \
+		C(PROF_TEXT_OFFSET,	"prof-text-offset"),
+#else
+# define PROFILER_FLAGS
+# define TRACE_ITER_PROF_TEXT_OFFSET_BIT	-1
+#endif
+
 /*
  * trace_iterator_flags is an enumeration that defines bit
  * positions into trace_flags that controls the output.
@@ -1397,7 +1405,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 	FUNCTION_FLAGS		\
 	FGRAPH_FLAGS		\
 	STACK_FLAGS		\
-	BRANCH_FLAGS
+	BRANCH_FLAGS		\
+	PROFILER_FLAGS
 
 /*
  * By defining C, we can make TRACE_FLAGS a list of bit names

From a10e6e681864ce8ae7f6957be5577f8d17669db0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 4 Nov 2025 20:53:10 -0500
Subject: [PATCH 16/42] tracing: Hide __NR_utimensat and __NR_mq_timedsend
 when not defined

Some architectures (riscv-32) do not define __NR_utimensat and
__NR_mq_timedsend, and the build fails when they are used. Hide them in
"ifdef"s.
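32-bit RISC-V, for instance, only wires up the *_time64 variants of
these calls, so the classic numbers are simply absent from its
<asm/unistd.h> and any reference to them fails to compile; a runtime
check cannot help here. As a rough userspace sketch of the guard
pattern (the helper name is made up for illustration and is not part of
the kernel code):

	#include <stdbool.h>
	#include <sys/syscall.h>	/* __NR_* numbers for this architecture */

	/* True for syscalls whose user buffer sits in argument position 1 */
	static bool write_like_syscall(long nr)
	{
		switch (nr) {
		case __NR_write:
	#ifdef __NR_mq_timedsend	/* not defined on e.g. riscv-32 */
		case __NR_mq_timedsend:
	#endif
		case __NR_pwrite64:
			return true;
		default:
			return false;
		}
	}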
Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20251104205310.00a1db9a@batman.local.home Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511031239.ZigDcWzY-lkp@intel.com/ Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_syscalls.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e07c5a3cc7ab..e96d0063cbcf 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1072,7 +1072,9 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) switch (nr) { /* user arg 1 with size arg at 2 */ case __NR_write: +#ifdef __NR_mq_timedsend case __NR_mq_timedsend: +#endif case __NR_pwrite64: sys_data->user_mask = BIT(1); sys_data->user_arg_size = 2; @@ -1186,7 +1188,9 @@ static void check_faultable_syscall(struct trace_event_call *call, int nr) case __NR_syslog: case __NR_statx: case __NR_unlinkat: +#ifdef __NR_utimensat case __NR_utimensat: +#endif sys_data->user_mask = BIT(1); break; /* user arg at position 2 */ From c7bed15ccf238fb2e47938c522a49ddc4a919f7d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2025 11:19:10 -0500 Subject: [PATCH 17/42] tracing: Remove dummy options and flags When a tracer does not define their own flags, dummy options and flags are used so that the values are always valid. There's not that many locations that reference these values so having dummy versions just complicates the code. Remove the dummy values and just check for NULL when appropriate. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251105161935.206093132@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 48 +++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e822db5d9e4..afeaa9a164e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -94,17 +94,6 @@ static bool tracepoint_printk_stop_on_boot __initdata; static bool traceoff_after_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); -/* For tracers that don't implement custom flags */ -static struct tracer_opt dummy_tracer_opt[] = { - { } -}; - -static int -dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return 0; -} - /* * To prevent the comm cache from being overwritten when no * tracing is active, only save the comm when a trace event @@ -2356,23 +2345,9 @@ int __init register_tracer(struct tracer *type) } } - if (!type->set_flag) - type->set_flag = &dummy_set_flag; - if (!type->flags) { - /*allocate a dummy tracer_flags*/ - type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); - if (!type->flags) { - ret = -ENOMEM; - goto out; - } - type->flags->val = 0; - type->flags->opts = dummy_tracer_opt; - } else - if (!type->flags->opts) - type->flags->opts = dummy_tracer_opt; - /* store the tracer for __set_tracer_option */ - type->flags->trace = type; + if (type->flags) + type->flags->trace = type; ret = do_run_tracer_selftest(type); if (ret < 0) @@ -5159,14 +5134,12 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; + struct tracer *trace; u32 tracer_flags; int i; guard(mutex)(&trace_types_lock); - tracer_flags = tr->current_trace->flags->val; - trace_opts = tr->current_trace->flags->opts; - for (i = 0; 
trace_options[i]; i++) {
 		if (tr->trace_flags & (1ULL << i))
 			seq_printf(m, "%s\n", trace_options[i]);
@@ -5174,6 +5147,13 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
 			seq_printf(m, "no%s\n", trace_options[i]);
 	}
 
+	trace = tr->current_trace;
+	if (!trace->flags || !trace->flags->opts)
+		return 0;
+
+	tracer_flags = tr->current_trace->flags->val;
+	trace_opts = tr->current_trace->flags->opts;
+
 	for (i = 0; trace_opts[i].name; i++) {
 		if (tracer_flags & trace_opts[i].bit)
 			seq_printf(m, "%s\n", trace_opts[i].name);
@@ -5189,9 +5169,10 @@ static int __set_tracer_option(struct trace_array *tr,
 			       struct tracer_opt *opts, int neg)
 {
 	struct tracer *trace = tracer_flags->trace;
-	int ret;
+	int ret = 0;
 
-	ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
+	if (trace->set_flag)
+		ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
 	if (ret)
 		return ret;
 
@@ -5210,6 +5191,9 @@ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
 	struct tracer_opt *opts = NULL;
 	int i;
 
+	if (!tracer_flags || !tracer_flags->opts)
+		return 0;
+
 	for (i = 0; tracer_flags->opts[i].name; i++) {
 		opts = &tracer_flags->opts[i];

From 5aa0d18df08a87c1d71a39c4a84c5ec63ada67c0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 5 Nov 2025 11:19:11 -0500
Subject: [PATCH 18/42] tracing: Have add_tracer_options() error pass up to
 callers

The function add_tracer_options() can fail, but currently its return
value is ignored. Pass the status of add_tracer_options() up to the
code that registers a new tracer as well as the code that creates an
instance. Have instance creation fail if add_tracer_options() fails.
Only print a warning for the top level instance, like it does with
other failures.

Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Link: https://patch.msgid.link/20251105161935.375299297@kernel.org
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.c | 55 +++++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index afeaa9a164e9..ed929d331e1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2302,7 +2302,7 @@ static inline int do_run_tracer_selftest(struct tracer *type)
 }
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
-static void add_tracer_options(struct trace_array *tr, struct tracer *t);
+static int add_tracer_options(struct trace_array *tr, struct tracer *t);
 
 static void __init apply_trace_boot_options(void);
 
@@ -2353,9 +2353,14 @@ int __init register_tracer(struct tracer *type)
 	if (ret < 0)
 		goto out;
 
+	ret = add_tracer_options(&global_trace, type);
+	if (ret < 0) {
+		pr_warn("Failed to create tracer options for %s\n", type->name);
+		goto out;
+	}
+
 	type->next = trace_types;
 	trace_types = type;
-	add_tracer_options(&global_trace, type);
 
  out:
 	mutex_unlock(&trace_types_lock);
@@ -6221,7 +6226,7 @@ int tracing_update_buffers(struct trace_array *tr)
 
 struct trace_option_dentry;
 
-static void
+static int
 create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
 
 /*
@@ -6243,17 +6248,17 @@ static void tracing_set_nop(struct trace_array *tr)
 
 static bool tracer_options_updated;
 
-static void add_tracer_options(struct trace_array *tr, struct tracer *t)
+static int add_tracer_options(struct trace_array *tr, struct tracer *t)
 {
 	/* Only enable if the directory has been created already.
*/ if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) - return; + return 0; /* Only create trace option files after update_tracer_options finish */ if (!tracer_options_updated) - return; + return 0; - create_trace_option_files(tr, t); + return create_trace_option_files(tr, t); } int tracing_set_tracer(struct trace_array *tr, const char *buf) @@ -9585,7 +9590,7 @@ create_trace_option_file(struct trace_array *tr, } -static void +static int create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; @@ -9596,24 +9601,24 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) int i; if (!tracer) - return; + return 0; flags = tracer->flags; if (!flags || !flags->opts) - return; + return 0; /* * If this is an instance, only create flags for tracers * the instance may have. */ if (!trace_ok_for_array(tracer, tr)) - return; + return 0; for (i = 0; i < tr->nr_topts; i++) { /* Make sure there's no duplicate flags. */ if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) - return; + return -EINVAL; } opts = flags->opts; @@ -9623,13 +9628,13 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) - return; + return 0; tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), GFP_KERNEL); if (!tr_topts) { kfree(topts); - return; + return -ENOMEM; } tr->topts = tr_topts; @@ -9644,6 +9649,7 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) "Failed to create trace option: %s", opts[cnt].name); } + return 0; } static struct dentry * @@ -10094,15 +10100,18 @@ static void init_trace_flags_index(struct trace_array *tr) tr->trace_flags_index[i] = i; } -static void __update_tracer_options(struct trace_array *tr) +static int __update_tracer_options(struct trace_array *tr) { struct tracer *t; + int ret = 0; - for (t = trace_types; t; t = t->next) - add_tracer_options(tr, t); + for (t = trace_types; t && !ret; t = t->next) + ret = add_tracer_options(tr, t); + + return ret; } -static void update_tracer_options(struct trace_array *tr) +static __init void update_tracer_options(struct trace_array *tr) { guard(mutex)(&trace_types_lock); tracer_options_updated = true; @@ -10151,9 +10160,13 @@ static int trace_array_create_dir(struct trace_array *tr) } init_tracer_tracefs(tr, tr->dir); - __update_tracer_options(tr); - - return ret; + ret = __update_tracer_options(tr); + if (ret) { + event_trace_del_tracer(tr); + tracefs_remove(tr->dir); + return ret; + } + return 0; } static struct trace_array * From 9c5053083eeecc7a2cd6c1271a85087e73820ae2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2025 19:33:25 -0500 Subject: [PATCH 19/42] tracing: Exit out immediately after update_marker_trace() The call to update_marker_trace() in set_tracer_flag() performs the update to the tr->trace_flags. There's no reason to perform it again after it is called. Return immediately instead. 
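The resulting flow in set_tracer_flag() then looks roughly like this
(simplified excerpt, not the complete function):

	if (mask == TRACE_ITER(COPY_MARKER)) {
		update_marker_trace(tr, enabled);
		/* update_marker_trace() already updated tr->trace_flags */
		return 0;
	}

	/* only reached by flags that still need the generic update */
	if (enabled)
		tr->trace_flags |= mask;
	else
		tr->trace_flags &= ~mask;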
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Link: https://patch.msgid.link/20251106003501.726406870@kernel.org
Reviewed-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ed929d331e1d..88234b541b09 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5254,8 +5254,11 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
 		}
 	}
 
-	if (mask == TRACE_ITER(COPY_MARKER))
+	if (mask == TRACE_ITER(COPY_MARKER)) {
 		update_marker_trace(tr, enabled);
+		/* update_marker_trace updates the tr->trace_flags */
+		return 0;
+	}
 
 	if (enabled)
 		tr->trace_flags |= mask;

From 3a0d5bc76ff482c6e0c20f66f2b32e5dcf8238fe Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 5 Nov 2025 19:33:26 -0500
Subject: [PATCH 20/42] tracing: Use switch statement instead of ifs in
 set_tracer_flag()

The "mask" passed in to set_tracer_flag() has a single bit set. The
function then checks if the mask is equal to one of the option masks and
performs the appropriate function associated to that option.

Instead of having a bunch of "if ()" statements, use a "switch ()"
statement to make the code cleaner and slightly more efficient.

No functional changes.

Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Link: https://patch.msgid.link/20251106003501.890298562@kernel.org
Reviewed-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 88234b541b09..0aea9cb84276 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5220,11 +5220,13 @@ int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set)
 
 int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
 {
-	if ((mask == TRACE_ITER(RECORD_TGID)) ||
-	    (mask == TRACE_ITER(RECORD_CMD)) ||
-	    (mask == TRACE_ITER(TRACE_PRINTK)) ||
-	    (mask == TRACE_ITER(COPY_MARKER)))
+	switch (mask) {
+	case TRACE_ITER(RECORD_TGID):
+	case TRACE_ITER(RECORD_CMD):
+	case TRACE_ITER(TRACE_PRINTK):
+	case TRACE_ITER(COPY_MARKER):
 		lockdep_assert_held(&event_mutex);
+	}
 
 	/* do nothing if flag is already set */
 	if (!!(tr->trace_flags & mask) == !!enabled)
@@ -5235,7 +5237,8 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
 	if (tr->current_trace->flag_changed(tr, mask, !!enabled))
 		return -EINVAL;
 
-	if (mask == TRACE_ITER(TRACE_PRINTK)) {
+	switch (mask) {
+	case TRACE_ITER(TRACE_PRINTK):
 		if (enabled) {
 			update_printk_trace(tr);
 		} else {
@@ -5252,9 +5255,9 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
 			if (printk_trace == tr)
 				update_printk_trace(&global_trace);
 		}
-	}
+		break;
 
-	if (mask == TRACE_ITER(COPY_MARKER)) {
+	case TRACE_ITER(COPY_MARKER):
 		update_marker_trace(tr, enabled);
 		/* update_marker_trace updates the tr->trace_flags */
 		return 0;
@@ -5265,10 +5268,12 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
 	else
 		tr->trace_flags &= ~mask;
 
-	if (mask == TRACE_ITER(RECORD_CMD))
+	switch (mask) {
+	case TRACE_ITER(RECORD_CMD):
 		trace_event_enable_cmd_record(enabled);
+		break;
 
-	if (mask == TRACE_ITER(RECORD_TGID)) {
+	case TRACE_ITER(RECORD_TGID):
 		if (trace_alloc_tgid_map() < 0) {
 			tr->trace_flags &= ~TRACE_ITER(RECORD_TGID);
@@ -5276,24 +5281,27 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
 			return -ENOMEM;
 		}
 
 		trace_event_enable_tgid_record(enabled);
-	}
+		break;
 
-	if (mask == TRACE_ITER(EVENT_FORK))
+	case TRACE_ITER(EVENT_FORK):
 		trace_event_follow_fork(tr, enabled);
+		break;
 
-	if (mask == TRACE_ITER(FUNC_FORK))
+	case TRACE_ITER(FUNC_FORK):
 		ftrace_pid_follow_fork(tr, enabled);
+		break;
 
-	if (mask == TRACE_ITER(OVERWRITE)) {
+	case TRACE_ITER(OVERWRITE):
 		ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
 		ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
 #endif
-	}
+		break;
 
-	if (mask == TRACE_ITER(PRINTK)) {
+	case TRACE_ITER(PRINTK):
 		trace_printk_start_stop_comm(enabled);
 		trace_printk_control(enabled);
+		break;
 	}
 
 	return 0;

From 7157062bb46c8bcfba3a7d77f91fc4795bff1316 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 11 Nov 2025 00:48:20 +0900
Subject: [PATCH 21/42] tracing: Report wrong dynamic event command

Report a wrong dynamic event type in the command via error_log.

-----
 # echo "z hoge" > /sys/kernel/tracing/dynamic_events
 sh: write error: Invalid argument
 # cat /sys/kernel/tracing/error_log
 [   22.977022] dynevent: error: No matching dynamic event type
   Command: z hoge
            ^
-----

Cc: Mathieu Desnoyers
Link: https://patch.msgid.link/176278970056.343441.10528135217342926645.stgit@devnote2
Signed-off-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_dynevent.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index d06854bd32b3..c4dfbc293bae 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -144,9 +144,16 @@ static int create_dyn_event(const char *raw_command)
 		if (!ret || ret != -ECANCELED)
 			break;
 	}
-	mutex_unlock(&dyn_event_ops_mutex);
-	if (ret == -ECANCELED)
+	if (ret == -ECANCELED) {
+		static const char *err_msg[] = {"No matching dynamic event type"};
+
+		/* Wrong dynamic event. Leave an error message. */
+		tracing_log_err(NULL, "dynevent", raw_command, err_msg,
+				0, 0);
 		ret = -EINVAL;
+	}
+
+	mutex_unlock(&dyn_event_ops_mutex);
 
 	return ret;
 }

From 428add559b6923f13acc591913cda3467be98dfd Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 11 Nov 2025 18:24:07 -0500
Subject: [PATCH 22/42] tracing: Have tracer option be instance specific

Tracers can specify options that modify their behavior. This logic was
added before instances were created, when the tracer flags were global
variables. After instances were created, where a tracer may exist in
more than one instance, the flags were not updated from being global to
being instance specific. This causes confusion with these options.

For example, the function tracer has an option to enable function
arguments:

  # cd /sys/kernel/tracing
  # mkdir instances/foo
  # echo function > instances/foo/current_tracer
  # echo 1 > options/func-args
  # echo function > current_tracer
  # cat trace
[..]
   <idle>-0  [005] d..3.  1050.656187: rcu_needs_cpu() <-tick_nohz_next_event
   <idle>-0  [005] d..3.  1050.656188: get_next_timer_interrupt(basej=0x10002dbad, basem=0xf45fd7d300) <-tick_nohz_next_event
   <idle>-0  [005] d..3.  1050.656189: _raw_spin_lock(lock=0xffff8944bdf5de80) <-__get_next_timer_interrupt
   <idle>-0  [005] d..4.  1050.656190: do_raw_spin_lock(lock=0xffff8944bdf5de80) <-__get_next_timer_interrupt
   <idle>-0  [005] d..4.  1050.656191: _raw_spin_lock_nested(lock=0xffff8944bdf5f140, subclass=1) <-__get_next_timer_interrupt

  # cat instances/foo/options/func-args
 1
  # cat instances/foo/trace
[..]
 kworker/4:1-88  [004] ...1.  298.127735: next_zone <-refresh_cpu_vm_stats
 kworker/4:1-88  [004] ...1.  298.127736: first_online_pgdat <-refresh_cpu_vm_stats
 kworker/4:1-88  [004] ...1.  298.127738: next_online_pgdat <-refresh_cpu_vm_stats
 kworker/4:1-88  [004] ...1.  298.127739: fold_diff <-refresh_cpu_vm_stats
 kworker/4:1-88  [004] ...1.  298.127741: round_jiffies_relative <-vmstat_update
[..]

The above shows that setting "func-args" in the top level instance also
set it in the instance "foo", but since the interface for the trace
flags is per instance, the update did not take effect in the "foo"
instance.

Update the infrastructure to allow tracers to add a "default_flags"
field in the tracer structure that can be set instead of "flags", which
will make the flags per instance. If a tracer needs to keep the flags
global (like blktrace), keeping the "flags" field set will keep the old
behavior.

This does not update the function or the function graph tracers. That
will be handled later.

Cc: Masami Hiramatsu
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Link: https://patch.msgid.link/20251111232429.305317942@kernel.org
Fixes: f20a580627f43 ("ftrace: Allow instances to use function tracing")
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.c | 257 ++++++++++++++++++++++++++++++-------------
 kernel/trace/trace.h |   3 +
 2 files changed, 186 insertions(+), 74 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0aea9cb84276..9268489d2ce8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -94,6 +94,13 @@ static bool tracepoint_printk_stop_on_boot __initdata;
 static bool traceoff_after_boot __initdata;
 static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
 
+/* Store tracers and their flags per instance */
+struct tracers {
+	struct list_head	list;
+	struct tracer		*tracer;
+	struct tracer_flags	*flags;
+};
+
 /*
  * To prevent the comm cache from being overwritten when no
  * tracing is active, only save the comm when a trace event
@@ -2164,6 +2171,7 @@ static int save_selftest(struct tracer *type)
 static int run_tracer_selftest(struct tracer *type)
 {
 	struct trace_array *tr = &global_trace;
+	struct tracer_flags *saved_flags = tr->current_trace_flags;
 	struct tracer *saved_tracer = tr->current_trace;
 	int ret;
 
@@ -2194,6 +2202,7 @@ static int run_tracer_selftest(struct tracer *type)
 	tracing_reset_online_cpus(&tr->array_buffer);
 
 	tr->current_trace = type;
+	tr->current_trace_flags = type->flags ? : type->default_flags;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 	if (type->use_max_tr) {
@@ -2210,6 +2219,7 @@ static int run_tracer_selftest(struct tracer *type)
 	ret = type->selftest(type, tr);
 	/* the test is responsible for resetting too */
 	tr->current_trace = saved_tracer;
+	tr->current_trace_flags = saved_flags;
 	if (ret) {
 		printk(KERN_CONT "FAILED!\n");
 		/* Add the warning after printing 'FAILED' */
@@ -2302,10 +2312,23 @@ static inline int do_run_tracer_selftest(struct tracer *type)
 }
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
-static int add_tracer_options(struct trace_array *tr, struct tracer *t);
+static int add_tracer(struct trace_array *tr, struct tracer *t);
 
 static void __init apply_trace_boot_options(void);
 
+static void free_tracers(struct trace_array *tr)
+{
+	struct tracers *t, *n;
+
+	lockdep_assert_held(&trace_types_lock);
+
+	list_for_each_entry_safe(t, n, &tr->tracers, list) {
+		list_del(&t->list);
+		kfree(t->flags);
+		kfree(t);
+	}
+}
+
 /**
 * register_tracer - register a tracer with the ftrace system.
* @type: the plugin for the tracer @@ -2314,6 +2337,7 @@ static void __init apply_trace_boot_options(void); */ int __init register_tracer(struct tracer *type) { + struct trace_array *tr; struct tracer *t; int ret = 0; @@ -2353,10 +2377,13 @@ int __init register_tracer(struct tracer *type) if (ret < 0) goto out; - ret = add_tracer_options(&global_trace, type); - if (ret < 0) { - pr_warn("Failed to create tracer options for %s\n", type->name); - goto out; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + ret = add_tracer(tr, type); + if (ret < 0) { + /* The tracer will still exist but without options */ + pr_warn("Failed to create tracer options for %s\n", type->name); + break; + } } type->next = trace_types; @@ -2373,7 +2400,7 @@ int __init register_tracer(struct tracer *type) printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(&global_trace, type->name); + WARN_ON(tracing_set_tracer(&global_trace, type->name) < 0); default_bootup_tracer = NULL; apply_trace_boot_options(); @@ -5139,6 +5166,7 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; + struct tracer_flags *flags; struct tracer *trace; u32 tracer_flags; int i; @@ -5152,12 +5180,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) seq_printf(m, "no%s\n", trace_options[i]); } - trace = tr->current_trace; - if (!trace->flags || !trace->flags->opts) + flags = tr->current_trace_flags; + if (!flags || !flags->opts) return 0; - tracer_flags = tr->current_trace->flags->val; - trace_opts = tr->current_trace->flags->opts; + trace = tr->current_trace; + + tracer_flags = flags->val; + trace_opts = flags->opts; for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) @@ -5191,8 +5221,7 @@ static int __set_tracer_option(struct trace_array *tr, /* Try to assign a tracer specific option */ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { - struct tracer *trace = tr->current_trace; - struct tracer_flags *tracer_flags = trace->flags; + struct tracer_flags *tracer_flags = tr->current_trace_flags; struct tracer_opt *opts = NULL; int i; @@ -5203,7 +5232,7 @@ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(tr, trace->flags, opts, neg); + return __set_tracer_option(tr, tracer_flags, opts, neg); } return -EINVAL; @@ -6235,11 +6264,6 @@ int tracing_update_buffers(struct trace_array *tr) return ret; } -struct trace_option_dentry; - -static int -create_trace_option_files(struct trace_array *tr, struct tracer *tracer); - /* * Used to clear out the tracer before deletion of an instance. * Must have trace_types_lock held. @@ -6255,26 +6279,15 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace->reset(tr); tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; } static bool tracer_options_updated; -static int add_tracer_options(struct trace_array *tr, struct tracer *t) -{ - /* Only enable if the directory has been created already. 
*/ - if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) - return 0; - - /* Only create trace option files after update_tracer_options finish */ - if (!tracer_options_updated) - return 0; - - return create_trace_option_files(tr, t); -} - int tracing_set_tracer(struct trace_array *tr, const char *buf) { - struct tracer *t; + struct tracer *trace = NULL; + struct tracers *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif @@ -6292,18 +6305,20 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) ret = 0; } - for (t = trace_types; t; t = t->next) { - if (strcmp(t->name, buf) == 0) + list_for_each_entry(t, &tr->tracers, list) { + if (strcmp(t->tracer->name, buf) == 0) { + trace = t->tracer; break; + } } - if (!t) + if (!trace) return -EINVAL; - if (t == tr->current_trace) + if (trace == tr->current_trace) return 0; #ifdef CONFIG_TRACER_SNAPSHOT - if (t->use_max_tr) { + if (trace->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); ret = tr->cond_snapshot ? -EBUSY : 0; @@ -6314,14 +6329,14 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) } #endif /* Some tracers won't work on kernel command line */ - if (system_state < SYSTEM_RUNNING && t->noboot) { + if (system_state < SYSTEM_RUNNING && trace->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", - t->name); + trace->name); return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ - if (!trace_ok_for_array(t, tr)) + if (!trace_ok_for_array(trace, tr)) return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ @@ -6340,8 +6355,9 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; - if (had_max_tr && !t->use_max_tr) { + if (had_max_tr && !trace->use_max_tr) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from @@ -6354,7 +6370,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tracing_disarm_snapshot(tr); } - if (!had_max_tr && t->use_max_tr) { + if (!had_max_tr && trace->use_max_tr) { ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; @@ -6363,18 +6379,21 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tr->current_trace = &nop_trace; #endif - if (t->init) { - ret = tracer_init(t, tr); + tr->current_trace_flags = t->flags ? : t->tracer->flags; + + if (trace->init) { + ret = tracer_init(trace, tr); if (ret) { #ifdef CONFIG_TRACER_MAX_TRACE - if (t->use_max_tr) + if (trace->use_max_tr) tracing_disarm_snapshot(tr); #endif + tr->current_trace_flags = nop_trace.flags; return ret; } } - tr->current_trace = t; + tr->current_trace = trace; tr->current_trace->enabled++; trace_branch_enable(tr); @@ -9598,40 +9617,20 @@ create_trace_option_file(struct trace_array *tr, topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, t_options, topt, &trace_options_fops); - } static int -create_trace_option_files(struct trace_array *tr, struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer, + struct tracer_flags *flags) { struct trace_option_dentry *topts; struct trace_options *tr_topts; - struct tracer_flags *flags; struct tracer_opt *opts; int cnt; - int i; - - if (!tracer) - return 0; - - flags = tracer->flags; if (!flags || !flags->opts) return 0; - /* - * If this is an instance, only create flags for tracers - * the instance may have. 
- */ - if (!trace_ok_for_array(tracer, tr)) - return 0; - - for (i = 0; i < tr->nr_topts; i++) { - /* Make sure there's no duplicate flags. */ - if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) - return -EINVAL; - } - opts = flags->opts; for (cnt = 0; opts[cnt].name; cnt++) @@ -9663,6 +9662,96 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) return 0; } +static int get_global_flags_val(struct tracer *tracer) +{ + struct tracers *t; + + list_for_each_entry(t, &global_trace.tracers, list) { + if (t->tracer != tracer) + continue; + if (!t->flags) + return -1; + return t->flags->val; + } + return -1; +} + +static int add_tracer_options(struct trace_array *tr, struct tracers *t) +{ + struct tracer *tracer = t->tracer; + struct tracer_flags *flags = t->flags ?: tracer->flags; + + if (!flags) + return 0; + + /* Only add tracer options after update_tracer_options finish */ + if (!tracer_options_updated) + return 0; + + return create_trace_option_files(tr, tracer, flags); +} + +static int add_tracer(struct trace_array *tr, struct tracer *tracer) +{ + struct tracer_flags *flags; + struct tracers *t; + int ret; + + /* Only enable if the directory has been created already. */ + if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return 0; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. + */ + if (!trace_ok_for_array(tracer, tr)) + return 0; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOMEM; + + t->tracer = tracer; + t->flags = NULL; + list_add(&t->list, &tr->tracers); + + flags = tracer->flags; + if (!flags) { + if (!tracer->default_flags) + return 0; + + /* + * If the tracer defines default flags, it means the flags are + * per trace instance. + */ + flags = kmalloc(sizeof(*flags), GFP_KERNEL); + if (!flags) + return -ENOMEM; + + *flags = *tracer->default_flags; + flags->trace = tracer; + + t->flags = flags; + + /* If this is an instance, inherit the global_trace flags */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { + int val = get_global_flags_val(tracer); + if (!WARN_ON_ONCE(val < 0)) + flags->val = val; + } + } + + ret = add_tracer_options(tr, t); + if (ret < 0) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } + + return ret; +} + static struct dentry * create_trace_option_core_file(struct trace_array *tr, const char *option, long index) @@ -10111,13 +10200,27 @@ static void init_trace_flags_index(struct trace_array *tr) tr->trace_flags_index[i] = i; } -static int __update_tracer_options(struct trace_array *tr) +static int __update_tracer(struct trace_array *tr) { struct tracer *t; int ret = 0; for (t = trace_types; t && !ret; t = t->next) + ret = add_tracer(tr, t); + + return ret; +} + +static __init int __update_tracer_options(struct trace_array *tr) +{ + struct tracers *t; + int ret = 0; + + list_for_each_entry(t, &tr->tracers, list) { ret = add_tracer_options(tr, t); + if (ret < 0) + break; + } return ret; } @@ -10171,7 +10274,7 @@ static int trace_array_create_dir(struct trace_array *tr) } init_tracer_tracefs(tr, tr->dir); - ret = __update_tracer_options(tr); + ret = __update_tracer(tr); if (ret) { event_trace_del_tracer(tr); tracefs_remove(tr->dir); @@ -10226,11 +10329,13 @@ trace_array_create_systems(const char *name, const char *systems, spin_lock_init(&tr->snapshot_trigger_lock); #endif tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); 
INIT_LIST_HEAD(&tr->err_log); + INIT_LIST_HEAD(&tr->tracers); INIT_LIST_HEAD(&tr->marker_list); #ifdef CONFIG_MODULES @@ -10403,6 +10508,7 @@ static int __remove_instance(struct trace_array *tr) free_percpu(tr->last_func_repeats); free_trace_buffers(tr); clear_tracing_err_log(tr); + free_tracers(tr); if (tr->range_name) { reserve_mem_release_by_name(tr->range_name); @@ -11437,6 +11543,7 @@ __init static int tracer_alloc_buffers(void) * just a bootstrap of current_trace anyway. */ global_trace.current_trace = &nop_trace; + global_trace.current_trace_flags = nop_trace.flags; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE @@ -11450,10 +11557,7 @@ __init static int tracer_alloc_buffers(void) init_trace_flags_index(&global_trace); - register_tracer(&nop_trace); - - /* Function tracing may start here (via kernel command line) */ - init_function_trace(); + INIT_LIST_HEAD(&global_trace.tracers); /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -11474,6 +11578,11 @@ __init static int tracer_alloc_buffers(void) list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); + register_tracer(&nop_trace); + + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + apply_trace_boot_options(); register_snapshot_cmd(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8ecaf91ca823..299862aad66c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -392,6 +392,7 @@ struct trace_array { int buffer_percent; unsigned int n_err_log_entries; struct tracer *current_trace; + struct tracer_flags *current_trace_flags; u64 trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; @@ -406,6 +407,7 @@ struct trace_array { struct list_head systems; struct list_head events; struct list_head marker_list; + struct list_head tracers; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ /* one per_cpu trace_pipe can be opened by only one user */ @@ -637,6 +639,7 @@ struct tracer { u64 mask, int set); struct tracer *next; struct tracer_flags *flags; + struct tracer_flags *default_flags; int enabled; bool print_max; bool allow_instances; From 76680d0d2825900f23bf35290ab2b80bdf3a8e4a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2025 18:24:08 -0500 Subject: [PATCH 23/42] tracing: Have function tracer define options per instance Currently the function tracer's options are saved via a global mask when they should be per instance. Use the new infrastructure to define a "default_flags" field in the tracer structure that is used for the top level instance as well as new ones. Currently the global mask causes confusion: # cd /sys/kernel/tracing # mkdir instances/foo # echo function > instances/foo/current_tracer # echo 1 > options/func-args # echo function > current_tracer # cat trace [..] -0 [005] d..3. 1050.656187: rcu_needs_cpu() <-tick_nohz_next_event -0 [005] d..3. 1050.656188: get_next_timer_interrupt(basej=0x10002dbad, basem=0xf45fd7d300) <-tick_nohz_next_event -0 [005] d..3. 1050.656189: _raw_spin_lock(lock=0xffff8944bdf5de80) <-__get_next_timer_interrupt -0 [005] d..4. 1050.656190: do_raw_spin_lock(lock=0xffff8944bdf5de80) <-__get_next_timer_interrupt -0 [005] d..4. 1050.656191: _raw_spin_lock_nested(lock=0xffff8944bdf5f140, subclass=1) <-__get_next_timer_interrupt # cat instances/foo/options/func-args 1 # cat instances/foo/trace [..] kworker/4:1-88 [004] ...1. 
298.127735: next_zone <-refresh_cpu_vm_stats kworker/4:1-88 [004] ...1. 298.127736: first_online_pgdat <-refresh_cpu_vm_stats kworker/4:1-88 [004] ...1. 298.127738: next_online_pgdat <-refresh_cpu_vm_stats kworker/4:1-88 [004] ...1. 298.127739: fold_diff <-refresh_cpu_vm_stats kworker/4:1-88 [004] ...1. 298.127741: round_jiffies_relative <-vmstat_update [..] The above shows that updating the "func-args" option in the top level instance also makes it appear set in the "foo" instance. But because an option update is only applied by the instance that was changed (as it should be), it is confusing to see the option already set in the other instance where it has taken no effect. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251111232429.470883736@kernel.org Fixes: f20a580627f43 ("ftrace: Allow instances to use function tracing") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d17c18934445..c12795c2fb39 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,11 +154,11 @@ static int function_trace_init(struct trace_array *tr) if (!tr->ops) return -ENOMEM; - func = select_trace_function(func_flags.val); + func = select_trace_function(tr->current_trace_flags->val); if (!func) return -EINVAL; - if (!handle_func_repeats(tr, func_flags.val)) + if (!handle_func_repeats(tr, tr->current_trace_flags->val)) return -ENOMEM; ftrace_init_array_ops(tr, func); @@ -459,14 +459,14 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) u32 new_flags; /* Do nothing if already set. */ - if (!!set == !!(func_flags.val & bit)) + if (!!set == !!(tr->current_trace_flags->val & bit)) return 0; /* We can change this flag only when not running. */ if (tr->current_trace != &function_trace) return 0; - new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0); func = select_trace_function(new_flags); if (!func) return -EINVAL; @@ -491,7 +491,7 @@ static struct tracer function_trace __tracer_data = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .flags = &func_flags, + .default_flags = &func_flags, .set_flag = func_set_flag, .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST From e29aa918a928408c3e64b0d1025e4eb9f6fc549e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2025 18:24:09 -0500 Subject: [PATCH 24/42] tracing: Have function graph tracer define options per instance Currently the function graph tracer's options are saved via a global mask when they should be per instance. Use the new infrastructure to define a "default_flags" field in the tracer structure that is used for the top level instance as well as new ones. Currently the global mask causes confusion: # cd /sys/kernel/tracing # mkdir instances/foo # echo function_graph > instances/foo/current_tracer # echo 1 > options/funcgraph-args # echo function_graph > current_tracer # cat trace [..] 2) | _raw_spin_lock_irq(lock=0xffff96b97dea16c0) { 2) 0.422 us | do_raw_spin_lock(lock=0xffff96b97dea16c0); 7) | rcu_sched_clock_irq(user=0) { 2) 1.478 us | } 7) 0.758 us | rcu_is_cpu_rrupt_from_idle(); 2) 0.647 us | enqueue_hrtimer(timer=0xffff96b97dea2058, base=0xffff96b97dea1740, mode=0); # cat instances/foo/options/funcgraph-args 1 # cat instances/foo/trace [..] 
4) | __x64_sys_read() { 4) | ksys_read() { 4) 0.755 us | fdget_pos(); 4) | vfs_read() { 4) | rw_verify_area() { 4) | security_file_permission() { 4) | apparmor_file_permission() { 4) | common_file_perm() { 4) | aa_file_perm() { 4) | rcu_read_lock_held() { [..] The above shows that updating the "funcgraph-args" option in the top level instance also makes it appear set in the "foo" instance. But because an option update is only applied by the instance that was changed (as it should be), it is confusing to see the option already set in the other instance where it has taken no effect. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251111232429.641030027@kernel.org Fixes: c132be2c4fcc1 ("function_graph: Have the instances use their own ftrace_ops for filtering") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index fe9607edc8f9..4e86adf6dd4d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -101,9 +101,9 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; -static bool tracer_flags_is_set(u32 flags) +static bool tracer_flags_is_set(struct trace_array *tr, u32 flags) { - return (tracer_flags.val & flags) == flags; + return (tr->current_trace_flags->val & flags) == flags; } /* @@ -263,7 +263,7 @@ static int graph_entry(struct ftrace_graph_ent *trace, trace_ctx = tracing_gen_ctx(); if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { + tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_RETADDR)) { unsigned long retaddr = ftrace_graph_top_ret_addr(current); ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); } else { @@ -441,7 +441,7 @@ static int graph_trace_init(struct trace_array *tr) { int ret; - if (tracer_flags_is_set(TRACE_GRAPH_ARGS)) + if (tracer_flags_is_set(tr, TRACE_GRAPH_ARGS)) tr->gops->entryfunc = trace_graph_entry_args; else tr->gops->entryfunc = trace_graph_entry; @@ -1459,7 +1459,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + struct trace_array *tr = iter->tr; + return print_graph_function_flags(iter, tr->current_trace_flags->val); } static enum print_line_t @@ -1535,7 +1536,10 @@ static void __print_graph_headers_flags(struct trace_array *tr, static void print_graph_headers(struct seq_file *s) { - print_graph_headers_flags(s, tracer_flags.val); + struct trace_iterator *iter = s->private; + struct trace_array *tr = iter->tr; + + print_graph_headers_flags(s, tr->current_trace_flags->val); } void print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1660,7 +1664,7 @@ static struct tracer graph_trace __tracer_data = { .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, - .flags = &tracer_flags, + .default_flags = &tracer_flags, .set_flag = func_graph_set_flag, .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST From 97e047f44d347a4f4033c0b62f77a15a9deaf750 Mon Sep 17 00:00:00 2001 From: Yongliang Gao Date: Thu, 13 Nov 2025 08:02:52 +0800 Subject: [PATCH 25/42] trace/pid_list: optimize pid_list->lock contention When the system has many cores and task switching is frequent, setting set_ftrace_pid can cause frequent 
pid_list->lock contention and high sys CPU usage. For example, in a 288-core VM environment, we observed 267 CPUs experiencing contention on pid_list->lock, with stack traces showing: #4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e #5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36 #6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554 #7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288 #8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe #9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161 Pair the existing spinlock with a seqcount so that readers can run locklessly and concurrently, while writers keep the spinlock for exclusivity. Link: https://patch.msgid.link/20251113000252.1058144-1-leonylgao@gmail.com Reviewed-by: Huang Cun Signed-off-by: Yongliang Gao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/pid_list.c | 30 +++++++++++++++++++++--------- kernel/trace/pid_list.h | 1 + 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 090bb5ea4a19..dbee72d69d0a 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -3,6 +3,7 @@ * Copyright (C) 2021 VMware Inc, Steven Rostedt */ #include <linux/spinlock.h> +#include <linux/seqlock.h> #include <linux/irq_work.h> #include <linux/slab.h> #include "trace.h" @@ -126,7 +127,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) { union upper_chunk *upper_chunk; union lower_chunk *lower_chunk; - unsigned long flags; + unsigned int seq; unsigned int upper1; unsigned int upper2; unsigned int lower; @@ -138,14 +139,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) if (pid_split(pid, &upper1, &upper2, &lower) < 0) return false; - raw_spin_lock_irqsave(&pid_list->lock, flags); - upper_chunk = pid_list->upper[upper1]; - if (upper_chunk) { - lower_chunk = upper_chunk->data[upper2]; - if (lower_chunk) - ret = test_bit(lower, lower_chunk->data); - } - raw_spin_unlock_irqrestore(&pid_list->lock, flags); + do { + seq = read_seqcount_begin(&pid_list->seqcount); + ret = false; + upper_chunk = pid_list->upper[upper1]; + if (upper_chunk) { + lower_chunk = upper_chunk->data[upper2]; + if (lower_chunk) + ret = test_bit(lower, lower_chunk->data); + } + } while (read_seqcount_retry(&pid_list->seqcount, seq)); return ret; } @@ -178,6 +181,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) return -EINVAL; raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk = pid_list->upper[upper1]; if (!upper_chunk) { upper_chunk = get_upper_chunk(pid_list); @@ -199,6 +203,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) set_bit(lower, lower_chunk->data); ret = 0; out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return ret; } @@ -230,6 +235,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) return -EINVAL; raw_spin_lock_irqsave(&pid_list->lock, flags); + write_seqcount_begin(&pid_list->seqcount); upper_chunk = pid_list->upper[upper1]; if (!upper_chunk) goto out; @@ -250,6 +256,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) } } out: + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock_irqrestore(&pid_list->lock, flags); return 0; } @@ -340,8 +347,10 @@ static void pid_list_refill_irq(struct irq_work *iwork) again: raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); upper_count = CHUNK_ALLOC - 
pid_list->free_upper_chunks; lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks; + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); if (upper_count <= 0 && lower_count <= 0) @@ -370,6 +379,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) } raw_spin_lock(&pid_list->lock); + write_seqcount_begin(&pid_list->seqcount); if (upper) { *upper_next = pid_list->upper_list; pid_list->upper_list = upper; @@ -380,6 +390,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) pid_list->lower_list = lower; pid_list->free_lower_chunks += lcnt; } + write_seqcount_end(&pid_list->seqcount); raw_spin_unlock(&pid_list->lock); /* @@ -419,6 +430,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); raw_spin_lock_init(&pid_list->lock); + seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock); for (i = 0; i < CHUNK_ALLOC; i++) { union upper_chunk *chunk; diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h index 62e73f1ac85f..0b45fb0eadb9 100644 --- a/kernel/trace/pid_list.h +++ b/kernel/trace/pid_list.h @@ -76,6 +76,7 @@ union upper_chunk { }; struct trace_pid_list { + seqcount_raw_spinlock_t seqcount; raw_spinlock_t lock; struct irq_work refill_irqwork; union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size From 6479325eca0148d417a82f0edcb37b58c4c0cf0a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2025 14:22:30 -0500 Subject: [PATCH 26/42] tracing: Have function graph tracer option funcgraph-irqs be per instance Currently the option to trace interrupts in the function graph tracer is global when the interface is per-instance. Changing the value in one instance will affect the results of another instance that is also running the function graph tracer. This can lead to confusing results. 
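The fix below implements the per-instance behavior with a global fast-path counter: each instance that wants irq functions skipped bumps the counter when its tracer starts and drops it when the tracer is reset, and the hot path only consults the per-instance flag when the counter is non-zero. A standalone sketch of that pattern (illustrative names and types, compilable on its own, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

#define PRINT_IRQS 0x1

/* How many running instances want irq functions skipped */
static int skip_irqs_count;

struct instance { unsigned int flags; };

static void instance_start(struct instance *inst)
{
	if (!(inst->flags & PRINT_IRQS))
		skip_irqs_count++;
}

static void instance_stop(struct instance *inst)
{
	if (!(inst->flags & PRINT_IRQS))
		skip_irqs_count--;
}

/* Hot path: cheap global test first, per-instance test second */
static bool ignore_irqs(struct instance *inst, bool in_hardirq)
{
	if (!skip_irqs_count)
		return false;
	if (inst->flags & PRINT_IRQS)
		return false;
	return in_hardirq;
}

int main(void)
{
	struct instance top = { .flags = PRINT_IRQS };
	struct instance foo = { .flags = 0 };

	instance_start(&top);
	instance_start(&foo);
	/* Only "foo" asked to skip irq functions */
	printf("top ignores irqs: %d\n", ignore_irqs(&top, true));
	printf("foo ignores irqs: %d\n", ignore_irqs(&foo, true));
	instance_stop(&foo);
	instance_stop(&top);
	return 0;
}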
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251114192318.613867934@kernel.org Fixes: c132be2c4fcc1 ("function_graph: Have the instances use their own ftrace_ops for filtering") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 41 +++++++++++++++++++++------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4e86adf6dd4d..3f55b49cf64e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -16,7 +16,7 @@ #include "trace.h" #include "trace_output.h" -/* When set, irq functions will be ignored */ +/* When set, irq functions might be ignored */ static int ftrace_graph_skip_irqs; struct fgraph_cpu_data { @@ -190,11 +190,14 @@ int __trace_graph_retaddr_entry(struct trace_array *tr, } #endif -static inline int ftrace_graph_ignore_irqs(void) +static inline int ftrace_graph_ignore_irqs(struct trace_array *tr) { if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; + if (tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + return 0; + return in_hardirq(); } @@ -238,7 +241,7 @@ static int graph_entry(struct ftrace_graph_ent *trace, if (ftrace_graph_ignore_func(gops, trace)) return 0; - if (ftrace_graph_ignore_irqs()) + if (ftrace_graph_ignore_irqs(tr)) return 0; if (fgraph_sleep_time) { @@ -451,6 +454,9 @@ static int graph_trace_init(struct trace_array *tr) else tr->gops->retfunc = trace_graph_return; + if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + ftrace_graph_skip_irqs++; + /* Make gops functions visible before we start tracing */ smp_mb(); @@ -468,10 +474,6 @@ static int ftrace_graph_trace_args(struct trace_array *tr, int set) { trace_func_graph_ent_t entry; - /* Do nothing if the current tracer is not this tracer */ - if (tr->current_trace != &graph_trace) - return 0; - if (set) entry = trace_graph_entry_args; else @@ -492,6 +494,11 @@ static int ftrace_graph_trace_args(struct trace_array *tr, int set) static void graph_trace_reset(struct trace_array *tr) { + if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) + ftrace_graph_skip_irqs--; + if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) + ftrace_graph_skip_irqs = 0; + tracing_stop_cmdline_record(); unregister_ftrace_graph(tr->gops); } @@ -1617,15 +1624,29 @@ void graph_trace_close(struct trace_iterator *iter) static int func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - if (bit == TRACE_GRAPH_PRINT_IRQS) - ftrace_graph_skip_irqs = !set; - if (bit == TRACE_GRAPH_SLEEP_TIME) ftrace_graph_sleep_time_control(set); if (bit == TRACE_GRAPH_GRAPH_TIME) ftrace_graph_graph_time_control(set); + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; + + /* Do nothing if already set. 
*/ + if (!!set == !!(tr->current_trace_flags->val & bit)) + return 0; + + if (bit == TRACE_GRAPH_PRINT_IRQS) { + if (set) + ftrace_graph_skip_irqs--; + else + ftrace_graph_skip_irqs++; + if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) + ftrace_graph_skip_irqs = 0; + } + if (bit == TRACE_GRAPH_ARGS) return ftrace_graph_trace_args(tr, set); From 4132886e1b74d031a1de8f0e9bac44056cf57304 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2025 14:22:31 -0500 Subject: [PATCH 27/42] tracing: Move graph-time out of function graph options The option "graph-time" affects the function profiler when it is using the function graph infrastructure. It has nothing to do with the function graph tracer itself; it only affects the global function profiler. Move it out of the function graph tracer options and make it a global option that is only available at the top level instance. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251114192318.781711154@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 14 ++++++++++---- kernel/trace/trace.h | 13 ++++++++++++- kernel/trace/trace_functions_graph.c | 10 +--------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9268489d2ce8..8ae95800592d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -509,10 +509,10 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ - (FUNCTION_DEFAULT_FLAGS | \ - TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \ + (FUNCTION_DEFAULT_FLAGS | FPROFILE_DEFAULT_FLAGS | \ + TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \ TRACE_ITER(ANNOTATE) | TRACE_ITER(CONTEXT_INFO) | \ - TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \ + TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \ TRACE_ITER(IRQ_INFO) | TRACE_ITER(MARKERS) | \ TRACE_ITER(HASH_PTR) | TRACE_ITER(TRACE_PRINTK) | \ TRACE_ITER(COPY_MARKER)) @@ -520,7 +520,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \ TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) | \ - TRACE_ITER(PROF_TEXT_OFFSET)) + TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ @@ -5331,6 +5331,12 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled) trace_printk_start_stop_comm(enabled); trace_printk_control(enabled); break; + +#if defined(CONFIG_FUNCTION_PROFILER) && defined(CONFIG_FUNCTION_GRAPH_TRACER) + case TRACE_GRAPH_GRAPH_TIME: + ftrace_graph_graph_time_control(enabled); + break; +#endif } return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 299862aad66c..41b416a22450 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1368,8 +1368,18 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, #ifdef CONFIG_FUNCTION_PROFILER # define PROFILER_FLAGS \ C(PROF_TEXT_OFFSET, "prof-text-offset"), +# ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define FPROFILE_FLAGS \ + C(GRAPH_TIME, "graph-time"), +# define FPROFILE_DEFAULT_FLAGS TRACE_ITER(GRAPH_TIME) +# else +# define FPROFILE_FLAGS +# define FPROFILE_DEFAULT_FLAGS 0UL +# endif #else # define PROFILER_FLAGS +# define FPROFILE_FLAGS +# define FPROFILE_DEFAULT_FLAGS 0UL # define 
TRACE_ITER_PROF_TEXT_OFFSET_BIT -1 #endif @@ -1412,7 +1422,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, FGRAPH_FLAGS \ STACK_FLAGS \ BRANCH_FLAGS \ - PROFILER_FLAGS + PROFILER_FLAGS \ + FPROFILE_FLAGS /* * By defining C, we can make TRACE_FLAGS a list of bit names diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3f55b49cf64e..53adbe4bfedb 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -85,11 +85,6 @@ static struct tracer_opt trace_opts[] = { /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, -#ifdef CONFIG_FUNCTION_PROFILER - /* Include time within nested functions */ - { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, -#endif - { } /* Empty entry */ }; @@ -97,7 +92,7 @@ static struct tracer_flags tracer_flags = { /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS | - TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME, + TRACE_GRAPH_SLEEP_TIME, .opts = trace_opts }; @@ -1627,9 +1622,6 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_GRAPH_SLEEP_TIME) ftrace_graph_sleep_time_control(set); - if (bit == TRACE_GRAPH_GRAPH_TIME) - ftrace_graph_graph_time_control(set); - /* Do nothing if the current tracer is not this tracer */ if (tr->current_trace != &graph_trace) return 0; From 5abb6ccb58f0626a0b7577908bcb698b18812eed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2025 14:22:32 -0500 Subject: [PATCH 28/42] tracing: Have function graph tracer option sleep-time be per instance Currently the option that has the function graph tracer ignore the time spent while a task is sleeping is global, even though the interface is per-instance. Changing the value in one instance will affect the results of another instance that is also running the function graph tracer. This can lead to confusing results. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251114192318.950255167@kernel.org Fixes: c132be2c4fcc1 ("function_graph: Have the instances use their own ftrace_ops for filtering") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 10 +---- kernel/trace/ftrace.c | 4 +- kernel/trace/trace.h | 5 +-- kernel/trace/trace_functions_graph.c | 64 +++++++++++++++++++++++----- 4 files changed, 60 insertions(+), 23 deletions(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 484ad7a18463..7fb9b169d6d4 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -498,9 +498,6 @@ void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth) return get_data_type_data(current, offset); } -/* Both enabled by default (can be cleared by function_graph tracer flags */ -bool fgraph_sleep_time = true; - #ifdef CONFIG_DYNAMIC_FTRACE /* * archs can override this function if they must do something @@ -1023,11 +1020,6 @@ void fgraph_init_ops(struct ftrace_ops *dst_ops, #endif } -void ftrace_graph_sleep_time_control(bool enable) -{ - fgraph_sleep_time = enable; -} - /* * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h @@ -1098,7 +1090,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt, * Does the user want to count the time a function was asleep. * If so, do not update the time stamps. 
*/ - if (fgraph_sleep_time) + if (!fgraph_no_sleep_time) return; timestamp = trace_clock_local(); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ab601cd9638b..7c3bbebeec7a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -862,6 +862,8 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, return 1; } +bool fprofile_no_sleep_time; + static void profile_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs) @@ -887,7 +889,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, calltime = rettime - profile_data->calltime; - if (!fgraph_sleep_time) { + if (fprofile_no_sleep_time) { if (current->ftrace_sleeptime) calltime -= current->ftrace_sleeptime - profile_data->sleeptime; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 41b416a22450..58be6d741d72 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -943,8 +943,6 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) -extern void ftrace_graph_sleep_time_control(bool enable); - #ifdef CONFIG_FUNCTION_PROFILER extern void ftrace_graph_graph_time_control(bool enable); #else @@ -1115,7 +1113,8 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; -extern bool fgraph_sleep_time; +extern unsigned int fgraph_no_sleep_time; +extern bool fprofile_no_sleep_time; static inline bool ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 53adbe4bfedb..12315eb65925 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -19,6 +19,9 @@ /* When set, irq functions might be ignored */ static int ftrace_graph_skip_irqs; +/* Do not record function time when task is sleeping */ +unsigned int fgraph_no_sleep_time; + struct fgraph_cpu_data { pid_t last_pid; int depth; @@ -239,13 +242,14 @@ static int graph_entry(struct ftrace_graph_ent *trace, if (ftrace_graph_ignore_irqs(tr)) return 0; - if (fgraph_sleep_time) { - /* Only need to record the calltime */ - ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); - } else { + if (fgraph_no_sleep_time && + !tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) { ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes)); if (ftimes) ftimes->sleeptime = current->ftrace_sleeptime; + } else { + /* Only need to record the calltime */ + ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); } if (!ftimes) return 0; @@ -331,11 +335,15 @@ void __trace_graph_return(struct trace_array *tr, trace_buffer_unlock_commit_nostack(buffer, event); } -static void handle_nosleeptime(struct ftrace_graph_ret *trace, +static void handle_nosleeptime(struct trace_array *tr, + struct ftrace_graph_ret *trace, struct fgraph_times *ftimes, int size) { - if (fgraph_sleep_time || size < sizeof(*ftimes)) + if (size < sizeof(*ftimes)) + return; + + if (!fgraph_no_sleep_time || tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) return; ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime; @@ -364,7 +372,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace, if (!ftimes) return; - handle_nosleeptime(trace, ftimes, size); + handle_nosleeptime(tr, trace, ftimes, size); calltime = ftimes->calltime; @@ 
-377,6 +385,7 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, struct ftrace_regs *fregs) { struct fgraph_times *ftimes; + struct trace_array *tr; int size; ftrace_graph_addr_finish(gops, trace); @@ -390,7 +399,8 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, if (!ftimes) return; - handle_nosleeptime(trace, ftimes, size); + tr = gops->private; + handle_nosleeptime(tr, trace, ftimes, size); if (tracing_thresh && (trace_clock_local() - ftimes->calltime < tracing_thresh)) @@ -452,6 +462,9 @@ static int graph_trace_init(struct trace_array *tr) if (!tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_IRQS)) ftrace_graph_skip_irqs++; + if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) + fgraph_no_sleep_time++; + /* Make gops functions visible before we start tracing */ smp_mb(); @@ -494,6 +507,11 @@ static void graph_trace_reset(struct trace_array *tr) if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) ftrace_graph_skip_irqs = 0; + if (!tracer_flags_is_set(tr, TRACE_GRAPH_SLEEP_TIME)) + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + tracing_stop_cmdline_record(); unregister_ftrace_graph(tr->gops); } @@ -1619,8 +1637,24 @@ void graph_trace_close(struct trace_iterator *iter) static int func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - if (bit == TRACE_GRAPH_SLEEP_TIME) - ftrace_graph_sleep_time_control(set); +/* + * The function profiler gets updated even if function graph + * isn't the current tracer. Handle it separately. + */ +#ifdef CONFIG_FUNCTION_PROFILER + if (bit == TRACE_GRAPH_SLEEP_TIME && (tr->flags & TRACE_ARRAY_FL_GLOBAL) && + !!set == fprofile_no_sleep_time) { + if (set) { + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + fprofile_no_sleep_time = false; + } else { + fgraph_no_sleep_time++; + fprofile_no_sleep_time = true; + } + } +#endif /* Do nothing if the current tracer is not this tracer */ if (tr->current_trace != &graph_trace) @@ -1630,6 +1664,16 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(tr->current_trace_flags->val & bit)) return 0; + if (bit == TRACE_GRAPH_SLEEP_TIME) { + if (set) { + fgraph_no_sleep_time--; + if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) + fgraph_no_sleep_time = 0; + } else { + fgraph_no_sleep_time++; + } + } + if (bit == TRACE_GRAPH_PRINT_IRQS) { if (set) ftrace_graph_skip_irqs--; From bc089c47250e8923892873809471e54e05919d80 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2025 14:22:33 -0500 Subject: [PATCH 29/42] tracing: Convert function graph set_flags() to use a switch() statement Currently the set_flags() of the function graph tracer has a bunch of: if (bit == FLAG1) { [..] } if (bit == FLAG2) { [..] } To clean it up a bit, convert it over to a switch statement. 
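The resulting shape, sketched standalone (the flag names and values here are illustrative; the real conversion is in the diff below):

enum { FLAG1 = 0x1, FLAG2 = 0x2, FLAG3 = 0x4 };

static int set_flag(unsigned int bit, int set)
{
	switch (bit) {
	case FLAG1:
		/* handle FLAG1 */
		break;
	case FLAG2:
		/* handle FLAG2 */
		break;
	case FLAG3:
		/* a case may also return directly, as TRACE_GRAPH_ARGS does */
		return set ? 0 : -1;
	}
	return 0;
}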
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251114192319.117123664@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 12315eb65925..44d5dc5031e2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1664,7 +1664,8 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(tr->current_trace_flags->val & bit)) return 0; - if (bit == TRACE_GRAPH_SLEEP_TIME) { + switch (bit) { + case TRACE_GRAPH_SLEEP_TIME: if (set) { fgraph_no_sleep_time--; if (WARN_ON_ONCE(fgraph_no_sleep_time < 0)) @@ -1672,19 +1673,20 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) } else { fgraph_no_sleep_time++; } - } + break; - if (bit == TRACE_GRAPH_PRINT_IRQS) { + case TRACE_GRAPH_PRINT_IRQS: if (set) ftrace_graph_skip_irqs--; else ftrace_graph_skip_irqs++; if (WARN_ON_ONCE(ftrace_graph_skip_irqs < 0)) ftrace_graph_skip_irqs = 0; - } + break; - if (bit == TRACE_GRAPH_ARGS) + case TRACE_GRAPH_ARGS: return ftrace_graph_trace_args(tr, set); + } return 0; } From ac87b220a6e9530d752ab5718acc7776f9924702 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2025 10:47:51 -0500 Subject: [PATCH 30/42] fgraph: Make fgraph_no_sleep_time signed The variable fgraph_no_sleep_time changed from being a boolean to being a counter. A check is made to make sure that it never goes below zero. But because the variable was unsigned, that check could never trigger, even when the counter did go below zero. Make the variable a signed int so that the below-zero check actually works. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20251125104751.4c9c7f28@gandalf.local.home Fixes: 5abb6ccb58f0 ("tracing: Have function graph tracer option sleep-time be per instance") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/aR1yRQxDmlfLZzoo@stanley.mountain/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 58be6d741d72..da5d9527ebd6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1113,7 +1113,7 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; -extern unsigned int fgraph_no_sleep_time; +extern int fgraph_no_sleep_time; extern bool fprofile_no_sleep_time; static inline bool diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 44d5dc5031e2..d0513cfcd936 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -20,7 +20,7 @@ static int ftrace_graph_skip_irqs; /* Do not record function time when task is sleeping */ -unsigned int fgraph_no_sleep_time; +int fgraph_no_sleep_time; struct fgraph_cpu_data { pid_t last_pid; From 49c1364c7ca3577037e5ded23c30e3248434c561 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 17 Nov 2025 12:06:37 -0500 Subject: [PATCH 31/42] tracing: Remove unused variable in tracing_trace_options_show() The flags and opts used in tracing_trace_options_show() now come directly from the trace array "current_trace_flags" and not the current_trace. 
The variable "trace" was still being assigned to tr->current_trace but never used. This caused a warning in clang. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20251117120637.43ef995d@gandalf.local.home Reported-by: Andy Shevchenko Tested-by: Andy Shevchenko Closes: https://lore.kernel.org/all/aRtHWXzYa8ijUIDa@black.igk.intel.com/ Fixes: 428add559b692 ("tracing: Have tracer option be instance specific") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8ae95800592d..59cd4ed8af6d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5167,7 +5167,6 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) struct tracer_opt *trace_opts; struct trace_array *tr = m->private; struct tracer_flags *flags; - struct tracer *trace; u32 tracer_flags; int i; @@ -5184,8 +5183,6 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) if (!flags || !flags->opts) return 0; - trace = tr->current_trace; - tracer_flags = flags->val; trace_opts = flags->opts; From 7a6735cc9b4c0b5cd6fa00c32217db8929a8c18f Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 10 Nov 2025 20:18:08 +0800 Subject: [PATCH 32/42] ftrace: Avoid redundant initialization in register_ftrace_direct The FTRACE_OPS_FL_INITIALIZED flag is cleared in register_ftrace_direct, which can make it initialized by ftrace_ops_init() even if it is already initialized. It seems that there is no big deal here, but let's still fix it. Link: https://patch.msgid.link/20251110121808.1559240-1-dongml2@chinatelecom.cn Fixes: f64dd4627ec6 ("ftrace: Add multi direct register/unregister interface") Acked-by: Jiri Olsa Signed-off-by: Menglong Dong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7c3bbebeec7a..b4510a6dbf42 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6069,7 +6069,7 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) new_hash = NULL; ops->func = call_direct_funcs; - ops->flags = MULTI_FLAGS; + ops->flags |= MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; ops->direct_call = addr; From 23c0e9cc76bf39c10a1c1927345dfdbc54947a97 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 19 Nov 2025 17:32:05 +0900 Subject: [PATCH 33/42] tracing: Show the tracer options in boot-time created instance Since tracer_init_tracefs_work_func() only updates the tracer options for the global_trace, the instances created by the kernel cmdline do not have those options. Fix to update tracer options for those boot-time created instances to show those options. 
Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/176354112555.2356172.3989277078358802353.stgit@mhiramat.tok.corp.google.com Fixes: 428add559b69 ("tracing: Have tracer option be instance specific") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59cd4ed8af6d..032bdedca5d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10228,11 +10228,14 @@ static __init int __update_tracer_options(struct trace_array *tr) return ret; } -static __init void update_tracer_options(struct trace_array *tr) +static __init void update_tracer_options(void) { + struct trace_array *tr; + guard(mutex)(&trace_types_lock); tracer_options_updated = true; - __update_tracer_options(tr); + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __update_tracer_options(tr); } /* Must have trace_types_lock held */ @@ -10934,7 +10937,7 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work) create_trace_instances(NULL); - update_tracer_options(&global_trace); + update_tracer_options(); } static __init int tracer_init_tracefs(void) From bdafb4d4cb3bb18b29517eaae09fb49d25f854f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2025 15:08:58 -0500 Subject: [PATCH 34/42] tracing: Remove get_trigger_ops() and add count_func() from trigger ops The struct event_command has a callback function called get_trigger_ops(). This callback returns the "trigger_ops" to use for the trigger. These ops define the trigger function, how to init the trigger, how to print the trigger and how to free it. The only reason there's a callback function to get these ops is because some triggers have two types of operations: an "always on" operation, and a "count down" operation used when a user passes in a parameter to say how many times the trigger should execute. For example: echo stacktrace:5 > events/kmem/kmem_cache_alloc/trigger It will trigger the stacktrace for the first 5 times the kmem_cache_alloc event is hit. Since the only difference between the two trigger_ops is the trigger function itself (the print, init and free functions are all the same), just use a single ops structure that the event_command points to, and add a count_func function field to the trigger_ops. When a trigger is added to an event, if there's a count attached to it and the trigger ops has the count_func field, the data allocated to represent this trigger will have a new flag set called COUNT. Then when the trigger executes, it will check if the COUNT data flag is set, and if so, it will call the ops count_func(). If that returns false, it returns without executing the trigger. This removes the need for duplicate event_trigger_ops structures. 
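The dispatch this introduces can be sketched standalone (illustrative types, compilable on their own; the kernel's counterparts are event_trigger_ops, event_trigger_data and data_ops_trigger() in the diff below):

#include <stdbool.h>
#include <stdio.h>

#define FL_COUNT 0x1

struct trigger_data;

struct trigger_ops {
	void (*trigger)(struct trigger_data *data);
	bool (*count_func)(struct trigger_data *data); /* optional */
};

struct trigger_data {
	const struct trigger_ops *ops;
	unsigned int flags;
	long count;	/* -1 means "no limit" */
};

/* Shared count-down helper, in the spirit of event_trigger_count() */
static bool trigger_count(struct trigger_data *data)
{
	if (!data->count)
		return false;
	if (data->count != -1)
		data->count--;
	return true;
}

static void do_trigger(struct trigger_data *data)
{
	printf("trigger fired, %ld left\n", data->count);
}

/* Call site, in the spirit of data_ops_trigger() */
static void call_trigger(struct trigger_data *data)
{
	if ((data->flags & FL_COUNT) && !data->ops->count_func(data))
		return;
	data->ops->trigger(data);
}

int main(void)
{
	static const struct trigger_ops ops = {
		.trigger	= do_trigger,
		.count_func	= trigger_count,
	};
	/* Like "stacktrace:5": a count was supplied, so FL_COUNT is set */
	struct trigger_data data = { &ops, FL_COUNT, 5 };

	for (int i = 0; i < 8; i++)
		call_trigger(&data);	/* fires only the first 5 times */
	return 0;
}

When no count is supplied, FL_COUNT stays clear and count_func is never consulted, so unconditional triggers pay nothing for the new field.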
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251125200932.274566147@kernel.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 26 ++- kernel/trace/trace_eprobe.c | 8 +- kernel/trace/trace_events_hist.c | 60 +------ kernel/trace/trace_events_trigger.c | 257 ++++++++++------------------ 4 files changed, 116 insertions(+), 235 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index da5d9527ebd6..b9c59d9f9a0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1791,6 +1791,7 @@ extern void clear_event_triggers(struct trace_array *tr); enum { EVENT_TRIGGER_FL_PROBE = BIT(0), + EVENT_TRIGGER_FL_COUNT = BIT(1), }; struct event_trigger_data { @@ -1822,6 +1823,10 @@ struct enable_trigger_data { bool hist; }; +bool event_trigger_count(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event); + extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_data *data); @@ -1909,6 +1914,11 @@ extern void event_file_put(struct trace_event_file *file); * registered the trigger (see struct event_command) along with * the trace record, rec. * + * @count_func: If defined and a numeric parameter is passed to the + * trigger, then this function will be called before @trigger + * is called. If this function returns false, then @trigger is not + * executed. + * * @init: An optional initialization function called for the trigger * when the trigger is registered (via the event_command reg() * function). This can be used to perform per-trigger @@ -1936,6 +1946,10 @@ struct event_trigger_ops { struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); + bool (*count_func)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); int (*init)(struct event_trigger_data *data); void (*free)(struct event_trigger_data *data); int (*print)(struct seq_file *m, @@ -1962,6 +1976,9 @@ struct event_trigger_ops { * @name: The unique name that identifies the event command. This is * the name used when setting triggers via trigger files. * + * @trigger_ops: The event_trigger_ops implementation associated with + * the command. + * * @trigger_type: A unique id that identifies the event command * 'type'. This value has two purposes, the first to ensure that * only one trigger of the same type can be set at a given time @@ -2013,17 +2030,11 @@ struct event_trigger_ops { * event command, filters set by the user for the command will be * ignored. This is usually implemented by the generic utility * function @set_trigger_filter() (see trace_event_triggers.c). - * - * @get_trigger_ops: The callback function invoked to retrieve the - * event_trigger_ops implementation associated with the command. - * This callback function allows a single event_command to - * support multiple trigger implementations via different sets of - * event_trigger_ops, depending on the value of the @param - * string. 
*/ struct event_command { struct list_head list; char *name; + const struct event_trigger_ops *trigger_ops; enum event_trigger_type trigger_type; int flags; int (*parse)(struct event_command *cmd_ops, @@ -2040,7 +2051,6 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); - const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; /** diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a1d402124836..14ae896dbe75 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -513,21 +513,15 @@ static void eprobe_trigger_unreg_func(char *glob, } -static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, - char *param) -{ - return &eprobe_trigger_ops; -} - static struct event_command event_trigger_cmd = { .name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, .flags = EVENT_CMD_FL_NEEDS_REC, + .trigger_ops = &eprobe_trigger_ops, .parse = eprobe_trigger_cmd_parse, .reg = eprobe_trigger_reg_func, .unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, - .get_trigger_ops = eprobe_trigger_get_ops, .set_filter = NULL, }; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1d536219b624..f9cc8d6a215b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6363,12 +6363,6 @@ static const struct event_trigger_ops event_hist_trigger_named_ops = { .free = event_hist_trigger_named_free, }; -static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, - char *param) -{ - return &event_hist_trigger_ops; -} - static void hist_clear(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -6908,11 +6902,11 @@ static struct event_command trigger_hist_cmd = { .name = "hist", .trigger_type = ETT_EVENT_HIST, .flags = EVENT_CMD_FL_NEEDS_REC, + .trigger_ops = &event_hist_trigger_ops, .parse = event_hist_trigger_parse, .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, - .get_trigger_ops = event_hist_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -6945,29 +6939,9 @@ hist_enable_trigger(struct event_trigger_data *data, } } -static void -hist_enable_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - hist_enable_trigger(data, buffer, rec, event); -} - static const struct event_trigger_ops hist_enable_trigger_ops = { .trigger = hist_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops hist_enable_count_trigger_ops = { - .trigger = hist_enable_count_trigger, + .count_func = event_trigger_count, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, @@ -6975,36 +6949,12 @@ static const struct event_trigger_ops hist_enable_count_trigger_ops = { static const struct event_trigger_ops hist_disable_trigger_ops = { .trigger = hist_enable_trigger, + .count_func = event_trigger_count, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static const struct event_trigger_ops hist_disable_count_trigger_ops = { - .trigger = hist_enable_count_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; 
- -static const struct event_trigger_ops * -hist_enable_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - bool enable; - - enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); - - if (enable) - ops = param ? &hist_enable_count_trigger_ops : - &hist_enable_trigger_ops; - else - ops = param ? &hist_disable_count_trigger_ops : - &hist_disable_trigger_ops; - - return ops; -} - static void hist_enable_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; @@ -7023,22 +6973,22 @@ static void hist_enable_unreg_all(struct trace_event_file *file) static struct event_command trigger_hist_enable_cmd = { .name = ENABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, + .trigger_ops = &hist_enable_trigger_ops, .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, - .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, }; static struct event_command trigger_hist_disable_cmd = { .name = DISABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, + .trigger_ops = &hist_disable_trigger_ops, .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, - .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, }; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cbfc306c0159..576bad18bcdb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -28,6 +28,20 @@ void trigger_data_free(struct event_trigger_data *data) kfree(data); } +static inline void data_ops_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + const struct event_trigger_ops *ops = data->ops; + + if (data->flags & EVENT_TRIGGER_FL_COUNT) { + if (!ops->count_func(data, buffer, rec, event)) + return; + } + + ops->trigger(data, buffer, rec, event); +} + /** * event_triggers_call - Call triggers associated with a trace event * @file: The trace_event_file associated with the event @@ -70,7 +84,7 @@ event_triggers_call(struct trace_event_file *file, if (data->paused) continue; if (!rec) { - data->ops->trigger(data, buffer, rec, event); + data_ops_trigger(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -80,7 +94,7 @@ event_triggers_call(struct trace_event_file *file, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->trigger(data, buffer, rec, event); + data_ops_trigger(data, buffer, rec, event); } return tt; } @@ -122,7 +136,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->trigger(data, NULL, NULL, NULL); + data_ops_trigger(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -377,6 +391,36 @@ __init int unregister_event_command(struct event_command *cmd) return -ENODEV; } +/** + * event_trigger_count - Optional count function for event triggers + * @data: Trigger-specific data + * @buffer: The ring buffer that the event is being written to + * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer + * + * For triggers that can take a count parameter that doesn't do anything + * special, they can use this function to assign to their .count_func + * field. 
+ * + * This simply does a count down of the @data->count field. + * + * If the @data->count is greater than zero, it will decrement it. + * + * Returns false if @data->count is zero, otherwise true. + */ +bool event_trigger_count(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + if (!data->count) + return false; + + if (data->count != -1) + (data->count)--; + + return true; +} + /** * event_trigger_print - Generic event_trigger_ops @print implementation * @name: The name of the event trigger @@ -807,9 +851,13 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, * @private_data: User data to associate with the event trigger * * Allocate an event_trigger_data instance and initialize it. The - * @cmd_ops are used along with the @cmd and @param to get the - * trigger_ops to assign to the event_trigger_data. @private_data can - * also be passed in and associated with the event_trigger_data. + * @cmd_ops defines how the trigger will operate. If @param is set, + * and @cmd_ops->trigger_ops->count_func is non NULL, then the + * data->count is set to @param and before the trigger is executed, the + * @cmd_ops->trigger_ops->count_func() is called. If that function returns + * false, the @cmd_ops->trigger_ops->trigger() function will not be called. + * @private_data can also be passed in and associated with the + * event_trigger_data. * * Use trigger_data_free() to free an event_trigger_data object. * @@ -821,18 +869,17 @@ struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, void *private_data) { struct event_trigger_data *trigger_data; - const struct event_trigger_ops *trigger_ops; - - trigger_ops = cmd_ops->get_trigger_ops(cmd, param); trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); if (!trigger_data) return NULL; trigger_data->count = -1; - trigger_data->ops = trigger_ops; + trigger_data->ops = cmd_ops->trigger_ops; trigger_data->cmd_ops = cmd_ops; trigger_data->private_data = private_data; + if (param && cmd_ops->trigger_ops->count_func) + trigger_data->flags |= EVENT_TRIGGER_FL_COUNT; INIT_LIST_HEAD(&trigger_data->list); INIT_LIST_HEAD(&trigger_data->named_list); @@ -1271,31 +1318,28 @@ traceon_trigger(struct event_trigger_data *data, tracing_on(); } -static void -traceon_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +traceon_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; if (file) { if (tracer_tracing_is_on(file->tr)) - return; + return false; } else { if (tracing_is_on()) - return; + return false; } if (!data->count) - return; + return false; if (data->count != -1) (data->count)--; - if (file) - tracer_tracing_on(file->tr); - else - tracing_on(); + return true; } static void @@ -1319,31 +1363,28 @@ traceoff_trigger(struct event_trigger_data *data, tracing_off(); } -static void -traceoff_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +traceoff_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; if (file) { if (!tracer_tracing_is_on(file->tr)) - return; + return false; } else { if (!tracing_is_on()) - return; + return false; } if (!data->count) - 
return; + return false; if (data->count != -1) (data->count)--; - if (file) - tracer_tracing_off(file->tr); - else - tracing_off(); + return true; } static int @@ -1362,13 +1403,7 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) static const struct event_trigger_ops traceon_trigger_ops = { .trigger = traceon_trigger, - .print = traceon_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops traceon_count_trigger_ops = { - .trigger = traceon_count_trigger, + .count_func = traceon_count_func, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1376,41 +1411,19 @@ static const struct event_trigger_ops traceon_count_trigger_ops = { static const struct event_trigger_ops traceoff_trigger_ops = { .trigger = traceoff_trigger, + .count_func = traceoff_count_func, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static const struct event_trigger_ops traceoff_count_trigger_ops = { - .trigger = traceoff_count_trigger, - .print = traceoff_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops * -onoff_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = param ? &traceon_count_trigger_ops : - &traceon_trigger_ops; - else - ops = param ? &traceoff_count_trigger_ops : - &traceoff_trigger_ops; - - return ops; -} - static struct event_command trigger_traceon_cmd = { .name = "traceon", .trigger_type = ETT_TRACE_ONOFF, + .trigger_ops = &traceon_trigger_ops, .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = onoff_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -1418,10 +1431,10 @@ static struct event_command trigger_traceoff_cmd = { .name = "traceoff", .trigger_type = ETT_TRACE_ONOFF, .flags = EVENT_CMD_FL_POST_TRIGGER, + .trigger_ops = &traceoff_trigger_ops, .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = onoff_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -1439,20 +1452,6 @@ snapshot_trigger(struct event_trigger_data *data, tracing_snapshot(); } -static void -snapshot_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - snapshot_trigger(data, buffer, rec, event); -} - static int register_snapshot_trigger(char *glob, struct event_trigger_data *data, @@ -1486,31 +1485,19 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) static const struct event_trigger_ops snapshot_trigger_ops = { .trigger = snapshot_trigger, + .count_func = event_trigger_count, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static const struct event_trigger_ops snapshot_count_trigger_ops = { - .trigger = snapshot_count_trigger, - .print = snapshot_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops * -snapshot_get_trigger_ops(char *cmd, char *param) -{ - return param ? 
&snapshot_count_trigger_ops : &snapshot_trigger_ops; -} - static struct event_command trigger_snapshot_cmd = { .name = "snapshot", .trigger_type = ETT_SNAPSHOT, + .trigger_ops = &snapshot_trigger_ops, .parse = event_trigger_parse, .reg = register_snapshot_trigger, .unreg = unregister_snapshot_trigger, - .get_trigger_ops = snapshot_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -1558,20 +1545,6 @@ stacktrace_trigger(struct event_trigger_data *data, trace_dump_stack(STACK_SKIP); } -static void -stacktrace_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) -{ - if (!data->count) - return; - - if (data->count != -1) - (data->count)--; - - stacktrace_trigger(data, buffer, rec, event); -} - static int stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) { @@ -1581,32 +1554,20 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) static const struct event_trigger_ops stacktrace_trigger_ops = { .trigger = stacktrace_trigger, + .count_func = event_trigger_count, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static const struct event_trigger_ops stacktrace_count_trigger_ops = { - .trigger = stacktrace_count_trigger, - .print = stacktrace_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - -static const struct event_trigger_ops * -stacktrace_get_trigger_ops(char *cmd, char *param) -{ - return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; -} - static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, + .trigger_ops = &stacktrace_trigger_ops, .flags = EVENT_CMD_FL_POST_TRIGGER, .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, - .get_trigger_ops = stacktrace_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -1642,24 +1603,24 @@ event_enable_trigger(struct event_trigger_data *data, set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); } -static void -event_enable_count_trigger(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *event) +static bool +event_enable_count_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; if (!data->count) - return; + return false; /* Skip if the event is in a state we want to switch to */ if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) - return; + return false; if (data->count != -1) (data->count)--; - event_enable_trigger(data, buffer, rec, event); + return true; } int event_enable_trigger_print(struct seq_file *m, @@ -1706,13 +1667,7 @@ void event_enable_trigger_free(struct event_trigger_data *data) static const struct event_trigger_ops event_enable_trigger_ops = { .trigger = event_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_enable_count_trigger_ops = { - .trigger = event_enable_count_trigger, + .count_func = event_enable_count_func, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, @@ -1720,13 +1675,7 @@ static const struct event_trigger_ops event_enable_count_trigger_ops = { static const struct event_trigger_ops event_disable_trigger_ops = { 
.trigger = event_enable_trigger, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_disable_count_trigger_ops = { - .trigger = event_enable_count_trigger, + .count_func = event_enable_count_func, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, @@ -1906,45 +1855,23 @@ void event_enable_unregister_trigger(char *glob, data->ops->free(data); } -static const struct event_trigger_ops * -event_enable_get_trigger_ops(char *cmd, char *param) -{ - const struct event_trigger_ops *ops; - bool enable; - -#ifdef CONFIG_HIST_TRIGGERS - enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || - (strcmp(cmd, ENABLE_HIST_STR) == 0)); -#else - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; -#endif - if (enable) - ops = param ? &event_enable_count_trigger_ops : - &event_enable_trigger_ops; - else - ops = param ? &event_disable_count_trigger_ops : - &event_disable_trigger_ops; - - return ops; -} - static struct event_command trigger_enable_cmd = { .name = ENABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, + .trigger_ops = &event_enable_trigger_ops, .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, - .get_trigger_ops = event_enable_get_trigger_ops, .set_filter = set_trigger_filter, }; static struct event_command trigger_disable_cmd = { .name = DISABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, + .trigger_ops = &event_disable_trigger_ops, .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, - .get_trigger_ops = event_enable_get_trigger_ops, .set_filter = set_trigger_filter, }; From b052d70f7c9c156409a70e65c10d83b5650e7e78 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2025 15:08:59 -0500 Subject: [PATCH 35/42] tracing: Merge struct event_trigger_ops into struct event_command Now that there's pretty much a one to one mapping between the struct event_trigger_ops and struct event_command, there's no reason to have two different structures. Merge the function pointers of event_trigger_ops into event_command. There's one exception in trace_events_hist.c for the event_hist_trigger_named_ops. This has special logic for the init and free function pointers for "named histograms". In this case, allocate the cmd_ops of the event_trigger_data and set it to the proper init and free functions, which are used to initialize and free the event_trigger_data respectively. Have the free function and the init function (on failure) free the cmd_ops of the data element. 
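In other words, a named histogram's trigger gets a private copy of the
shared event_command with only the lifetime callbacks swapped out. A
condensed sketch of that pattern (attach_named_cmd_ops() is a
hypothetical helper name; in the actual patch this logic is inlined in
hist_register_trigger()):

	static int attach_named_cmd_ops(struct event_trigger_data *data)
	{
		struct event_command *cmd_ops;

		/* Copy the shared command ops so only this instance changes */
		cmd_ops = kmalloc(sizeof(*cmd_ops), GFP_KERNEL);
		if (!cmd_ops)
			return -ENOMEM;

		*cmd_ops = *data->cmd_ops;

		/* Only init and free differ for named histograms */
		cmd_ops->init = event_hist_trigger_named_init;
		cmd_ops->free = event_hist_trigger_named_free;

		/* The named free path (and init on failure) kfree()s this copy */
		data->cmd_ops = cmd_ops;
		return 0;
	}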
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://patch.msgid.link/20251125200932.446322765@kernel.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 182 ++++++++++++---------------- kernel/trace/trace_eprobe.c | 13 +- kernel/trace/trace_events_hist.c | 93 +++++++------- kernel/trace/trace_events_trigger.c | 135 +++++++++------------ 4 files changed, 186 insertions(+), 237 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b9c59d9f9a0c..901aad30099b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1798,7 +1798,6 @@ struct event_trigger_data { unsigned long count; int ref; int flags; - const struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; @@ -1890,20 +1889,76 @@ extern void event_file_get(struct trace_event_file *file); extern void event_file_put(struct trace_event_file *file); /** - * struct event_trigger_ops - callbacks for trace event triggers + * struct event_command - callbacks and data members for event commands * - * The methods in this structure provide per-event trigger hooks for - * various trigger operations. + * Event commands are invoked by users by writing the command name + * into the 'trigger' file associated with a trace event. The + * parameters associated with a specific invocation of an event + * command are used to create an event trigger instance, which is + * added to the list of trigger instances associated with that trace + * event. When the event is hit, the set of triggers associated with + * that event is invoked. * - * The @init and @free methods are used during trigger setup and - * teardown, typically called from an event_command's @parse() - * function implementation. + * The data members in this structure provide per-event command data + * for various event commands. * - * The @print method is used to print the trigger spec. + * All the data members below, except for @post_trigger, must be set + * for each event command. * - * The @trigger method is the function that actually implements the - * trigger and is called in the context of the triggering event - * whenever that event occurs. + * @name: The unique name that identifies the event command. This is + * the name used when setting triggers via trigger files. + * + * @trigger_type: A unique id that identifies the event command + * 'type'. This value has two purposes, the first to ensure that + * only one trigger of the same type can be set at a given time + * for a particular event e.g. it doesn't make sense to have both + * a traceon and traceoff trigger attached to a single event at + * the same time, so traceon and traceoff have the same type + * though they have different names. The @trigger_type value is + * also used as a bit value for deferring the actual trigger + * action until after the current event is finished. Some + * commands need to do this if they themselves log to the trace + * buffer (see the @post_trigger() member below). @trigger_type + * values are defined by adding new values to the trigger_type + * enum in include/linux/trace_events.h. + * + * @flags: See the enum event_command_flags below. + * + * All the methods below, except for @set_filter() and @unreg_all(), + * must be implemented. + * + * @parse: The callback function responsible for parsing and + * registering the trigger written to the 'trigger' file by the + * user. 
It allocates the trigger instance and registers it with + * the appropriate trace event. It makes use of the other + * event_command callback functions to orchestrate this, and is + * usually implemented by the generic utility function + * @event_trigger_callback() (see trace_event_triggers.c). + * + * @reg: Adds the trigger to the list of triggers associated with the + * event, and enables the event trigger itself, after + * initializing it (via the event_command @init() function). + * This is also where commands can use the @trigger_type value to + * make the decision as to whether or not multiple instances of + * the trigger should be allowed. This is usually implemented by + * the generic utility function @register_trigger() (see + * trace_event_triggers.c). + * + * @unreg: Removes the trigger from the list of triggers associated + * with the event, and disables the event trigger itself, after + * initializing it (via the event_command @free() function). + * This is usually implemented by the generic utility function + * @unregister_trigger() (see trace_event_triggers.c). + * + * @unreg_all: An optional function called to remove all the triggers + * from the list of triggers associated with the event. Called + * when a trigger file is opened in truncate mode. + * + * @set_filter: An optional function called to parse and set a filter + * for the trigger. If no @set_filter() method is set for the + * event command, filters set by the user for the command will be + * ignored. This is usually implemented by the generic utility + * function @set_trigger_filter() (see trace_event_triggers.c). * * All the methods below, except for @init() and @free(), must be * implemented. @@ -1941,100 +1996,9 @@ extern void event_file_put(struct trace_event_file *file); * that calls the generic utility function @event_trigger_print() * (see trace_event_triggers.c). */ -struct event_trigger_ops { - void (*trigger)(struct event_trigger_data *data, - struct trace_buffer *buffer, - void *rec, - struct ring_buffer_event *rbe); - bool (*count_func)(struct event_trigger_data *data, - struct trace_buffer *buffer, - void *rec, - struct ring_buffer_event *rbe); - int (*init)(struct event_trigger_data *data); - void (*free)(struct event_trigger_data *data); - int (*print)(struct seq_file *m, - struct event_trigger_data *data); -}; - -/** - * struct event_command - callbacks and data members for event commands - * - * Event commands are invoked by users by writing the command name - * into the 'trigger' file associated with a trace event. The - * parameters associated with a specific invocation of an event - * command are used to create an event trigger instance, which is - * added to the list of trigger instances associated with that trace - * event. When the event is hit, the set of triggers associated with - * that event is invoked. - * - * The data members in this structure provide per-event command data - * for various event commands. - * - * All the data members below, except for @post_trigger, must be set - * for each event command. - * - * @name: The unique name that identifies the event command. This is - * the name used when setting triggers via trigger files. - * - * @trigger_ops: The event_trigger_ops implementation associated with - * the command. - * - * @trigger_type: A unique id that identifies the event command - * 'type'. This value has two purposes, the first to ensure that - * only one trigger of the same type can be set at a given time - * for a particular event e.g. 
it doesn't make sense to have both - * a traceon and traceoff trigger attached to a single event at - * the same time, so traceon and traceoff have the same type - * though they have different names. The @trigger_type value is - * also used as a bit value for deferring the actual trigger - * action until after the current event is finished. Some - * commands need to do this if they themselves log to the trace - * buffer (see the @post_trigger() member below). @trigger_type - * values are defined by adding new values to the trigger_type - * enum in include/linux/trace_events.h. - * - * @flags: See the enum event_command_flags below. - * - * All the methods below, except for @set_filter() and @unreg_all(), - * must be implemented. - * - * @parse: The callback function responsible for parsing and - * registering the trigger written to the 'trigger' file by the - * user. It allocates the trigger instance and registers it with - * the appropriate trace event. It makes use of the other - * event_command callback functions to orchestrate this, and is - * usually implemented by the generic utility function - * @event_trigger_callback() (see trace_event_triggers.c). - * - * @reg: Adds the trigger to the list of triggers associated with the - * event, and enables the event trigger itself, after - * initializing it (via the event_trigger_ops @init() function). - * This is also where commands can use the @trigger_type value to - * make the decision as to whether or not multiple instances of - * the trigger should be allowed. This is usually implemented by - * the generic utility function @register_trigger() (see - * trace_event_triggers.c). - * - * @unreg: Removes the trigger from the list of triggers associated - * with the event, and disables the event trigger itself, after - * initializing it (via the event_trigger_ops @free() function). - * This is usually implemented by the generic utility function - * @unregister_trigger() (see trace_event_triggers.c). - * - * @unreg_all: An optional function called to remove all the triggers - * from the list of triggers associated with the event. Called - * when a trigger file is opened in truncate mode. - * - * @set_filter: An optional function called to parse and set a filter - * for the trigger. If no @set_filter() method is set for the - * event command, filters set by the user for the command will be - * ignored. This is usually implemented by the generic utility - * function @set_trigger_filter() (see trace_event_triggers.c). - */ struct event_command { struct list_head list; char *name; - const struct event_trigger_ops *trigger_ops; enum event_trigger_type trigger_type; int flags; int (*parse)(struct event_command *cmd_ops, @@ -2051,6 +2015,18 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + bool (*count_func)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_data *data); }; /** @@ -2071,7 +2047,7 @@ struct event_command { * either committed or discarded. At that point, if any commands * have deferred their triggers, those commands are finally * invoked following the close of the current event. 
In other - * words, if the event_trigger_ops @func() probe implementation + * words, if the event_command @func() probe implementation * itself logs to the trace buffer, this flag should be set, * otherwise it can be left unspecified. * diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 14ae896dbe75..f3e0442c3b96 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -484,13 +484,6 @@ static void eprobe_trigger_func(struct event_trigger_data *data, __eprobe_trace_func(edata, rec); } -static const struct event_trigger_ops eprobe_trigger_ops = { - .trigger = eprobe_trigger_func, - .print = eprobe_trigger_print, - .init = eprobe_trigger_init, - .free = eprobe_trigger_free, -}; - static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, @@ -517,12 +510,15 @@ static struct event_command event_trigger_cmd = { .name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, .flags = EVENT_CMD_FL_NEEDS_REC, - .trigger_ops = &eprobe_trigger_ops, .parse = eprobe_trigger_cmd_parse, .reg = eprobe_trigger_reg_func, .unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, .set_filter = NULL, + .trigger = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, }; static struct event_trigger_data * @@ -542,7 +538,6 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) trigger->flags = EVENT_TRIGGER_FL_PROBE; trigger->count = -1; - trigger->ops = &eprobe_trigger_ops; /* * EVENT PROBE triggers are not registered as commands with diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f9cc8d6a215b..f0dafc1f2787 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5694,7 +5694,7 @@ static void hist_trigger_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data); + data->cmd_ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -6016,7 +6016,7 @@ static void hist_trigger_debug_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data); + data->cmd_ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -6326,20 +6326,21 @@ static void event_hist_trigger_free(struct event_trigger_data *data) free_hist_pad(); } -static const struct event_trigger_ops event_hist_trigger_ops = { - .trigger = event_hist_trigger, - .print = event_hist_trigger_print, - .init = event_hist_trigger_init, - .free = event_hist_trigger_free, -}; - static int event_hist_trigger_named_init(struct event_trigger_data *data) { + int ret; + data->ref++; save_named_trigger(data->named_data->name, data); - return event_hist_trigger_init(data->named_data); + ret = event_hist_trigger_init(data->named_data); + if (ret < 0) { + kfree(data->cmd_ops); + data->cmd_ops = &trigger_hist_cmd; + } + + return ret; } static void event_hist_trigger_named_free(struct event_trigger_data *data) @@ -6351,18 +6352,14 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data) data->ref--; if (!data->ref) { + struct event_command *cmd_ops = data->cmd_ops; + del_named_trigger(data); trigger_data_free(data); + kfree(cmd_ops); } } -static const struct event_trigger_ops event_hist_trigger_named_ops = { - .trigger = event_hist_trigger, - .print = event_hist_trigger_print, - .init = event_hist_trigger_named_init, - .free = 
event_hist_trigger_named_free, -}; - static void hist_clear(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -6556,13 +6553,24 @@ static int hist_register_trigger(char *glob, data->paused = true; if (named_data) { + struct event_command *cmd_ops; + data->private_data = named_data->private_data; set_named_trigger_data(data, named_data); - data->ops = &event_hist_trigger_named_ops; + /* Copy the command ops and update some of the functions */ + cmd_ops = kmalloc(sizeof(*cmd_ops), GFP_KERNEL); + if (!cmd_ops) { + ret = -ENOMEM; + goto out; + } + *cmd_ops = *data->cmd_ops; + cmd_ops->init = event_hist_trigger_named_init; + cmd_ops->free = event_hist_trigger_named_free; + data->cmd_ops = cmd_ops; } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) goto out; } @@ -6676,8 +6684,8 @@ static void hist_unregister_trigger(char *glob, } } - if (test && test->ops->free) - test->ops->free(test); + if (test && test->cmd_ops->free) + test->cmd_ops->free(test); if (hist_data->enable_timestamps) { if (!hist_data->remove || test) @@ -6729,8 +6737,8 @@ static void hist_unreg_all(struct trace_event_file *file) update_cond_flag(file); if (hist_data->enable_timestamps) tracing_set_filter_buffering(file->tr, false); - if (test->ops->free) - test->ops->free(test); + if (test->cmd_ops->free) + test->cmd_ops->free(test); } } } @@ -6902,12 +6910,15 @@ static struct event_command trigger_hist_cmd = { .name = "hist", .trigger_type = ETT_EVENT_HIST, .flags = EVENT_CMD_FL_NEEDS_REC, - .trigger_ops = &event_hist_trigger_ops, .parse = event_hist_trigger_parse, .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, .set_filter = set_trigger_filter, + .trigger = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_init, + .free = event_hist_trigger_free, }; __init int register_trigger_hist_cmd(void) @@ -6939,22 +6950,6 @@ hist_enable_trigger(struct event_trigger_data *data, } } -static const struct event_trigger_ops hist_enable_trigger_ops = { - .trigger = hist_enable_trigger, - .count_func = event_trigger_count, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops hist_disable_trigger_ops = { - .trigger = hist_enable_trigger, - .count_func = event_trigger_count, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - static void hist_enable_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; @@ -6964,8 +6959,8 @@ static void hist_enable_unreg_all(struct trace_event_file *file) list_del_rcu(&test->list); update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); - if (test->ops->free) - test->ops->free(test); + if (test->cmd_ops->free) + test->cmd_ops->free(test); } } } @@ -6973,23 +6968,31 @@ static void hist_enable_unreg_all(struct trace_event_file *file) static struct event_command trigger_hist_enable_cmd = { .name = ENABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .trigger_ops = &hist_enable_trigger_ops, .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, .set_filter = set_trigger_filter, + .trigger = hist_enable_trigger, + .count_func = event_trigger_count, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free 
= event_enable_trigger_free, }; static struct event_command trigger_hist_disable_cmd = { .name = DISABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .trigger_ops = &hist_disable_trigger_ops, .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, .set_filter = set_trigger_filter, + .trigger = hist_enable_trigger, + .count_func = event_trigger_count, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static __init void unregister_trigger_hist_enable_disable_cmds(void) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 576bad18bcdb..7795af600466 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -32,14 +32,14 @@ static inline void data_ops_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - const struct event_trigger_ops *ops = data->ops; + const struct event_command *cmd_ops = data->cmd_ops; if (data->flags & EVENT_TRIGGER_FL_COUNT) { - if (!ops->count_func(data, buffer, rec, event)) + if (!cmd_ops->count_func(data, buffer, rec, event)) return; } - ops->trigger(data, buffer, rec, event); + cmd_ops->trigger(data, buffer, rec, event); } /** @@ -205,7 +205,7 @@ static int trigger_show(struct seq_file *m, void *v) } data = list_entry(v, struct event_trigger_data, list); - data->ops->print(m, data); + data->cmd_ops->print(m, data); return 0; } @@ -422,7 +422,7 @@ bool event_trigger_count(struct event_trigger_data *data, } /** - * event_trigger_print - Generic event_trigger_ops @print implementation + * event_trigger_print - Generic event_command @print implementation * @name: The name of the event trigger * @m: The seq_file being printed to * @data: Trigger-specific data @@ -457,7 +457,7 @@ event_trigger_print(const char *name, struct seq_file *m, } /** - * event_trigger_init - Generic event_trigger_ops @init implementation + * event_trigger_init - Generic event_command @init implementation * @data: Trigger-specific data * * Common implementation of event trigger initialization. @@ -474,7 +474,7 @@ int event_trigger_init(struct event_trigger_data *data) } /** - * event_trigger_free - Generic event_trigger_ops @free implementation + * event_trigger_free - Generic event_command @free implementation * @data: Trigger-specific data * * Common implementation of event trigger de-initialization. 
@@ -536,8 +536,8 @@ clear_event_triggers(struct trace_array *tr) list_for_each_entry_safe(data, n, &file->triggers, list) { trace_event_trigger_enable_disable(file, 0); list_del_rcu(&data->list); - if (data->ops->free) - data->ops->free(data); + if (data->cmd_ops->free) + data->cmd_ops->free(data); } } } @@ -600,8 +600,8 @@ static int register_trigger(char *glob, return -EEXIST; } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) return ret; } @@ -639,8 +639,8 @@ static bool try_unregister_trigger(char *glob, } if (data) { - if (data->ops->free) - data->ops->free(data); + if (data->cmd_ops->free) + data->cmd_ops->free(data); return true; } @@ -875,10 +875,9 @@ struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, return NULL; trigger_data->count = -1; - trigger_data->ops = cmd_ops->trigger_ops; trigger_data->cmd_ops = cmd_ops; trigger_data->private_data = private_data; - if (param && cmd_ops->trigger_ops->count_func) + if (param && cmd_ops->count_func) trigger_data->flags |= EVENT_TRIGGER_FL_COUNT; INIT_LIST_HEAD(&trigger_data->list); @@ -1401,7 +1400,13 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops traceon_trigger_ops = { +static struct event_command trigger_traceon_cmd = { + .name = "traceon", + .trigger_type = ETT_TRACE_ONOFF, + .parse = event_trigger_parse, + .reg = register_trigger, + .unreg = unregister_trigger, + .set_filter = set_trigger_filter, .trigger = traceon_trigger, .count_func = traceon_count_func, .print = traceon_trigger_print, @@ -1409,7 +1414,14 @@ static const struct event_trigger_ops traceon_trigger_ops = { .free = event_trigger_free, }; -static const struct event_trigger_ops traceoff_trigger_ops = { +static struct event_command trigger_traceoff_cmd = { + .name = "traceoff", + .trigger_type = ETT_TRACE_ONOFF, + .flags = EVENT_CMD_FL_POST_TRIGGER, + .parse = event_trigger_parse, + .reg = register_trigger, + .unreg = unregister_trigger, + .set_filter = set_trigger_filter, .trigger = traceoff_trigger, .count_func = traceoff_count_func, .print = traceoff_trigger_print, @@ -1417,27 +1429,6 @@ static const struct event_trigger_ops traceoff_trigger_ops = { .free = event_trigger_free, }; -static struct event_command trigger_traceon_cmd = { - .name = "traceon", - .trigger_type = ETT_TRACE_ONOFF, - .trigger_ops = &traceon_trigger_ops, - .parse = event_trigger_parse, - .reg = register_trigger, - .unreg = unregister_trigger, - .set_filter = set_trigger_filter, -}; - -static struct event_command trigger_traceoff_cmd = { - .name = "traceoff", - .trigger_type = ETT_TRACE_ONOFF, - .flags = EVENT_CMD_FL_POST_TRIGGER, - .trigger_ops = &traceoff_trigger_ops, - .parse = event_trigger_parse, - .reg = register_trigger, - .unreg = unregister_trigger, - .set_filter = set_trigger_filter, -}; - #ifdef CONFIG_TRACER_SNAPSHOT static void snapshot_trigger(struct event_trigger_data *data, @@ -1483,7 +1474,13 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops snapshot_trigger_ops = { +static struct event_command trigger_snapshot_cmd = { + .name = "snapshot", + .trigger_type = ETT_SNAPSHOT, + .parse = event_trigger_parse, + .reg = register_snapshot_trigger, + .unreg = unregister_snapshot_trigger, + .set_filter = set_trigger_filter, .trigger = snapshot_trigger, .count_func = event_trigger_count, .print = 
snapshot_trigger_print, @@ -1491,16 +1488,6 @@ static const struct event_trigger_ops snapshot_trigger_ops = { .free = event_trigger_free, }; -static struct event_command trigger_snapshot_cmd = { - .name = "snapshot", - .trigger_type = ETT_SNAPSHOT, - .trigger_ops = &snapshot_trigger_ops, - .parse = event_trigger_parse, - .reg = register_snapshot_trigger, - .unreg = unregister_snapshot_trigger, - .set_filter = set_trigger_filter, -}; - static __init int register_trigger_snapshot_cmd(void) { int ret; @@ -1552,23 +1539,19 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static const struct event_trigger_ops stacktrace_trigger_ops = { - .trigger = stacktrace_trigger, - .count_func = event_trigger_count, - .print = stacktrace_trigger_print, - .init = event_trigger_init, - .free = event_trigger_free, -}; - static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, - .trigger_ops = &stacktrace_trigger_ops, .flags = EVENT_CMD_FL_POST_TRIGGER, .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .set_filter = set_trigger_filter, + .trigger = stacktrace_trigger, + .count_func = event_trigger_count, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, }; static __init int register_trigger_stacktrace_cmd(void) @@ -1665,22 +1648,6 @@ void event_enable_trigger_free(struct event_trigger_data *data) } } -static const struct event_trigger_ops event_enable_trigger_ops = { - .trigger = event_enable_trigger, - .count_func = event_enable_count_func, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - -static const struct event_trigger_ops event_disable_trigger_ops = { - .trigger = event_enable_trigger, - .count_func = event_enable_count_func, - .print = event_enable_trigger_print, - .init = event_trigger_init, - .free = event_enable_trigger_free, -}; - int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param_and_filter) @@ -1810,8 +1777,8 @@ int event_enable_register_trigger(char *glob, } } - if (data->ops->init) { - ret = data->ops->init(data); + if (data->cmd_ops->init) { + ret = data->cmd_ops->init(data); if (ret < 0) return ret; } @@ -1851,28 +1818,36 @@ void event_enable_unregister_trigger(char *glob, } } - if (data && data->ops->free) - data->ops->free(data); + if (data && data->cmd_ops->free) + data->cmd_ops->free(data); } static struct event_command trigger_enable_cmd = { .name = ENABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .trigger_ops = &event_enable_trigger_ops, .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .set_filter = set_trigger_filter, + .trigger = event_enable_trigger, + .count_func = event_enable_count_func, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static struct event_command trigger_disable_cmd = { .name = DISABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .trigger_ops = &event_disable_trigger_ops, .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .set_filter = set_trigger_filter, + .trigger = event_enable_trigger, + .count_func = event_enable_count_func, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, }; static 
__init void unregister_trigger_enable_disable_cmds(void)

From 78c7051394945bb2a26993f289e935c922070872 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 25 Nov 2025 16:40:05 -0500
Subject: [PATCH 36/42] tracing: Remove unneeded event_mutex lock in
 event_trigger_regex_release()

In event_trigger_regex_release(), the only code is:

	mutex_lock(&event_mutex);

	if (file->f_mode & FMODE_READ)
		seq_release(inode, file);

	mutex_unlock(&event_mutex);

	return 0;

There's nothing special about the file->f_mode or the seq_release() that
requires any locking. Remove the unnecessary locks.

Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Tom Zanussi
Link: https://patch.msgid.link/20251125214031.975879283@kernel.org
Acked-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_events_trigger.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 7795af600466..e5dcfcbb2cd5 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -314,13 +314,9 @@ static ssize_t event_trigger_regex_write(struct file *file,
 
 static int event_trigger_regex_release(struct inode *inode, struct file *file)
 {
-	mutex_lock(&event_mutex);
-
 	if (file->f_mode & FMODE_READ)
 		seq_release(inode, file);
 
-	mutex_unlock(&event_mutex);
-
 	return 0;
 }
 

From 61d445af0a7c70018111919e47beaaee15653f2f Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 25 Nov 2025 16:40:06 -0500
Subject: [PATCH 37/42] tracing: Add bulk garbage collection for freeing
 event_trigger_data

The event trigger data requires a full tracepoint_synchronize_unregister()
call before freeing. That call can take hundreds of milliseconds to
complete. In order to allow for bulk freeing of the trigger data, it
cannot call tracepoint_synchronize_unregister() for every individual
trigger data element being freed.

Add a kthread that is created the first time a trigger data element is
freed, and have it use a lockless llist to collect the data to free, run
tracepoint_synchronize_unregister(), and then free everything in the
list.

By freeing hundreds of event_trigger_data elements together, it only
requires two runs of the synchronization function, and not hundreds of
runs. This speeds up the operation by orders of magnitude (milliseconds
instead of several seconds).
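The mechanism is a standard lockless-list reaper: producers push each
element onto an llist and wake the kthread, which detaches the entire
pending list with a single llist_del_all(), pays the synchronization
cost once for the whole batch, and then frees every node. A minimal
sketch of that pattern, condensed from the implementation in the diff
below (queue_free() is a hypothetical name for what trigger_data_free()
does; the real code also creates the kthread lazily and falls back to a
synchronous free if that fails):

	static LLIST_HEAD(free_list);
	static struct task_struct *reaper;

	static int reaper_fn(void *ignore)
	{
		struct event_trigger_data *data, *tmp;
		struct llist_node *batch;

		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (llist_empty(&free_list))
				schedule();
			__set_current_state(TASK_RUNNING);

			/* Detach everything queued so far in one atomic op */
			batch = llist_del_all(&free_list);

			/* One synchronization covers the entire batch */
			tracepoint_synchronize_unregister();

			llist_for_each_entry_safe(data, tmp, batch, llist)
				kfree(data);
		}
		return 0;
	}

	/* Producer side: queue the element and kick the reaper */
	static void queue_free(struct event_trigger_data *data)
	{
		llist_add(&data->llist, &free_list);
		wake_up_process(reaper);
	}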
Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tom Zanussi Link: https://patch.msgid.link/20251125214032.151674992@kernel.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events_trigger.c | 55 +++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 901aad30099b..a3aa225ed50a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "pid_list.h" @@ -1808,6 +1809,7 @@ struct event_trigger_data { char *name; struct list_head named_list; struct event_trigger_data *named_data; + struct llist_node llist; }; /* Avoid typos */ diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e5dcfcbb2cd5..3b97c242b795 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -17,15 +18,63 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); +static struct task_struct *trigger_kthread; +static struct llist_head trigger_data_free_list; +static DEFINE_MUTEX(trigger_data_kthread_mutex); + +/* Bulk garbage collection of event_trigger_data elements */ +static int trigger_kthread_fn(void *ignore) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + /* Once this task starts, it lives forever */ + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (llist_empty(&trigger_data_free_list)) + schedule(); + + __set_current_state(TASK_RUNNING); + + llnodes = llist_del_all(&trigger_data_free_list); + + /* make sure current triggers exit before free */ + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); + } + + return 0; +} + void trigger_data_free(struct event_trigger_data *data) { if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); - /* make sure current triggers exit before free */ - tracepoint_synchronize_unregister(); + if (unlikely(!trigger_kthread)) { + guard(mutex)(&trigger_data_kthread_mutex); + /* Check again after taking mutex */ + if (!trigger_kthread) { + struct task_struct *kthread; - kfree(data); + kthread = kthread_create(trigger_kthread_fn, NULL, + "trigger_data_free"); + if (!IS_ERR(kthread)) + WRITE_ONCE(trigger_kthread, kthread); + } + } + + if (!trigger_kthread) { + /* Do it the slow way */ + tracepoint_synchronize_unregister(); + kfree(data); + return; + } + + llist_add(&data->llist, &trigger_data_free_list); + wake_up_process(trigger_kthread); } static inline void data_ops_trigger(struct event_trigger_data *data, From 400ddf1dbe70429b5fc6cef74d987829e6c25893 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2025 16:40:07 -0500 Subject: [PATCH 38/42] tracing: Use strim() in trigger_process_regex() instead of skip_spaces() The function trigger_process_regex() is called by a few functions, where only one calls strim() on the buffer passed to it. That leaves the other functions not trimming the end of the buffer passed in and making it a little inconsistent. Remove the strim() from event_trigger_regex_write() and have trigger_process_regex() use strim() instead of skip_spaces(). The buff variable is not passed in as const, so it can be modified. 
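For reference, the behavioral difference: skip_spaces() only advances
past leading whitespace, while strim() additionally truncates trailing
whitespace in place and returns the skipped start. An illustrative
sketch using the kernel string helpers (the buffer contents are made
up):

	char buf[] = "  traceon:5  \n";
	char *p;

	p = skip_spaces(buf);	/* p -> "traceon:5  \n", trailing tail kept */
	p = strim(buf);		/* p -> "traceon:5", tail '\0'-terminated   */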
Cc: Mark Rutland
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Tom Zanussi
Link: https://patch.msgid.link/20251125214032.323747707@kernel.org
Acked-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_events_trigger.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 3b97c242b795..96aad82b1628 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -308,7 +308,8 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
 	char *command, *next;
 	struct event_command *p;
 
-	next = buff = skip_spaces(buff);
+	next = buff = strim(buff);
+
 	command = strsep(&next, ": \t");
 	if (next) {
 		next = skip_spaces(next);
@@ -345,8 +346,6 @@ static ssize_t event_trigger_regex_write(struct file *file,
 	if (IS_ERR(buf))
 		return PTR_ERR(buf);
 
-	strim(buf);
-
 	guard(mutex)(&event_mutex);
 
 	event_file = event_file_file(file);

From f93a7d0caccd6ab76dacfd620013cfc41f49fb8d Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 20 Nov 2025 18:15:14 -0500
Subject: [PATCH 39/42] ftrace: Allow tracing of some of the tracing code

There are times when tracing the tracing infrastructure can be useful
for debugging the tracing code. Currently, all files in the tracing
directory are built with "notrace", so the function tracer ignores all
of their functions.

Add a new config option FUNCTION_SELF_TRACING that will allow some of
the files in the tracing infrastructure to be traced. It requires a
config option to enable because it will add noise to the function
tracer if events and other tracing features are enabled. Tracing
functions and events together is quite common, so not tracing the event
code should be the default.

Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Cc: Andrew Morton
Cc: Mark Rutland
Cc: Tom Zanussi
Link: https://patch.msgid.link/20251120181514.736f2d5f@gandalf.local.home
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/Kconfig  | 14 ++++++++++++++
 kernel/trace/Makefile | 17 +++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 99283b2dcfd6..e1214b9dc990 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -330,6 +330,20 @@ config DYNAMIC_FTRACE_WITH_ARGS
 	depends on DYNAMIC_FTRACE
 	depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
 
+config FUNCTION_SELF_TRACING
+	bool "Function trace tracing code"
+	depends on FUNCTION_TRACER
+	help
+	  Normally all the tracing code is set to notrace, where the function
+	  tracer will ignore all the tracing functions. Sometimes it is useful
+	  for debugging to trace some of the tracing infrastructure itself.
+	  Enable this to allow some of the tracing infrastructure to be traced
+	  by the function tracer. Note, this will likely add noise to function
+	  tracing if events and other tracing features are enabled along with
+	  function tracing.
+
+	  If unsure, say N.
+
 config FPROBE
 	bool "Kernel Function Probe (fprobe)"
 	depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index dcb4e02afc5f..fc5dcc888e13 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -16,6 +16,23 @@ obj-y += trace_selftest_dynamic.o
 endif
 endif
 
+# Allow some files to be function traced
+ifdef CONFIG_FUNCTION_SELF_TRACING
+CFLAGS_trace_output.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_seq.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_stat.o = $(CC_FLAGS_FTRACE)
+CFLAGS_tracing_map.o = $(CC_FLAGS_FTRACE)
+CFLAGS_synth_event_gen_test.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_syscalls.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_filter.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_trigger.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_synth.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_hist.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_events_user.o = $(CC_FLAGS_FTRACE)
+CFLAGS_trace_dynevent.o = $(CC_FLAGS_FTRACE)
+endif
+
 ifdef CONFIG_FTRACE_STARTUP_TEST
 CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE)
 obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o

From 20e7168326f5ccab0fc5c322af31ae6200012137 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Sat, 22 Nov 2025 09:31:40 +0900
Subject: [PATCH 40/42] tracing: Add boot-time backup of persistent ring
 buffer

Currently, the persistent ring buffer instance needs to be read before
using it. This means we have to wait for user space to boot up and dump
the persistent ring buffer. However, in that case we cannot start
tracing on it from the kernel cmdline.

To solve this limitation, this adds an option which allows creating a
trace instance as a backup of the persistent ring buffer at boot. If the
user specifies trace_instance=<new-instance>=<original-instance>, then
the new instance is made as a copy of the original instance.

For example, the below kernel cmdline records all syscall, scheduler and
interrupt events on the persistent ring buffer `boot_map`, but before
starting the tracing, it makes a `backup` instance from the `boot_map`.
Thus, the `backup` instance has the previous boot events.

 'reserve_mem=12M:4M:trace trace_instance=boot_map@trace,syscalls:*,sched:*,irq:* trace_instance=backup=boot_map'

As you can see, this just makes a copy of the entire reserved area and
makes a backup instance on it. So you can release (or shrink) the backup
instance after using it, to save memory.

 /sys/kernel/tracing/instances # free
               total        used        free      shared  buff/cache   available
 Mem:        1999284       55704     1930520       10132       13060     1914628
 Swap:             0           0           0
 /sys/kernel/tracing/instances # rmdir backup/
 /sys/kernel/tracing/instances # free
               total        used        free      shared  buff/cache   available
 Mem:        1999284       40640     1945584       10132       13060     1929692
 Swap:             0           0           0

Note: since there is no reason to make a copy of an empty buffer, this
backup only accepts a persistent ring buffer as the original instance.
Also, since this backup is based on vmalloc(), it does not support
user-space mmap().
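Broken down, the example command line above does three things (the
annotations are ours, not part of the patch):

	reserve_mem=12M:4M:trace
		<- reserve 12M of memory (4M aligned) under the name "trace"
	trace_instance=boot_map@trace,syscalls:*,sched:*,irq:*
		<- create the persistent instance on that region, with the
		   syscall, scheduler and interrupt events enabled
	trace_instance=backup=boot_map
		<- take a vmalloc()'ed copy of boot_map at boot as "backup"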
Cc: Mathieu Desnoyers
Link: https://patch.msgid.link/176377150002.219692.9425536150438129267.stgit@devnote2
Signed-off-by: Masami Hiramatsu (Google)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.c | 63 +++++++++++++++++++++++++++++++++++++++-----
 kernel/trace/trace.h |  1 +
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 032bdedca5d9..73f8b79f1b0c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9004,8 +9004,8 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
 	struct trace_iterator *iter = &info->iter;
 	int ret = 0;
 
-	/* A memmap'ed buffer is not supported for user space mmap */
-	if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP)
+	/* Memmap'ed and backup buffers are not supported for user space mmap */
+	if (iter->tr->flags & (TRACE_ARRAY_FL_MEMMAP | TRACE_ARRAY_FL_VMALLOC))
 		return -ENODEV;
 
 	ret = get_snapshot_map(iter->tr);
@@ -10520,6 +10520,8 @@ static int __remove_instance(struct trace_array *tr)
 		reserve_mem_release_by_name(tr->range_name);
 		kfree(tr->range_name);
 	}
+	if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+		vfree((void *)tr->range_addr_start);
 
 	for (i = 0; i < tr->nr_topts; i++) {
 		kfree(tr->topts[i].topts);
@@ -11325,6 +11327,42 @@ __init static void do_allocate_snapshot(const char *name)
 static inline void do_allocate_snapshot(const char *name) { }
 #endif
 
+__init static int backup_instance_area(const char *backup,
+				       unsigned long *addr, phys_addr_t *size)
+{
+	struct trace_array *backup_tr;
+	void *allocated_vaddr = NULL;
+
+	backup_tr = trace_array_get_by_name(backup, NULL);
+	if (!backup_tr) {
+		pr_warn("Tracing: Instance %s is not found.\n", backup);
+		return -ENOENT;
+	}
+
+	if (!(backup_tr->flags & TRACE_ARRAY_FL_BOOT)) {
+		pr_warn("Tracing: Instance %s is not boot mapped.\n", backup);
+		trace_array_put(backup_tr);
+		return -EINVAL;
+	}
+
+	*size = backup_tr->range_addr_size;
+
+	allocated_vaddr = vzalloc(*size);
+	if (!allocated_vaddr) {
+		pr_warn("Tracing: Failed to allocate memory for copying instance %s (size 0x%lx)\n",
+			backup, (unsigned long)*size);
+		trace_array_put(backup_tr);
+		return -ENOMEM;
+	}
+
+	memcpy(allocated_vaddr,
+	       (void *)backup_tr->range_addr_start, (size_t)*size);
+	*addr = (unsigned long)allocated_vaddr;
+
+	trace_array_put(backup_tr);
+	return 0;
+}
+
 __init static void enable_instances(void)
 {
 	struct trace_array *tr;
@@ -11347,11 +11385,15 @@ __init static void enable_instances(void)
 		char *flag_delim;
 		char *addr_delim;
 		char *rname __free(kfree) = NULL;
+		char *backup;
 
 		tok = strsep(&curr_str, ",");
 
-		flag_delim = strchr(tok, '^');
-		addr_delim = strchr(tok, '@');
+		name = strsep(&tok, "=");
+		backup = tok;
+
+		flag_delim = strchr(name, '^');
+		addr_delim = strchr(name, '@');
 
 		if (addr_delim)
 			*addr_delim++ = '\0';
@@ -11359,7 +11401,10 @@ __init static void enable_instances(void)
 		if (flag_delim)
 			*flag_delim++ = '\0';
 
-		name = tok;
+		if (backup) {
+			if (backup_instance_area(backup, &addr, &size) < 0)
+				continue;
+		}
 
 		if (flag_delim) {
 			char *flag;
@@ -11455,7 +11500,13 @@ __init static void enable_instances(void)
 			tr->ref++;
 		}
 
-		if (start) {
+		/*
+		 * Backup buffers can be freed but need vfree().
+		 */
+		if (backup)
+			tr->flags |= TRACE_ARRAY_FL_VMALLOC;
+
+		if (start || backup) {
 			tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
 			tr->range_name = no_free_ptr(rname);
 		}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a3aa225ed50a..666f9a2c189d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -454,6 +454,7 @@ enum {
 	TRACE_ARRAY_FL_LAST_BOOT	= BIT(2),
 	TRACE_ARRAY_FL_MOD_INIT		= BIT(3),
 	TRACE_ARRAY_FL_MEMMAP		= BIT(4),
+	TRACE_ARRAY_FL_VMALLOC		= BIT(5),
 };
 
 #ifdef CONFIG_MODULES

From f83ac7544fbf7ba3f77c122e16ab5319f75bbdfd Mon Sep 17 00:00:00 2001
From: pengdonglin
Date: Tue, 25 Nov 2025 17:34:25 +0800
Subject: [PATCH 41/42] function_graph: Enable funcgraph-args and
 funcgraph-retaddr to work simultaneously

Currently, the funcgraph-args and funcgraph-retaddr features are
mutually exclusive. Resolve this limitation by allowing a
funcgraph-retaddr entry to also carry an args array.

To verify the change, use perf to trace vfs_write with both options
enabled:

Before:

 # perf ftrace -G vfs_write --graph-opts args,retaddr
 ......
            down_read() { /* <-n_tty_write+0xa3/0x540 */
              __cond_resched(); /* <-down_read+0x12/0x160 */
              preempt_count_add(); /* <-down_read+0x3b/0x160 */
              preempt_count_sub(); /* <-down_read+0x8b/0x160 */
            }

After:

 # perf ftrace -G vfs_write --graph-opts args,retaddr
 ......
            down_read(sem=0xffff8880100bea78) { /* <-n_tty_write+0xa3/0x540 */
              __cond_resched(); /* <-down_read+0x12/0x160 */
              preempt_count_add(val=1); /* <-down_read+0x3b/0x160 */
              preempt_count_sub(val=1); /* <-down_read+0x8b/0x160 */
            }

Cc: Steven Rostedt (Google)
Cc: Sven Schnelle
Cc: Masami Hiramatsu
Cc: Xiaoqin Zhang
Link: https://patch.msgid.link/20251125093425.2563849-1-dolinux.peng@gmail.com
Signed-off-by: pengdonglin
Signed-off-by: Steven Rostedt (Google)
---
 include/linux/ftrace.h               |  7 +--
 kernel/trace/trace.h                 | 24 +++++++++-
 kernel/trace/trace_entries.h         | 15 +++---
 kernel/trace/trace_functions_graph.c | 71 ++++++++++++++++++----------
 4 files changed, 80 insertions(+), 37 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7ded7df6e9b5..6ca9c6229d93 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1126,17 +1126,14 @@ static inline void ftrace_init(void) { }
  */
 struct ftrace_graph_ent {
 	unsigned long func; /* Current function */
-	int depth;
+	unsigned long depth;
 } __packed;
 
 /*
  * Structure that defines an entry function trace with retaddr.
- * It's already packed but the attribute "packed" is needed
- * to remove extra padding at the end.
  */
 struct fgraph_retaddr_ent {
-	unsigned long func; /* Current function */
-	int depth;
+	struct ftrace_graph_ent ent;
 	unsigned long retaddr; /* Return address */
 } __packed;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 666f9a2c189d..c2b61bcd912f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -964,7 +964,8 @@ extern int __trace_graph_entry(struct trace_array *tr,
 extern int __trace_graph_retaddr_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned int trace_ctx,
-				unsigned long retaddr);
+				unsigned long retaddr,
+				struct ftrace_regs *fregs);
 extern void __trace_graph_return(struct trace_array *tr,
 				 struct ftrace_graph_ret *trace,
 				 unsigned int trace_ctx,
@@ -2276,4 +2277,25 @@ static inline int rv_init_interface(void)
  */
 #define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX)
 
+/*
+ * This is used to get the address of the args array based on
+ * the type of the entry.
+ */
+#define FGRAPH_ENTRY_ARGS(e)						\
+	({								\
+		unsigned long *_args;					\
+		struct ftrace_graph_ent_entry *_e = e;			\
+									\
+		if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&	\
+		    e->ent.type == TRACE_GRAPH_RETADDR_ENT) {		\
+			struct fgraph_retaddr_ent_entry *_re;		\
+									\
+			_re = (typeof(_re))_e;				\
+			_args = _re->args;				\
+		} else {						\
+			_args = _e->args;				\
+		}							\
+		_args;							\
+	})
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index de294ae2c5c5..f6a8d29c0d76 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
 	F_STRUCT(
 		__field_struct(	struct ftrace_graph_ent,	graph_ent	)
 		__field_packed(	unsigned long,	graph_ent,	func	)
-		__field_packed(	unsigned int,	graph_ent,	depth	)
+		__field_packed(	unsigned long,	graph_ent,	depth	)
 		__dynamic_array(unsigned long,	args	)
 	),
 
-	F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth)
+	F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth)
 );
 
 #ifdef CONFIG_FUNCTION_GRAPH_RETADDR
@@ -95,13 +95,14 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
 	TRACE_GRAPH_RETADDR_ENT,
 
 	F_STRUCT(
-		__field_struct(	struct fgraph_retaddr_ent,	graph_ent	)
-		__field_packed(	unsigned long,	graph_ent,	func	)
-		__field_packed(	unsigned int,	graph_ent,	depth	)
-		__field_packed(	unsigned long,	graph_ent,	retaddr	)
+		__field_struct(	struct fgraph_retaddr_ent,	graph_rent	)
+		__field_packed(	unsigned long,	graph_rent.ent,	func	)
+		__field_packed(	unsigned long,	graph_rent.ent,	depth	)
+		__field_packed(	unsigned long,	graph_rent,	retaddr	)
+		__dynamic_array(unsigned long,	args	)
 	),
 
-	F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth,
+	F_printk("--> %ps (%lu) <- %ps", (void *)__entry->func, __entry->depth,
 		 (void *)__entry->retaddr)
 );
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d0513cfcd936..17c75cf2348e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -36,14 +36,19 @@ struct fgraph_ent_args {
 	unsigned long args[FTRACE_REGS_MAX_ARGS];
 };
 
+struct fgraph_retaddr_ent_args {
+	struct fgraph_retaddr_ent_entry ent;
+	/* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */
+	unsigned long args[FTRACE_REGS_MAX_ARGS];
+};
+
 struct fgraph_data {
 	struct fgraph_cpu_data __percpu *cpu_data;
 
 	/* Place to preserve last processed entry. */
 	union {
 		struct fgraph_ent_args		ent;
-		/* TODO allow retaddr to have args */
-		struct fgraph_retaddr_ent_entry	rent;
+		struct fgraph_retaddr_ent_args	rent;
 	};
 	struct ftrace_graph_ret_entry	ret;
 	int				failed;
@@ -160,20 +165,32 @@ int __trace_graph_entry(struct trace_array *tr,
 int __trace_graph_retaddr_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned int trace_ctx,
-				unsigned long retaddr)
+				unsigned long retaddr,
+				struct ftrace_regs *fregs)
 {
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct fgraph_retaddr_ent_entry *entry;
+	int size;
+
+	/* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */
+	size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long));
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
-					  sizeof(*entry), trace_ctx);
+					  size, trace_ctx);
 	if (!event)
 		return 0;
 	entry = ring_buffer_event_data(event);
-	entry->graph_ent.func = trace->func;
-	entry->graph_ent.depth = trace->depth;
-	entry->graph_ent.retaddr = retaddr;
+	entry->graph_rent.ent = *trace;
+	entry->graph_rent.retaddr = retaddr;
+
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+	if (fregs) {
+		for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+			entry->args[i] = ftrace_regs_get_argument(fregs, i);
+	}
+#endif
+
 	trace_buffer_unlock_commit_nostack(buffer, event);
 
 	return 1;
@@ -182,7 +199,8 @@ int __trace_graph_retaddr_entry(struct trace_array *tr,
 int __trace_graph_retaddr_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned int trace_ctx,
-				unsigned long retaddr)
+				unsigned long retaddr,
+				struct ftrace_regs *fregs)
 {
 	return 1;
 }
@@ -267,7 +285,8 @@ static int graph_entry(struct ftrace_graph_ent *trace,
 	if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
 	    tracer_flags_is_set(tr, TRACE_GRAPH_PRINT_RETADDR)) {
 		unsigned long retaddr = ftrace_graph_top_ret_addr(current);
-		ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
+		ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx,
+						  retaddr, fregs);
 	} else {
 		ret = __graph_entry(tr, trace, trace_ctx, fregs);
 	}
@@ -654,13 +673,9 @@ get_return_for_leaf(struct trace_iterator *iter,
 	 * Save current and next entries for later reference
 	 * if the output fails.
 	 */
-	if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) {
-		data->rent = *(struct fgraph_retaddr_ent_entry *)curr;
-	} else {
-		int size = min((int)sizeof(data->ent), (int)iter->ent_size);
+	int size = min_t(int, sizeof(data->rent), iter->ent_size);
 
-		memcpy(&data->ent, curr, size);
-	}
+	memcpy(&data->rent, curr, size);
 
 	/*
 	 * If the next event is not a return type, then
 	 * we only care about what type it is. Otherwise we can
@@ -838,7 +853,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e
 	trace_seq_puts(s, " /*");
 	trace_seq_puts(s, " <-");
-	seq_print_ip_sym_offset(s, entry->graph_ent.retaddr, trace_flags);
+	seq_print_ip_sym_offset(s, entry->graph_rent.retaddr, trace_flags);
 	if (comment)
 		trace_seq_puts(s, " */");
@@ -984,7 +999,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	trace_seq_printf(s, "%ps", (void *)ret_func);
 
 	if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
-		print_function_args(s, entry->args, ret_func);
+		print_function_args(s, FGRAPH_ENTRY_ARGS(entry), ret_func);
 		trace_seq_putc(s, ';');
 	} else
 		trace_seq_puts(s, "();");
@@ -1036,7 +1051,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
 	args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
 
 	if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
-		print_function_args(s, entry->args, func);
+		print_function_args(s, FGRAPH_ENTRY_ARGS(entry), func);
 	else
 		trace_seq_puts(s, "()");
@@ -1218,11 +1233,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	/*
 	 * print_graph_entry() may consume the current event,
 	 * thus @field may become invalid, so we need to save it.
-	 * sizeof(struct ftrace_graph_ent_entry) is very small,
-	 * it can be safely saved at the stack.
+	 * This function is shared by ftrace_graph_ent_entry and
+	 * fgraph_retaddr_ent_entry, the size of the latter one
+	 * is larger, but it is very small and can be safely saved
+	 * at the stack.
 	 */
 	struct ftrace_graph_ent_entry *entry;
-	u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
+	struct fgraph_retaddr_ent_entry *rentry;
+	u8 save_buf[sizeof(*rentry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
 
 	/* The ent_size is expected to be as big as the entry */
 	if (iter->ent_size > sizeof(save_buf))
@@ -1451,12 +1469,17 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 	}
 #ifdef CONFIG_FUNCTION_GRAPH_RETADDR
 	case TRACE_GRAPH_RETADDR_ENT: {
-		struct fgraph_retaddr_ent_entry saved;
+		/*
+		 * ftrace_graph_ent_entry and fgraph_retaddr_ent_entry have
+		 * similar functions and memory layouts. The only difference
+		 * is that the latter one has an extra retaddr member, so
+		 * they can share most of the logic.
+		 */
 		struct fgraph_retaddr_ent_entry *rfield;
 
 		trace_assign_type(rfield, entry);
-		saved = *rfield;
-		return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
+		return print_graph_entry((struct ftrace_graph_ent_entry *)rfield,
+					 s, iter, flags);
 	}
 #endif
 	case TRACE_GRAPH_RET: {

From f6ed9c5d3190cf18382ee75e0420602101f53586 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 26 Nov 2025 14:52:49 -0500
Subject: [PATCH 42/42] overflow: Introduce struct_offset() to get offset of member

The trace_marker_raw file in tracefs takes a buffer from user space
that contains an id as well as a raw data string, which is usually a
binary structure. The structure used is the following:

	struct raw_data_entry {
		struct trace_entry	ent;
		unsigned int		id;
		char			buf[];
	};

Since the passed-in "cnt" variable covers both the size of buf and the
size of id, the code that allocates the location on the ring buffer
had:

	size = struct_size(entry, buf, cnt - sizeof(entry->id));

which is quite ugly and hard to understand. Instead, add a helper macro
called struct_offset(), which turns the above into the simple and easy
to understand:

	size = struct_offset(entry, id) + cnt;

This will likely come in handy for other use cases too.
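To see why the two computations agree, here is a minimal user-space
sketch; the raw_data_entry layout is taken from above, while the
stand-in trace_entry, the main() harness, and the simplified
struct_size() expansion (its overflow checks are dropped) are
illustrative only:

	#include <assert.h>
	#include <stddef.h>

	struct trace_entry { unsigned short type; };	/* stand-in; the real struct is larger */

	struct raw_data_entry {
		struct trace_entry ent;
		unsigned int id;
		char buf[];
	};

	/* The new helper: offset of @member inside the struct that @p points to */
	#define struct_offset(p, member) (offsetof(typeof(*(p)), member))

	int main(void)
	{
		struct raw_data_entry *entry = NULL;	/* only used for type information */
		size_t cnt = 32;			/* sizeof(entry->id) plus the raw payload */

		/* old: offset of buf[] plus the payload portion of cnt */
		size_t old_size = offsetof(typeof(*entry), buf) + (cnt - sizeof(entry->id));
		/* new: offset of id plus all of cnt */
		size_t new_size = struct_offset(entry, id) + cnt;

		/* equal because buf[] starts right after id, with no padding between them */
		assert(old_size == new_size);
		return 0;
	}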
Link: https://lore.kernel.org/all/CAHk-=whYZVoEdfO1PmtbirPdBMTV9Nxt9f09CK0k6S+HJD3Zmg@mail.gmail.com/
Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Cc: "Gustavo A. R. Silva"
Link: https://patch.msgid.link/20251126145249.05b1770a@gandalf.local.home
Suggested-by: Linus Torvalds
Reviewed-by: Kees Cook
Signed-off-by: Steven Rostedt (Google)
---
 include/linux/overflow.h | 12 ++++++++++++
 kernel/trace/trace.c     |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 725f95f7e416..736f633b2d5f 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -458,6 +458,18 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
 #define struct_size_t(type, member, count)	\
 	struct_size((type *)NULL, member, count)
 
+/**
+ * struct_offset() - Calculate the offset of a member within a struct
+ * @p: Pointer to the struct
+ * @member: Name of the member to get the offset of
+ *
+ * Calculates the offset of a particular @member of the structure pointed
+ * to by @p.
+ *
+ * Return: number of bytes to the location of @member.
+ */
+#define struct_offset(p, member) (offsetof(typeof(*(p)), member))
+
 /**
  * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
  * Enables caller macro to pass arbitrary trailing expressions
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 73f8b79f1b0c..3d433a426e5f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7642,7 +7642,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
 	size_t size;
 
 	/* cnt includes both the entry->id and the data behind it. */
-	size = struct_size(entry, buf, cnt - sizeof(entry->id));
+	size = struct_offset(entry, id) + cnt;
 
 	buffer = tr->array_buffer.buffer;