From 0de4c70d04a46a3c266547dd4275ce25f623796a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 25 Sep 2025 09:56:47 +0900 Subject: [PATCH 01/12] tracing: fprobe: use rhltable for fprobe_ip_table For now, all the kernel functions who are hooked by the fprobe will be added to the hash table "fprobe_ip_table". The key of it is the function address, and the value of it is "struct fprobe_hlist_node". The budget of the hash table is FPROBE_IP_TABLE_SIZE, which is 256. And this means the overhead of the hash table lookup will grow linearly if the count of the functions in the fprobe more than 256. When we try to hook all the kernel functions, the overhead will be huge. Therefore, replace the hash table with rhltable to reduce the overhead. Link: https://lore.kernel.org/all/20250819031825.55653-1-dongml2@chinatelecom.cn/ Signed-off-by: Menglong Dong Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 3 +- kernel/trace/fprobe.c | 159 ++++++++++++++++++++++++----------------- 2 files changed, 94 insertions(+), 68 deletions(-) diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 7964db96e41a..0a3bcd1718f3 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -7,6 +7,7 @@ #include #include #include +#include #include struct fprobe; @@ -26,7 +27,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, * @fp: The fprobe which owns this. */ struct fprobe_hlist_node { - struct hlist_node hlist; + struct rhlist_head hlist; unsigned long addr; struct fprobe *fp; }; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 5a807d62e76d..e063e22e1134 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -41,60 +42,67 @@ * - RCU hlist traversal under disabling preempt */ static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; -static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; +static struct rhltable fprobe_ip_table; static DEFINE_MUTEX(fprobe_mutex); -/* - * Find first fprobe in the hlist. It will be iterated twice in the entry - * probe, once for correcting the total required size, the second time is - * calling back the user handlers. - * Thus the hlist in the fprobe_table must be sorted and new probe needs to - * be added *before* the first fprobe. 
- */ -static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) +static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed) { - struct fprobe_hlist_node *node; - struct hlist_head *head; - - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (node->addr == ip) - return node; - } - return NULL; + return hash_ptr(*(unsigned long **)data, 32); } -NOKPROBE_SYMBOL(find_first_fprobe_node); + +static int fprobe_node_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + unsigned long key = *(unsigned long *)arg->key; + const struct fprobe_hlist_node *n = ptr; + + return n->addr != key; +} + +static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct fprobe_hlist_node *n = data; + + return hash_ptr((void *)n->addr, 32); +} + +static const struct rhashtable_params fprobe_rht_params = { + .head_offset = offsetof(struct fprobe_hlist_node, hlist), + .key_offset = offsetof(struct fprobe_hlist_node, addr), + .key_len = sizeof_field(struct fprobe_hlist_node, addr), + .hashfn = fprobe_node_hashfn, + .obj_hashfn = fprobe_node_obj_hashfn, + .obj_cmpfn = fprobe_node_cmp, + .automatic_shrinking = true, +}; /* Node insertion and deletion requires the fprobe_mutex */ -static void insert_fprobe_node(struct fprobe_hlist_node *node) +static int insert_fprobe_node(struct fprobe_hlist_node *node) { - unsigned long ip = node->addr; - struct fprobe_hlist_node *next; - struct hlist_head *head; - lockdep_assert_held(&fprobe_mutex); - next = find_first_fprobe_node(ip); - if (next) { - hlist_add_before_rcu(&node->hlist, &next->hlist); - return; - } - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_add_head_rcu(&node->hlist, head); + return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); } /* Return true if there are synonims */ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); + bool ret; /* Avoid double deleting */ if (READ_ONCE(node->fp) != NULL) { WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + rhltable_remove(&fprobe_ip_table, &node->hlist, + fprobe_rht_params); } - return !!find_first_fprobe_node(node->addr); + + rcu_read_lock(); + ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr, + fprobe_rht_params); + rcu_read_unlock(); + + return ret; } /* Check existence of the fprobe */ @@ -249,9 +257,10 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs) { - struct fprobe_hlist_node *node, *first; unsigned long *fgraph_data = NULL; unsigned long func = trace->func; + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; unsigned long ret_ip; int reserved_words; struct fprobe *fp; @@ -260,14 +269,11 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, if (WARN_ON_ONCE(!fregs)) return 0; - first = node = find_first_fprobe_node(func); - if (unlikely(!first)) - return 0; - + head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params); reserved_words = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (!fp || !fp->exit_handler) continue; @@ -278,13 +284,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, 
reserved_words += FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); } - node = first; if (reserved_words) { fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); if (unlikely(!fgraph_data)) { - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (fp && !fprobe_disabled(fp)) fp->nmissed++; @@ -299,12 +304,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, */ ret_ip = ftrace_regs_get_return_address(fregs); used = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { int data_size; void *data; if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (!fp || fprobe_disabled(fp)) continue; @@ -449,25 +454,21 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad return 0; } -static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, - struct fprobe_addr_list *alist) +static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, + struct fprobe_addr_list *alist) { - struct fprobe_hlist_node *node; int ret = 0; - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (!within_module(node->addr, mod)) - continue; - if (delete_fprobe_node(node)) - continue; - /* - * If failed to update alist, just continue to update hlist. - * Therefore, at list user handler will not hit anymore. - */ - if (!ret) - ret = fprobe_addr_list_add(alist, node->addr); - } + if (!within_module(node->addr, mod)) + return; + if (delete_fprobe_node(node)) + return; + /* + * If failed to update alist, just continue to update hlist. + * Therefore, at list user handler will not hit anymore. + */ + if (!ret) + ret = fprobe_addr_list_add(alist, node->addr); } /* Handle module unloading to manage fprobe_ip_table. 
*/ @@ -475,8 +476,9 @@ static int fprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct fprobe_hlist_node *node; + struct rhashtable_iter iter; struct module *mod = data; - int i; if (val != MODULE_STATE_GOING) return NOTIFY_DONE; @@ -487,8 +489,16 @@ static int fprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; mutex_lock(&fprobe_mutex); - for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) - fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + rhltable_walk_enter(&fprobe_ip_table, &iter); + do { + rhashtable_walk_start(&iter); + + while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) + fprobe_remove_node_in_module(mod, node, &alist); + + rhashtable_walk_stop(&iter); + } while (node == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); if (alist.index > 0) ftrace_set_filter_ips(&fprobe_graph_ops.ops, @@ -728,8 +738,16 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) ret = fprobe_graph_add_ips(addrs, num); if (!ret) { add_fprobe_hash(fp); - for (i = 0; i < hlist_array->size; i++) - insert_fprobe_node(&hlist_array->array[i]); + for (i = 0; i < hlist_array->size; i++) { + ret = insert_fprobe_node(&hlist_array->array[i]); + if (ret) + break; + } + /* fallback on insert error */ + if (ret) { + for (i--; i >= 0; i--) + delete_fprobe_node(&hlist_array->array[i]); + } } mutex_unlock(&fprobe_mutex); @@ -825,3 +843,10 @@ int unregister_fprobe(struct fprobe *fp) return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); + +static int __init fprobe_initcall(void) +{ + rhltable_init(&fprobe_ip_table, &fprobe_rht_params); + return 0; +} +late_initcall(fprobe_initcall); From f959ecdfcb6b1c4d01f380c35ed0c0d99fc1ced9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 25 Sep 2025 09:56:47 +0900 Subject: [PATCH 02/12] tracing: probes: Use __free() for trace_probe_log Use __free() for trace_probe_log_clear() to cleanup error log interface. 
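For readers unfamiliar with the scope-based cleanup helpers in <linux/cleanup.h>, the idea is roughly the sketch below; my_log_init()/my_log_clear() are placeholder names standing in for the trace_probe_log helpers, and trace_probe_log_init() only gains a return value (the subsystem string) so that the __free()-annotated variable has something non-NULL to track.

#include <linux/cleanup.h>

const char *my_log_init(const char *subsys, int argc, const char **argv);
void my_log_clear(void);

/* Run my_log_clear() when the annotated variable goes out of scope. */
DEFINE_FREE(my_log_clear, const char *, if (_T) my_log_clear())

static int parse_command(int argc, const char **argv)
{
	/* Stays NULL until the log is set up, so earlier returns skip cleanup. */
	const char *log __free(my_log_clear) = NULL;

	if (argc < 2)
		return -EINVAL;		/* nothing to clean up yet */

	log = my_log_init("my_subsys", argc, argv);

	if (!argv[1][0])
		return -EINVAL;		/* my_log_clear() runs automatically */

	return 0;			/* ...and on the success path too */
}

With that in place, every return after trace_probe_log_init() clears the log automatically, which is what allows the explicit trace_probe_log_clear() calls in the error paths to be dropped.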
Link: https://lore.kernel.org/all/175509538609.193596.16646724647358218778.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 5 ++--- kernel/trace/trace_probe.c | 3 ++- kernel/trace/trace_probe.h | 4 +++- kernel/trace/trace_uprobe.c | 6 ++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a1d402124836..aaba765d54cf 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -874,6 +874,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) * Fetch args (no space): * =$[:TYPE] */ + const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; const char *sys_event = NULL, *sys_name = NULL; struct trace_event_call *event_call; @@ -887,7 +888,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (argc < 2 || argv[0][0] != 'e') return -ECANCELED; - trace_probe_log_init("event_probe", argc, argv); + trlog = trace_probe_log_init("event_probe", argc, argv); event = strchr(&argv[0][1], ':'); if (event) { @@ -987,7 +988,6 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } } - trace_probe_log_clear(); return ret; mem_error: @@ -996,7 +996,6 @@ static int __trace_eprobe_create(int argc, const char *argv[]) parse_error: ret = -EINVAL; error: - trace_probe_log_clear(); trace_event_probe_cleanup(ep); return ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 5cbdc423afeb..5b92376a58fc 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -156,7 +156,7 @@ static const struct fetch_type *find_fetch_type(const char *type, unsigned long static struct trace_probe_log trace_probe_log; extern struct mutex dyn_event_ops_mutex; -void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv) { lockdep_assert_held(&dyn_event_ops_mutex); @@ -164,6 +164,7 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) trace_probe_log.argc = argc; trace_probe_log.argv = argv; trace_probe_log.index = 0; + return subsystem; } void trace_probe_log_clear(void) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 08b5bda24da2..9fc56c937130 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -578,11 +578,13 @@ struct trace_probe_log { int index; }; -void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +const char *trace_probe_log_init(const char *subsystem, int argc, const char **argv); void trace_probe_log_set_index(int index); void trace_probe_log_clear(void); void __trace_probe_log_err(int offset, int err); +DEFINE_FREE(trace_probe_log_clear, const char *, if (_T) trace_probe_log_clear()) + #define trace_probe_log_err(offs, err) \ __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 430d09c49462..b3d39b5b0690 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -539,6 +539,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) */ static int __trace_uprobe_create(int argc, const char **argv) { + const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; unsigned long offset, ref_ctr_offset; @@ -565,7 +566,7 @@ static int __trace_uprobe_create(int 
argc, const char **argv) if (argc < 2) return -ECANCELED; - trace_probe_log_init("trace_uprobe", argc, argv); + trlog = trace_probe_log_init("trace_uprobe", argc, argv); if (argc - 2 > MAX_TRACE_ARGS) { trace_probe_log_set_index(2); @@ -597,7 +598,6 @@ static int __trace_uprobe_create(int argc, const char **argv) if (ret) { trace_probe_log_err(0, FILE_NOT_FOUND); kfree(filename); - trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { @@ -728,14 +728,12 @@ static int __trace_uprobe_create(int argc, const char **argv) error: free_trace_uprobe(tu); out: - trace_probe_log_clear(); return ret; fail_mem: ret = -ENOMEM; fail_address_parse: - trace_probe_log_clear(); path_put(&path); kfree(filename); From 0d6edbc9a41557621bb68829bdbdd551bd927961 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 25 Sep 2025 09:56:47 +0900 Subject: [PATCH 03/12] tracing: eprobe: Cleanup eprobe event using __free() Use __free(trace_event_probe_cleanup) to remove unneeded gotos and cleanup the last part of trace_eprobe_parse_filter(). Link: https://lore.kernel.org/all/175509539571.193596.4674012182718751429.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 71 ++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index aaba765d54cf..f7a1ff509d7e 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -61,6 +61,9 @@ static void trace_event_probe_cleanup(struct trace_eprobe *ep) kfree(ep); } +DEFINE_FREE(trace_event_probe_cleanup, struct trace_eprobe *, + if (!IS_ERR_OR_NULL(_T)) trace_event_probe_cleanup(_T)) + static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) { return container_of(ev, struct trace_eprobe, devent); @@ -197,10 +200,10 @@ static struct trace_eprobe *alloc_event_probe(const char *group, struct trace_event_call *event, int nargs) { - struct trace_eprobe *ep; + struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL; const char *event_name; const char *sys_name; - int ret = -ENOMEM; + int ret; if (!event) return ERR_PTR(-ENODEV); @@ -211,25 +214,22 @@ static struct trace_eprobe *alloc_event_probe(const char *group, ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); if (!ep) { trace_event_put_ref(event); - goto error; + return ERR_PTR(-ENOMEM); } ep->event = event; ep->event_name = kstrdup(event_name, GFP_KERNEL); if (!ep->event_name) - goto error; + return ERR_PTR(-ENOMEM); ep->event_system = kstrdup(sys_name, GFP_KERNEL); if (!ep->event_system) - goto error; + return ERR_PTR(-ENOMEM); ret = trace_probe_init(&ep->tp, this_event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); - return ep; -error: - trace_event_probe_cleanup(ep); - return ERR_PTR(ret); + return_ptr(ep); } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -856,13 +856,10 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, true, &dummy); free_event_filter(dummy); - if (ret) - goto error; - - return 0; -error: - kfree(ep->filter_str); - ep->filter_str = NULL; + if (ret) { + kfree(ep->filter_str); + ep->filter_str = NULL; + } return ret; } @@ -874,6 +871,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) * Fetch args (no space): * =$[:TYPE] */ + struct trace_eprobe *ep 
__free(trace_event_probe_cleanup) = NULL; const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; const char *sys_event = NULL, *sys_name = NULL; @@ -881,7 +879,6 @@ static int __trace_eprobe_create(int argc, const char *argv[]) char *buf1 __free(kfree) = NULL; char *buf2 __free(kfree) = NULL; char *gbuf __free(kfree) = NULL; - struct trace_eprobe *ep = NULL; int ret = 0, filter_idx = 0; int i, filter_cnt; @@ -894,12 +891,12 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (event) { gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!gbuf) - goto mem_error; + return -ENOMEM; event++; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return -EINVAL; } trace_probe_log_set_index(1); @@ -907,18 +904,18 @@ static int __trace_eprobe_create(int argc, const char *argv[]) buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!buf2) - goto mem_error; + return -ENOMEM; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); - goto parse_error; + return -EINVAL; } if (!event) { buf1 = kstrdup(sys_event, GFP_KERNEL); if (!buf1) - goto mem_error; + return -ENOMEM; event = buf1; } @@ -934,8 +931,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (argc - 2 > MAX_TRACE_ARGS) { trace_probe_log_set_index(2); trace_probe_log_err(0, TOO_MANY_ARGS); - ret = -E2BIG; - goto error; + return -E2BIG; } scoped_guard(mutex, &event_mutex) { @@ -949,15 +945,14 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_ATTACH_EVENT); /* This must return -ENOMEM or missing event, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); - ep = NULL; - goto error; + return ret; } if (filter_idx) { trace_probe_log_set_index(filter_idx); ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); if (ret) - goto parse_error; + return -EINVAL; } else ep->filter_str = NULL; @@ -967,11 +962,12 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(i + 2); ret = trace_eprobe_tp_update_arg(ep, argv, i); if (ret) - goto error; + return ret; } ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); if (ret < 0) - goto error; + return ret; + init_trace_eprobe_call(ep); scoped_guard(mutex, &event_mutex) { ret = trace_probe_register_event_call(&ep->tp); @@ -980,24 +976,17 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(0); trace_probe_log_err(0, EVENT_EXIST); } - goto error; + return ret; } ret = dyn_event_add(&ep->devent, &ep->tp.event->call); if (ret < 0) { trace_probe_unregister_event_call(&ep->tp); - goto error; + return ret; } + /* To avoid freeing registered eprobe event, clear ep. */ + ep = NULL; } return ret; - -mem_error: - ret = -ENOMEM; - goto error; -parse_error: - ret = -EINVAL; -error: - trace_event_probe_cleanup(ep); - return ret; } /* From 8b658df206586f85cf19097068660203fa1253bd Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 25 Sep 2025 09:56:47 +0900 Subject: [PATCH 04/12] tracing: uprobes: Cleanup __trace_uprobe_create() with __free() Use __free() to cleanup ugly gotos in __trace_uprobe_create(). 
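The non-obvious part of a conversion like this is the success path: ownership of the filename buffer is handed over with no_free_ptr(), and the local struct path is zeroed after being copied into tu->path so that the scoped path_put() sees an empty path. A minimal sketch of the pointer case (struct my_probe and my_probe_set_filename() are illustrative names, not from this patch):

#include <linux/cleanup.h>
#include <linux/slab.h>
#include <linux/string.h>

struct my_probe {
	char *filename;
};

static int my_probe_set_filename(struct my_probe *p, const char *arg)
{
	char *filename __free(kfree) = kstrdup(arg, GFP_KERNEL);

	if (!filename)
		return -ENOMEM;

	if (!strchr(filename, ':'))
		return -EINVAL;		/* kfree(filename) runs automatically */

	/* Success: transfer ownership and disarm the scoped kfree(). */
	p->filename = no_free_ptr(filename);
	return 0;
}

Every error return before the transfer then frees the buffer without needing a goto label.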
Link: https://lore.kernel.org/all/175509540482.193596.6541098946023873304.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 68 ++++++++++++++----------------------- 1 file changed, 26 insertions(+), 42 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b3d39b5b0690..d1d9abbe6248 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -533,22 +533,25 @@ static int register_trace_uprobe(struct trace_uprobe *tu) return ret; } +DEFINE_FREE(free_trace_uprobe, struct trace_uprobe *, if (_T) free_trace_uprobe(_T)) + /* * Argument syntax: * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { + struct trace_uprobe *tu __free(free_trace_uprobe) = NULL; const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; - char *arg, *filename, *rctr, *rctr_end, *tmp; + struct path path __free(path_put) = {}; unsigned long offset, ref_ctr_offset; + char *filename __free(kfree) = NULL; + char *arg, *rctr, *rctr_end, *tmp; char *gbuf __free(kfree) = NULL; char *buf __free(kfree) = NULL; enum probe_print_type ptype; - struct trace_uprobe *tu; bool is_return = false; - struct path path; int i, ret; ref_ctr_offset = 0; @@ -586,10 +589,8 @@ static int __trace_uprobe_create(int argc, const char **argv) /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(filename, ':'); - if (!arg || !isdigit(arg[1])) { - kfree(filename); + if (!arg || !isdigit(arg[1])) return -ECANCELED; - } trace_probe_log_set_index(1); /* filename is the 2nd argument */ @@ -597,13 +598,11 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { trace_probe_log_err(0, FILE_NOT_FOUND); - kfree(filename); return ret; } if (!d_is_reg(path.dentry)) { trace_probe_log_err(0, NO_REGULAR_FILE); - ret = -EINVAL; - goto fail_address_parse; + return -EINVAL; } /* Parse reference counter offset if specified. 
*/ @@ -611,16 +610,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (rctr) { rctr_end = strchr(rctr, ')'); if (!rctr_end) { - ret = -EINVAL; rctr_end = rctr + strlen(rctr); trace_probe_log_err(rctr_end - filename, REFCNT_OPEN_BRACE); - goto fail_address_parse; + return -EINVAL; } else if (rctr_end[1] != '\0') { - ret = -EINVAL; trace_probe_log_err(rctr_end + 1 - filename, BAD_REFCNT_SUFFIX); - goto fail_address_parse; + return -EINVAL; } *rctr++ = '\0'; @@ -628,7 +625,7 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { trace_probe_log_err(rctr - filename, BAD_REFCNT); - goto fail_address_parse; + return ret; } } @@ -640,8 +637,7 @@ static int __trace_uprobe_create(int argc, const char **argv) is_return = true; } else { trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); - ret = -EINVAL; - goto fail_address_parse; + return -EINVAL; } } @@ -649,7 +645,7 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = kstrtoul(arg, 0, &offset); if (ret) { trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); - goto fail_address_parse; + return ret; } /* setup a probe */ @@ -657,12 +653,12 @@ static int __trace_uprobe_create(int argc, const char **argv) if (event) { gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!gbuf) - goto fail_mem; + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto fail_address_parse; + return ret; } if (!event) { @@ -671,7 +667,7 @@ static int __trace_uprobe_create(int argc, const char **argv) tail = kstrdup(kbasename(filename), GFP_KERNEL); if (!tail) - goto fail_mem; + return -ENOMEM; ptr = strpbrk(tail, ".-_"); if (ptr) @@ -679,7 +675,7 @@ static int __trace_uprobe_create(int argc, const char **argv) buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); if (!buf) - goto fail_mem; + return -ENOMEM; snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); event = buf; kfree(tail); @@ -693,49 +689,37 @@ static int __trace_uprobe_create(int argc, const char **argv) ret = PTR_ERR(tu); /* This must return -ENOMEM otherwise there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto fail_address_parse; + return ret; } tu->offset = offset; tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; - tu->filename = filename; + /* Clear @path so that it will not freed by path_put() */ + memset(&path, 0, sizeof(path)); + tu->filename = no_free_ptr(filename); /* parse arguments */ for (i = 0; i < argc; i++) { struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - ret = -ENOMEM; - goto error; - } + if (!ctx) + return -ENOMEM; ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) - goto error; + return ret; } ptype = is_ret_probe(tu) ? 
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) - goto error; + return ret; ret = register_trace_uprobe(tu); if (!ret) - goto out; - -error: - free_trace_uprobe(tu); -out: - return ret; - -fail_mem: - ret = -ENOMEM; - -fail_address_parse: - path_put(&path); - kfree(filename); + tu = NULL; return ret; } From 84404ce71a4b56ab979eb739dd94cf66b049b112 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 25 Sep 2025 09:56:48 +0900 Subject: [PATCH 05/12] tracing: uprobe: eprobes: Allocate traceprobe_parse_context per probe Since traceprobe_parse_context is reusable among a probe arguments, it is more efficient to allocate it outside of the loop for parsing probe argument as kprobe and fprobe events do. Link: https://lore.kernel.org/all/175509541393.193596.16330324746701582114.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 32 ++++++++++++-------------------- kernel/trace/trace_uprobe.c | 12 ++++++------ 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index f7a1ff509d7e..d58d8702a327 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -801,25 +801,6 @@ find_and_get_event(const char *system, const char *event_name) return NULL; } -static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) -{ - struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; - int ret; - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - ctx->event = ep->event; - ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; - - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); - /* Handle symbols "@" */ - if (!ret) - ret = traceprobe_update_arg(&ep->tp.args[i]); - - return ret; -} - static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) { struct event_filter *dummy = NULL; @@ -871,6 +852,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) * Fetch args (no space): * =$[:TYPE] */ + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; struct trace_eprobe *ep __free(trace_event_probe_cleanup) = NULL; const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; @@ -956,11 +938,21 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } else ep->filter_str = NULL; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->event = ep->event; + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ret = trace_eprobe_tp_update_arg(ep, argv, i); + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); if (ret) return ret; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d1d9abbe6248..1b4f32e2b9bd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -541,6 +541,7 @@ DEFINE_FREE(free_trace_uprobe, struct trace_uprobe *, if (_T) free_trace_uprobe( */ static int __trace_uprobe_create(int argc, const char **argv) { + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; struct trace_uprobe *tu __free(free_trace_uprobe) = NULL; const char *trlog __free(trace_probe_log_clear) = NULL; const char *event = NULL, *group = 
UPROBE_EVENT_SYSTEM; @@ -698,14 +699,13 @@ static int __trace_uprobe_create(int argc, const char **argv) memset(&path, 0, sizeof(path)); tu->filename = no_free_ptr(filename); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; + /* parse arguments */ for (i = 0; i < argc; i++) { - struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) - = kzalloc(sizeof(*ctx), GFP_KERNEL); - - if (!ctx) - return -ENOMEM; - ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) From ceb5d8d367d6d37a220023df443d8b67c2f4d1cd Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 25 Sep 2025 09:56:48 +0900 Subject: [PATCH 06/12] tracing: fprobe: fix suspicious rcu usage in fprobe_entry rcu_read_lock() is not needed in fprobe_entry, but rcu_dereference_check() is used in rhltable_lookup(), which causes suspicious RCU usage warning: WARNING: suspicious RCU usage 6.17.0-rc1-00001-gdfe0d675df82 #1 Tainted: G S ----------------------------- include/linux/rhashtable.h:602 suspicious rcu_dereference_check() usage! ...... stack backtrace: CPU: 1 UID: 0 PID: 4652 Comm: ftracetest Tainted: G S Tainted: [S]=CPU_OUT_OF_SPEC, [I]=FIRMWARE_WORKAROUND Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.1.1 10/07/2015 Call Trace: dump_stack_lvl+0x7c/0x90 lockdep_rcu_suspicious+0x14f/0x1c0 __rhashtable_lookup+0x1e0/0x260 ? __pfx_kernel_clone+0x10/0x10 fprobe_entry+0x9a/0x450 ? __lock_acquire+0x6b0/0xca0 ? find_held_lock+0x2b/0x80 ? __pfx_fprobe_entry+0x10/0x10 ? __pfx_kernel_clone+0x10/0x10 ? lock_acquire+0x14c/0x2d0 ? __might_fault+0x74/0xc0 function_graph_enter_regs+0x2a0/0x550 ? __do_sys_clone+0xb5/0x100 ? __pfx_function_graph_enter_regs+0x10/0x10 ? _copy_to_user+0x58/0x70 ? __pfx_kernel_clone+0x10/0x10 ? __x64_sys_rt_sigprocmask+0x114/0x180 ? __pfx___x64_sys_rt_sigprocmask+0x10/0x10 ? __pfx_kernel_clone+0x10/0x10 ftrace_graph_func+0x87/0xb0 As we discussed in [1], fix this by using guard(rcu)() in fprobe_entry() to protect the rhltable_lookup() and rhl_for_each_entry_rcu() with rcu_read_lock and suppress this warning. Link: https://lore.kernel.org/all/20250904062729.151931-1-dongml2@chinatelecom.cn/ Link: https://lore.kernel.org/all/20250829021436.19982-1-dongml2@chinatelecom.cn/ [1] Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202508281655.54c87330-lkp@intel.com Fixes: dfe0d675df82 ("tracing: fprobe: use rhltable for fprobe_ip_table") Signed-off-by: Menglong Dong Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index e063e22e1134..5742ace9b4b5 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -269,6 +269,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, if (WARN_ON_ONCE(!fregs)) return 0; + guard(rcu)(); head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params); reserved_words = 0; rhl_for_each_entry_rcu(node, pos, head, hlist) { From cbe1e1241a4d5d7cfc8dbef0fc47ca2d21c28cb2 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 25 Sep 2025 09:56:49 +0900 Subject: [PATCH 07/12] tracing: probes: Replace strcpy() with memcpy() in __trace_probe_log_err() strcpy() is deprecated; use memcpy() instead. 
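The two are equivalent here because strlen() has already been computed for each argument and the byte where strcpy() would have stored the NUL terminator is immediately overwritten by the ' ' separator. A condensed, self-contained sketch of the copy loop (join_args() is an illustrative name, not the kernel function; command must be sized for all arguments plus separators):

#include <linux/string.h>

static void join_args(char *command, int argc, const char **argv)
{
	char *p = command;
	int i;

	for (i = 0; i < argc; i++) {
		size_t len = strlen(argv[i]);

		memcpy(p, argv[i], len);	/* copy the bytes, not the NUL */
		p[len] = ' ';			/* the separator lands where the NUL went */
		p += len + 1;
	}
	if (p != command)
		p--;				/* back up over the trailing space */
	*p = '\0';
}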
Link: https://lore.kernel.org/all/20250820214717.778243-3-thorsten.blum@linux.dev/ Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 5b92376a58fc..bb67f6a2136c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -215,7 +215,7 @@ void __trace_probe_log_err(int offset, int err_type) p = command; for (i = 0; i < trace_probe_log.argc; i++) { len = strlen(trace_probe_log.argv[i]); - strcpy(p, trace_probe_log.argv[i]); + memcpy(p, trace_probe_log.argv[i], len); p[len] = ' '; p += len + 1; } From 90e69d291d195d35215b578d210fd3ce0e5a3f42 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 25 Sep 2025 09:56:49 +0900 Subject: [PATCH 08/12] tracing: fprobe: Remove unused local variable The 'ret' local variable in fprobe_remove_node_in_module() was used for checking the error state in the loop, but commit dfe0d675df82 ("tracing: fprobe: use rhltable for fprobe_ip_table") removed the loop. So we don't need it anymore. Link: https://lore.kernel.org/all/175867358989.600222.6175459620045800878.stgit@devnote2/ Fixes: e5a4cc28a052 ("tracing: fprobe: use rhltable for fprobe_ip_table") Signed-off-by: Masami Hiramatsu (Google) Acked-by: Menglong Dong --- kernel/trace/fprobe.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 5742ace9b4b5..95e43814b85b 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -458,8 +458,6 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, struct fprobe_addr_list *alist) { - int ret = 0; - if (!within_module(node->addr, mod)) return; if (delete_fprobe_node(node)) @@ -468,8 +466,7 @@ static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist * If failed to update alist, just continue to update hlist. * Therefore, at list user handler will not hit anymore. */ - if (!ret) - ret = fprobe_addr_list_add(alist, node->addr); + fprobe_addr_list_add(alist, node->addr); } /* Handle module unloading to manage fprobe_ip_table. */ From e667152e0064acf1a308a1816719008e29bec76f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 2 Oct 2025 17:39:04 +0900 Subject: [PATCH 09/12] tracing: fprobe: Fix to init fprobe_ip_table earlier Since the fprobe_ip_table is used from module unloading in the failure path of load_module(), it must be initialized in the earlier timing than late_initcall(). Unless that, the fprobe_module_callback() will use an uninitialized spinlock of fprobe_ip_table. Initialize fprobe_ip_table in core_initcall which is the same timing as ftrace. 
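For reference, the built-in initcall levels run in the boot order sketched below (condensed from include/linux/init.h). core_initcall() is the level ftrace itself uses and it completes well before any module image can be loaded, so the rhltable and its internal lock are set up before fprobe_module_callback() can ever run:

core_initcall(fn);	/* now registers fprobe_initcall()               */
postcore_initcall(fn);
arch_initcall(fn);
subsys_initcall(fn);
fs_initcall(fn);
device_initcall(fn);	/* module_init() for built-in code               */
late_initcall(fn);	/* where fprobe_initcall() used to be registered */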
Link: https://lore.kernel.org/all/175939434403.3665022.13030530757238556332.stgit@mhiramat.tok.corp.google.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202509301440.be4b3631-lkp@intel.com Fixes: e5a4cc28a052 ("tracing: fprobe: use rhltable for fprobe_ip_table") Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Menglong Dong --- kernel/trace/fprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 95e43814b85b..99d83c08b9e2 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -847,4 +847,4 @@ static int __init fprobe_initcall(void) rhltable_init(&fprobe_ip_table, &fprobe_rht_params); return 0; } -late_initcall(fprobe_initcall); +core_initcall(fprobe_initcall); From 2c67dc457bc67367dc8fcd8f471ce2d5bb5f7b2b Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 15 Oct 2025 16:32:37 +0800 Subject: [PATCH 10/12] tracing: fprobe: optimization for entry only case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For now, fgraph is used for the fprobe, even if we need trace the entry only. However, the performance of ftrace is better than fgraph, and we can use ftrace_ops for this case. Then performance of kprobe-multi increases from 54M to 69M. Before this commit: $ ./benchs/run_bench_trigger.sh kprobe-multi kprobe-multi : 54.663 ± 0.493M/s After this commit: $ ./benchs/run_bench_trigger.sh kprobe-multi kprobe-multi : 69.447 ± 0.143M/s Mitigation is disable during the bench testing above. Link: https://lore.kernel.org/all/20251015083238.2374294-2-dongml2@chinatelecom.cn/ Signed-off-by: Menglong Dong Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 128 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 119 insertions(+), 9 deletions(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 99d83c08b9e2..ecd623eef68b 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -254,8 +254,106 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent return ret; } -static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, - struct ftrace_regs *fregs) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +/* ftrace_ops callback, this processes fprobes which have only entry_handler. */ +static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; + struct fprobe *fp; + int bit; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + /* + * ftrace_test_recursion_trylock() disables preemption, but + * rhltable_lookup() checks whether rcu_read_lcok is held. + * So we take rcu_read_lock() here. 
+ */ + rcu_read_lock(); + head = rhltable_lookup(&fprobe_ip_table, &ip, fprobe_rht_params); + + rhl_for_each_entry_rcu(node, pos, head, hlist) { + if (node->addr != ip) + break; + fp = READ_ONCE(node->fp); + if (unlikely(!fp || fprobe_disabled(fp) || fp->exit_handler)) + continue; + + if (fprobe_shared_with_kprobes(fp)) + __fprobe_kprobe_handler(ip, parent_ip, fp, fregs, NULL); + else + __fprobe_handler(ip, parent_ip, fp, fregs, NULL); + } + rcu_read_unlock(); + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_ftrace_entry); + +static struct ftrace_ops fprobe_ftrace_ops = { + .func = fprobe_ftrace_entry, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; +static int fprobe_ftrace_active; + +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_ftrace_active) { + ret = register_ftrace_function(&fprobe_ftrace_ops); + if (ret) { + ftrace_free_filter(&fprobe_ftrace_ops); + return ret; + } + } + fprobe_ftrace_active++; + return 0; +} + +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + fprobe_ftrace_active--; + if (!fprobe_ftrace_active) + unregister_ftrace_function(&fprobe_ftrace_ops); + if (num) + ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0); +} + +static bool fprobe_is_ftrace(struct fprobe *fp) +{ + return !fp->exit_handler; +} +#else +static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) +{ + return -ENOENT; +} + +static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num) +{ +} + +static bool fprobe_is_ftrace(struct fprobe *fp) +{ + return false; +} +#endif + +/* fgraph_ops callback, this processes fprobes which have exit_handler. */ +static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs) { unsigned long *fgraph_data = NULL; unsigned long func = trace->func; @@ -292,7 +390,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, if (node->addr != func) continue; fp = READ_ONCE(node->fp); - if (fp && !fprobe_disabled(fp)) + if (fp && !fprobe_disabled(fp) && !fprobe_is_ftrace(fp)) fp->nmissed++; } return 0; @@ -312,7 +410,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, if (node->addr != func) continue; fp = READ_ONCE(node->fp); - if (!fp || fprobe_disabled(fp)) + if (unlikely(!fp || fprobe_disabled(fp) || fprobe_is_ftrace(fp))) continue; data_size = fp->entry_data_size; @@ -340,7 +438,7 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, /* If any exit_handler is set, data must be used. 
*/ return used != 0; } -NOKPROBE_SYMBOL(fprobe_entry); +NOKPROBE_SYMBOL(fprobe_fgraph_entry); static void fprobe_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, @@ -379,7 +477,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, NOKPROBE_SYMBOL(fprobe_return); static struct fgraph_ops fprobe_graph_ops = { - .entryfunc = fprobe_entry, + .entryfunc = fprobe_fgraph_entry, .retfunc = fprobe_return, }; static int fprobe_graph_active; @@ -498,9 +596,14 @@ static int fprobe_module_callback(struct notifier_block *nb, } while (node == ERR_PTR(-EAGAIN)); rhashtable_walk_exit(&iter); - if (alist.index > 0) + if (alist.index > 0) { ftrace_set_filter_ips(&fprobe_graph_ops.ops, alist.addrs, alist.index, 1, 0); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + ftrace_set_filter_ips(&fprobe_ftrace_ops, + alist.addrs, alist.index, 1, 0); +#endif + } mutex_unlock(&fprobe_mutex); kfree(alist.addrs); @@ -733,7 +836,11 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) mutex_lock(&fprobe_mutex); hlist_array = fp->hlist_array; - ret = fprobe_graph_add_ips(addrs, num); + if (fprobe_is_ftrace(fp)) + ret = fprobe_ftrace_add_ips(addrs, num); + else + ret = fprobe_graph_add_ips(addrs, num); + if (!ret) { add_fprobe_hash(fp); for (i = 0; i < hlist_array->size; i++) { @@ -829,7 +936,10 @@ int unregister_fprobe(struct fprobe *fp) } del_fprobe_hash(fp); - fprobe_graph_remove_ips(addrs, count); + if (fprobe_is_ftrace(fp)) + fprobe_ftrace_remove_ips(addrs, count); + else + fprobe_graph_remove_ips(addrs, count); kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; From 08ed5c81f6058bd7d6cd28d1750667ad3ceee3d1 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 15 Oct 2025 16:32:38 +0800 Subject: [PATCH 11/12] lib/test_fprobe: add testcase for mixed fprobe Add the testcase for the fprobe, which will hook the same target with two fprobe: entry, entry+exit. And the two fprobes will be registered with different order. fgraph and ftrace are both used for the fprobe, and this testcase is for the mixed situation. 
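Condensed, the situation the test exercises looks like the sketch below (my_entry/my_exit and the init wrapper are illustrative, not part of the selftest). With the previous patch, an entry-only fprobe is dispatched through the ftrace_ops fast path while an entry+exit fprobe still goes through fgraph, and both may be attached to the same function:

#include <linux/fprobe.h>
#include <linux/init.h>

static int my_entry(struct fprobe *fp, unsigned long ip, unsigned long ret_ip,
		    struct ftrace_regs *fregs, void *data)
{
	return 0;
}

static void my_exit(struct fprobe *fp, unsigned long ip, unsigned long ret_ip,
		    struct ftrace_regs *fregs, void *data)
{
}

/* Entry-only: handled by the ftrace_ops path added in the previous patch. */
static struct fprobe fp_entry_only = {
	.entry_handler = my_entry,
};

/* Entry + exit: still handled by fgraph so the return can be hooked. */
static struct fprobe fp_entry_exit = {
	.entry_handler = my_entry,
	.exit_handler = my_exit,
};

static int __init mixed_fprobe_demo_init(void)
{
	int ret;

	/* Both hook the same symbol; the selftest also swaps this order. */
	ret = register_fprobe(&fp_entry_only, "fprobe_selftest_target", NULL);
	if (ret)
		return ret;

	ret = register_fprobe(&fp_entry_exit, "fprobe_selftest_target", NULL);
	if (ret)
		unregister_fprobe(&fp_entry_only);

	return ret;
}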
Link: https://lore.kernel.org/all/20251015083238.2374294-3-dongml2@chinatelecom.cn/ Signed-off-by: Menglong Dong Signed-off-by: Masami Hiramatsu (Google) --- lib/tests/test_fprobe.c | 99 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/lib/tests/test_fprobe.c b/lib/tests/test_fprobe.c index cf92111b5c79..108c7aa33cb4 100644 --- a/lib/tests/test_fprobe.c +++ b/lib/tests/test_fprobe.c @@ -12,7 +12,8 @@ static struct kunit *current_test; -static u32 rand1, entry_val, exit_val; +static u32 rand1, entry_only_val, entry_val, exit_val; +static u32 entry_only_count, entry_count, exit_count; /* Use indirect calls to avoid inlining the target functions */ static u32 (*target)(u32 value); @@ -190,6 +191,101 @@ static void test_fprobe_skip(struct kunit *test) KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp)); } +/* Handler for fprobe entry only case */ +static notrace int entry_only_handler(struct fprobe *fp, unsigned long ip, + unsigned long ret_ip, + struct ftrace_regs *fregs, void *data) +{ + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, ip, target_ip); + + entry_only_count++; + entry_only_val = (rand1 / div_factor); + + return 0; +} + +static notrace int fprobe_entry_multi_handler(struct fprobe *fp, unsigned long ip, + unsigned long ret_ip, + struct ftrace_regs *fregs, + void *data) +{ + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, ip, target_ip); + + entry_count++; + entry_val = (rand1 / div_factor); + + return 0; +} + +static notrace void fprobe_exit_multi_handler(struct fprobe *fp, unsigned long ip, + unsigned long ret_ip, + struct ftrace_regs *fregs, + void *data) +{ + unsigned long ret = ftrace_regs_get_return_value(fregs); + + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, ip, target_ip); + KUNIT_EXPECT_EQ(current_test, ret, (rand1 / div_factor)); + + exit_count++; + exit_val = ret; +} + +static void check_fprobe_multi(struct kunit *test) +{ + entry_only_count = entry_count = exit_count = 0; + entry_only_val = entry_val = exit_val = 0; + + target(rand1); + + /* Verify all handlers were called */ + KUNIT_EXPECT_EQ(test, 1, entry_only_count); + KUNIT_EXPECT_EQ(test, 1, entry_count); + KUNIT_EXPECT_EQ(test, 1, exit_count); + + /* Verify values are correct */ + KUNIT_EXPECT_EQ(test, (rand1 / div_factor), entry_only_val); + KUNIT_EXPECT_EQ(test, (rand1 / div_factor), entry_val); + KUNIT_EXPECT_EQ(test, (rand1 / div_factor), exit_val); +} + +/* Test multiple fprobes hooking the same target function */ +static void test_fprobe_multi(struct kunit *test) +{ + struct fprobe fp1 = { + .entry_handler = fprobe_entry_multi_handler, + .exit_handler = fprobe_exit_multi_handler, + }; + struct fprobe fp2 = { + .entry_handler = entry_only_handler, + }; + + current_test = test; + + /* Test Case 1: Register in order 1 -> 2 */ + KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp1, "fprobe_selftest_target", NULL)); + KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp2, "fprobe_selftest_target", NULL)); + + check_fprobe_multi(test); + + /* Unregister all */ + KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp1)); + KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp2)); + + /* Test Case 2: Register in order 2 -> 1 */ + KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp2, "fprobe_selftest_target", NULL)); + KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp1, "fprobe_selftest_target", NULL)); + + check_fprobe_multi(test); + + /* Unregister all */ + KUNIT_EXPECT_EQ(test, 0, 
unregister_fprobe(&fp1)); + KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp2)); +} + static unsigned long get_ftrace_location(void *func) { unsigned long size, addr = (unsigned long)func; @@ -217,6 +313,7 @@ static struct kunit_case fprobe_testcases[] = { KUNIT_CASE(test_fprobe_syms), KUNIT_CASE(test_fprobe_data), KUNIT_CASE(test_fprobe_skip), + KUNIT_CASE(test_fprobe_multi), {} }; From cd06078a38aaedfebbf8fa0c009da0f99f4473fb Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 3 Nov 2025 14:34:34 +0800 Subject: [PATCH 12/12] tracing: fprobe: use ftrace if CONFIG_DYNAMIC_FTRACE_WITH_ARGS For now, we will use ftrace for the fprobe if fp->exit_handler not exists and CONFIG_DYNAMIC_FTRACE_WITH_REGS is enabled. However, CONFIG_DYNAMIC_FTRACE_WITH_REGS is not supported by some arch, such as arm. What we need in the fprobe is the function arguments, so we can use ftrace for fprobe if CONFIG_DYNAMIC_FTRACE_WITH_ARGS is enabled. Therefore, use ftrace if CONFIG_DYNAMIC_FTRACE_WITH_REGS or CONFIG_DYNAMIC_FTRACE_WITH_ARGS enabled. Link: https://lore.kernel.org/all/20251103063434.47388-1-dongml2@chinatelecom.cn/ Signed-off-by: Menglong Dong Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index ecd623eef68b..0b1ee8e585f2 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -44,6 +44,7 @@ static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; static struct rhltable fprobe_ip_table; static DEFINE_MUTEX(fprobe_mutex); +static struct fgraph_ops fprobe_graph_ops; static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed) { @@ -254,7 +255,7 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent return ret; } -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS) /* ftrace_ops callback, this processes fprobes which have only entry_handler. */ static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) @@ -295,7 +296,7 @@ NOKPROBE_SYMBOL(fprobe_ftrace_entry); static struct ftrace_ops fprobe_ftrace_ops = { .func = fprobe_ftrace_entry, - .flags = FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_ARGS, }; static int fprobe_ftrace_active; @@ -335,6 +336,15 @@ static bool fprobe_is_ftrace(struct fprobe *fp) { return !fp->exit_handler; } + +#ifdef CONFIG_MODULES +static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, + int reset) +{ + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); + ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset); +} +#endif #else static int fprobe_ftrace_add_ips(unsigned long *addrs, int num) { @@ -349,7 +359,15 @@ static bool fprobe_is_ftrace(struct fprobe *fp) { return false; } + +#ifdef CONFIG_MODULES +static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove, + int reset) +{ + ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset); +} #endif +#endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */ /* fgraph_ops callback, this processes fprobes which have exit_handler. 
*/ static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, @@ -596,14 +614,8 @@ static int fprobe_module_callback(struct notifier_block *nb, } while (node == ERR_PTR(-EAGAIN)); rhashtable_walk_exit(&iter); - if (alist.index > 0) { - ftrace_set_filter_ips(&fprobe_graph_ops.ops, - alist.addrs, alist.index, 1, 0); -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - ftrace_set_filter_ips(&fprobe_ftrace_ops, - alist.addrs, alist.index, 1, 0); -#endif - } + if (alist.index > 0) + fprobe_set_ips(alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); kfree(alist.addrs);