// SPDX-License-Identifier: GPL-2.0 #include #include #include #include #include #include #include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ #include #include #include #include #include "trace_output.h" #include "trace.h" static DEFINE_MUTEX(syscall_trace_lock); static int syscall_enter_register(struct trace_event_call *event, enum trace_reg type, void *data); static int syscall_exit_register(struct trace_event_call *event, enum trace_reg type, void *data); static struct list_head * syscall_get_enter_fields(struct trace_event_call *call) { struct syscall_metadata *entry = call->data; return &entry->enter_fields; } extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; static DEFINE_XARRAY(syscalls_metadata_sparse); static struct syscall_metadata **syscalls_metadata; #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) { /* * Only compare after the "sys" prefix. Archs that use * syscall wrappers may have syscalls symbols aliases prefixed * with ".SyS" or ".sys" instead of "sys", leading to an unwanted * mismatch. */ return !strcmp(sym + 3, name + 3); } #endif #ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS /* * Some architectures that allow for 32bit applications * to run on a 64bit kernel, do not map the syscalls for * the 32bit tasks the same as they do for 64bit tasks. * * *cough*x86*cough* * * In such a case, instead of reporting the wrong syscalls, * simply ignore them. * * For an arch to ignore the compat syscalls it needs to * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as * define the function arch_trace_is_compat_syscall() to let * the tracing system know that it should ignore it. 
*/ static int trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) { if (unlikely(arch_trace_is_compat_syscall(regs))) return -1; return syscall_get_nr(task, regs); } #else static inline int trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) { return syscall_get_nr(task, regs); } #endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { struct syscall_metadata **start; struct syscall_metadata **stop; char str[KSYM_SYMBOL_LEN]; start = __start_syscalls_metadata; stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) return NULL; for ( ; start < stop; start++) { if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; } return NULL; } static struct syscall_metadata *syscall_nr_to_meta(int nr) { if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) return xa_load(&syscalls_metadata_sparse, (unsigned long)nr); if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; return syscalls_metadata[nr]; } const char *get_syscall_name(int syscall) { struct syscall_metadata *entry; entry = syscall_nr_to_meta(syscall); if (!entry) return NULL; return entry->name; } /* Added to user strings or arrays when max limit is reached */ #define EXTRA "..." 
/*
 * Decode the next dynamic-data meta field of a syscall enter event.
 *
 * The meta fields are 4 byte values stored right after the nb_args
 * static arguments. @offset_p is the byte offset of the field to read
 * (relative to the first meta field) and is advanced by 4 on return.
 * @len_p receives the upper bits of the field (the data length) and
 * @ptr_p a pointer to the data, located at the offset held in the low
 * 16 bits, counted from the start of the event.
 */
static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
				struct syscall_metadata *entry,
				int *offset_p, int *len_p, unsigned char **ptr_p)
{
	unsigned char *ptr;
	int offset = *offset_p;
	int val;

	/* This arg points to a user space string */
	ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
	val = *(int *)ptr;

	/* The value is a dynamic string (len << 16 | offset) */
	ptr = (void *)trace + (val & 0xffff);
	*len_p = val >> 16;
	offset += 4;

	*ptr_p = ptr;
	*offset_p = offset;
}

/*
 * Custom output for the openat enter event.
 *
 * Arguments are printed as "name: value". Argument 2 (flags) is printed
 * symbolically and argument 3 (mode) in octal. When neither O_CREAT nor
 * O_TMPFILE is set, printing stops after the flags. Arguments flagged in
 * user_mask also get the recorded user space string appended in quotes.
 */
static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
		       struct trace_seq *s, struct trace_event *event)
{
	unsigned char *ptr;
	int offset = 0;
	int bits, len;
	bool done = false;
	static const struct trace_print_flags __flags[] =
		{
			{ O_TMPFILE, "O_TMPFILE" },
			{ O_WRONLY, "O_WRONLY" },
			{ O_RDWR, "O_RDWR" },
			{ O_CREAT, "O_CREAT" },
			{ O_EXCL, "O_EXCL" },
			{ O_NOCTTY, "O_NOCTTY" },
			{ O_TRUNC, "O_TRUNC" },
			{ O_APPEND, "O_APPEND" },
			{ O_NONBLOCK, "O_NONBLOCK" },
			{ O_DSYNC, "O_DSYNC" },
			{ O_DIRECT, "O_DIRECT" },
			{ O_LARGEFILE, "O_LARGEFILE" },
			{ O_DIRECTORY, "O_DIRECTORY" },
			{ O_NOFOLLOW, "O_NOFOLLOW" },
			{ O_NOATIME, "O_NOATIME" },
			{ O_CLOEXEC, "O_CLOEXEC" },
			{ -1, NULL }
		};

	trace_seq_printf(s, "%s(", entry->name);

	for (int i = 0; !done && i < entry->nb_args; i++) {

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		switch (i) {
		case 2:
			bits = trace->args[2];

			trace_seq_puts(s, "flags: ");

			/* No need to show mode when not creating the file */
			if (!(bits & (O_CREAT|O_TMPFILE)))
				done = true;

			/* No access mode bits set means read only */
			if (!(bits & O_ACCMODE)) {
				if (!bits) {
					trace_seq_puts(s, "O_RDONLY");
					continue;
				}
				trace_seq_puts(s, "O_RDONLY|");
			}

			trace_print_flags_seq(s, "|", bits, __flags);
			/*
			 * trace_print_flags_seq() adds a '\0' to the
			 * buffer, but this needs to append more to the seq.
			 */
			if (!trace_seq_has_overflowed(s))
				trace_seq_pop(s);

			continue;
		case 3:
			trace_seq_printf(s, "%s: 0%03o", entry->args[i],
					 (unsigned int)trace->args[i]);
			continue;
		}

		trace_seq_printf(s, "%s: %lu", entry->args[i],
				 trace->args[i]);

		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
		trace_seq_printf(s, " \"%.*s\"", len, ptr);
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}

/*
 * Output callback for syscall enter events.
 *
 * Prints "name(arg: value, ...)". Values below 10 are printed in decimal,
 * all others in hex. Argument types are added when the VERBOSE trace flag
 * is set. For arguments flagged in user_mask the recorded user space data
 * follows the value: as a quoted string, or (when the syscall has a size
 * argument and is not marked as a string) as a hex dump followed by a
 * printable rendering.
 *
 * openat is handed to sys_enter_openat_print() unless VERBOSE is set.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_array *tr = iter->tr;
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, syscall, val, len;
	unsigned char *ptr;
	int offset = 0;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	switch (entry->syscall_nr) {
	case __NR_openat:
		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
			return sys_enter_openat_print(trace, entry, s, event);
		break;
	default:
		break;
	}

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		bool printable = false;
		char *str;

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		/* parameter types */
		if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
			trace_seq_printf(s, "%s ", entry->types[i]);

		/* parameter values */
		if (trace->args[i] < 10)
			trace_seq_printf(s, "%s: %lu", entry->args[i],
					 trace->args[i]);
		else
			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
					 trace->args[i]);

		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);

		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
			trace_seq_printf(s, " \"%.*s\"", len, ptr);
			continue;
		}

		/* The size the caller passed, to detect a truncated copy */
		val = trace->args[entry->user_arg_size];

		str = ptr;
		trace_seq_puts(s, " (");
		for (int x = 0; x < len; x++, ptr++) {
			if (isascii(*ptr) && isprint(*ptr))
				printable = true;
			if (x)
				trace_seq_putc(s, ':');
			trace_seq_printf(s, "%02x", *ptr);
		}
		if (len < val)
			trace_seq_printf(s, ", %s", EXTRA);

		trace_seq_putc(s, ')');

		/* If nothing is printable, don't bother printing anything */
		if (!printable)
			continue;

		trace_seq_puts(s, " \"");
		for (int x = 0; x < len; x++) {
			if (isascii(str[x]) && isprint(str[x]))
				trace_seq_putc(s, str[x]);
			else
				trace_seq_putc(s, '.');
		}
		if (len < val)
			trace_seq_printf(s, "\"%s", EXTRA);
		else
			trace_seq_putc(s, '"');
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}

/*
 * Output callback for syscall exit events: prints "name -> 0x<ret>".
 *
 * An unknown syscall number only emits a newline. An event type that
 * does not match the metadata's exit event warns once and returns
 * TRACE_TYPE_UNHANDLED.
 */
static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_putc(s, '\n');
		goto out;
	}

	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);

 out:
	return trace_handle_return(s);
}

/* Build a struct trace_event_fields initializer from a C type and name */
#define SYSCALL_FIELD(_type, _name) {					\
	.type = #_type, .name = #_name,					\
	.size = sizeof(_type), .align = __alignof__(_type),		\
	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }

/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

/*
 * Write the print_fmt of the openat enter event into @buf.
 *
 * With @len == 0 nothing is written and only the required length is
 * computed (see LEN_OR_ZERO). Returns the length of the format, not
 * counting the terminating nul.
 */
static int __init
sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int pos = 0;

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->dfd)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->filename)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" __get_str(__filename_val),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" REC->flags ? __print_flags(REC->flags, \"|\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->mode))");
	return pos;
}

/*
 * Write the print_fmt of a syscall enter event into @buf.
 *
 * The format string comes first ("arg: 0x%0Nlx" per argument, plus a
 * string or array conversion for arguments flagged in user_mask),
 * followed by the matching REC-> expressions. openat uses its own
 * format. With @len == 0 only the required length is computed.
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	bool is_string = entry->user_arg_is_str;
	int i;
	int pos = 0;

	switch (entry->syscall_nr) {
	case __NR_openat:
		return sys_enter_openat_print_fmt(entry, buf, len);
	default:
		break;
	}

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		if (i)
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
				entry->args[i], sizeof(unsigned long));

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* Add the format for the user space string or array */
		if (entry->user_arg_size < 0 || is_string)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
		if (!(BIT(i) & entry->user_mask))
			continue;
		/* The user space data for arg has name __<arg>_val */
		if (entry->user_arg_size < 0 || is_string) {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
					entry->args[i]);
		} else {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
					entry->args[i]);
		}
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

/*
 * Set call->print_fmt. Exit events share a constant format; enter
 * events get an allocated one, sized by a first dry-run pass.
 * Returns 0 on success or -ENOMEM.
 */
static int __init set_syscall_print_fmt(struct trace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

/* Free the print_fmt of an enter event; the exit format is a constant */
static void __init free_syscall_print_fmt(struct trace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

/*
 * Define the fields of a syscall enter event: one unsigned long sized
 * field per argument, followed by one 4 byte "__data_loc char[]" field
 * per argument flagged in user_mask.
 *
 * Returns 0 on success or a negative error code.
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	unsigned long mask;
	char *arg;
	int offset = offsetof(typeof(trace), args);
	int ret = 0;
	int len;
	int i;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		if (ret)
			break;
		offset += sizeof(unsigned long);
	}

	if (ret || !meta->user_mask)
		return ret;

	mask = meta->user_mask;

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		/*
		 * User space data is faulted into a temporary buffer and then
		 * added as a dynamic string or array to the end of the event.
		 * The user space data name for the arg pointer is
		 * "__<arg>_val".
		 */
		/* sizeof("___val") covers "__", "_val" and the nul byte */
		len = strlen(meta->args[idx]) + sizeof("___val");
		arg = kmalloc(len, GFP_KERNEL);
		if (WARN_ON_ONCE(!arg)) {
			meta->user_mask = 0;
			return -ENOMEM;
		}

		snprintf(arg, len, "__%s_val", meta->args[idx]);

		/* arg is only freed here on failure */
		ret = trace_define_field(call, "__data_loc char[]",
					 arg, offset, sizeof(int), 0,
					 FILTER_OTHER);
		if (ret) {
			kfree(arg);
			break;
		}
		offset += 4;
	}
	return ret;
}

/*
 * Create a per CPU temporary buffer to copy user space pointers into.
 *
 * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
 *  (defined in kernel/trace/trace.h)
 * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
 *  nul terminating byte and possibly appended EXTRA (4 bytes).
 *
 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
 * to copy memory from user space addresses into that will hold
 * 3 args as only 3 args are allowed to be copied from system calls.
*/
#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
#define SYSCALL_FAULT_MAX_CNT 3
#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)

/* Use the tracing per CPU buffer infrastructure to copy from user space */
struct syscall_user_buffer {
	struct trace_user_buf_info	buf;
	struct rcu_head			rcu;
};

/* Written under syscall_trace_lock, read locklessly with READ_ONCE() */
static struct syscall_user_buffer *syscall_buffer;

/*
 * Make the user fault buffer available. If it already exists, only
 * trace_user_fault_get() is called on it; otherwise it is allocated,
 * initialized and published. Caller must hold syscall_trace_lock.
 *
 * Returns 0 on success or a negative error code.
 */
static int syscall_fault_buffer_enable(void)
{
	struct syscall_user_buffer *sbuf;
	int ret;

	lockdep_assert_held(&syscall_trace_lock);

	if (syscall_buffer) {
		trace_user_fault_get(&syscall_buffer->buf);
		return 0;
	}

	sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL);
	if (!sbuf)
		return -ENOMEM;

	ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
	if (ret < 0) {
		kfree(sbuf);
		return ret;
	}

	WRITE_ONCE(syscall_buffer, sbuf);

	return 0;
}

/* RCU callback: tear down and free a retired syscall_user_buffer */
static void rcu_free_syscall_buffer(struct rcu_head *rcu)
{
	struct syscall_user_buffer *sbuf =
		container_of(rcu, struct syscall_user_buffer, rcu);

	trace_user_fault_destroy(&sbuf->buf);
	kfree(sbuf);
}

/*
 * Drop one user of the fault buffer. When trace_user_fault_put()
 * returns zero, the global pointer is cleared and the buffer is freed
 * from a tasks-trace RCU callback. Caller must hold syscall_trace_lock.
 */
static void syscall_fault_buffer_disable(void)
{
	struct syscall_user_buffer *sbuf = syscall_buffer;

	lockdep_assert_held(&syscall_trace_lock);

	if (trace_user_fault_put(&sbuf->buf))
		return;

	WRITE_ONCE(syscall_buffer, NULL);
	call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
}

/* Passed to the copy callbacks: what to read and the per-arg result */
struct syscall_args {
	char		*ptr_array[SYSCALL_FAULT_MAX_CNT];	/* user addresses */
	int		read[SYSCALL_FAULT_MAX_CNT];		/* copy result per arg */
	int		uargs;					/* entries in use */
};

/*
 * Copy callback for strings: reads each user pointer in @data with
 * strncpy_from_user() into consecutive SYSCALL_FAULT_ARG_SZ slots of
 * @buf and records each return value. The incoming @ptr is ignored.
 */
static int syscall_copy_user(char *buf, const char __user *ptr,
			     size_t size, void *data)
{
	struct syscall_args *args = data;
	int ret;

	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		ptr = (char __user *)args->ptr_array[i];
		ret = strncpy_from_user(buf, ptr, size);
		args->read[i] = ret;
	}
	return 0;
}

/*
 * Copy callback for arrays: like syscall_copy_user() but copies exactly
 * @size bytes; a failed copy is recorded as -1, success as @size.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
				   size_t size, void *data)
{
	struct syscall_args *args = data;
	int ret;

	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		ptr = (char __user *)args->ptr_array[i];
		ret = __copy_from_user(buf, ptr, size);
		args->read[i] = ret ? -1 : size;
	}
	return 0;
}

/*
 * Read the user space memory referenced by the arguments flagged in
 * sys_data->user_mask.
 *
 * @buf_size:  limit on the amount of data to record per argument
 * @sys_data:  metadata of the syscall
 * @sbuf:      the fault buffer to read into
 * @args:      the syscall arguments
 * @data_size: set to the number of bytes to record for each argument
 *             that was read; unused slots are set to -1
 *
 * Returns the buffer holding the data, one SYSCALL_FAULT_ARG_SZ slot per
 * argument, or NULL if @buf_size is zero or the read failed.
 */
static char *sys_fault_user(unsigned int buf_size,
			    struct syscall_metadata *sys_data,
			    struct syscall_user_buffer *sbuf,
			    unsigned long *args,
			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
	trace_user_buf_copy syscall_copy = syscall_copy_user;
	unsigned long mask = sys_data->user_mask;
	unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
	struct syscall_args sargs;
	bool array = false;
	char *buffer;
	char *buf;
	int ret;
	int i = 0;

	/* The extra is appended to the user data in the buffer */
	BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
		     SYSCALL_FAULT_ARG_SZ);

	/*
	 * If this system call event has a size argument, use
	 * it to define how much of user space memory to read,
	 * and read it as an array and not a string.
	 */
	if (sys_data->user_arg_size >= 0) {
		array = true;
		size = args[sys_data->user_arg_size];
		if (size > SYSCALL_FAULT_ARG_SZ - 1)
			size = SYSCALL_FAULT_ARG_SZ - 1;
		syscall_copy = syscall_copy_user_array;
	}

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
			break;

		/* Get the pointer to user space memory to read */
		sargs.ptr_array[i++] = (char *)args[idx];
	}

	sargs.uargs = i;

	/* Clear the values that are not used */
	for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
		data_size[i] = -1; /* Denotes no pointer */
	}

	/* A zero size means do not even try */
	if (!buf_size)
		return NULL;

	buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
				       syscall_copy, &sargs);
	if (!buffer)
		return NULL;

	buf = buffer;
	for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {

		ret = sargs.read[i];
		/* A failed read leaves data_size[i] as the caller set it */
		if (ret < 0)
			continue;
		buf[ret] = '\0';

		/* For strings, replace any non-printable characters with '.' */
		if (!array) {
			for (int x = 0; x < ret; x++) {
				if (!isprint(buf[x]))
					buf[x] = '.';
			}

			size = min(buf_size, SYSCALL_FAULT_USER_MAX);

			/*
			 * If the text was truncated due to our max limit,
			 * add "..." to the string.
			 */
			if (ret > size) {
				strscpy(buf + size, EXTRA, sizeof(EXTRA));
				ret = size + sizeof(EXTRA);
			} else {
				/* Count the nul byte in the recorded size */
				buf[ret++] = '\0';
			}
		} else {
			ret = min((unsigned int)ret, buf_size);
		}
		data_size[i] = ret;
	}

	return buffer;
}

/*
 * Fetch the user space data of a syscall and account for it in *@size.
 *
 * Sets *@buffer to the result of sys_fault_user(), fills @user_sizes,
 * adds the data and meta field sizes to *@size and stores the number of
 * user arguments in *@uargs.
 *
 * Returns 0 on success, -1 if the fault buffer is gone.
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
		 char **buffer, int *size, int *user_sizes, int *uargs,
		 int buf_size)
{
	struct syscall_user_buffer *sbuf;
	int i;

	/* If the syscall_buffer is NULL, tracing is being shutdown */
	sbuf = READ_ONCE(syscall_buffer);
	if (!sbuf)
		return -1;

	*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
	/*
	 * user_sizes is the amount of data to append.
	 * Need to add 4 for the meta field that points to
	 * the user memory at the end of the event and also
	 * stores its size.
	 */
	for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
		if (user_sizes[i] < 0)
			break;
		*size += user_sizes[i] + 4;
	}
	/* Save the number of user read arguments of this syscall */
	*uargs = i;
	return 0;
}

/*
 * Append the user space data to a reserved event.
 *
 * Writes one 4 byte meta field (size << 16 | offset) per user argument
 * after the static arguments, then copies the data of each argument
 * from its SYSCALL_FAULT_ARG_SZ slot in @buffer behind the meta fields.
 * @size is not used here.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
			     struct syscall_trace_enter *entry,
			     char *buffer, int size, int *user_sizes, int uargs)
{
	char *buf = buffer;
	void *ptr;
	int val;

	/*
	 * Set the pointer to point to the meta data of the event
	 * that has information about the stored user space memory.
	 */
	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;

	/*
	 * The meta data will store the offset of the user data from
	 * the beginning of the event. That is after the static arguments
	 * and the meta data fields.
	 */
	val = (ptr - (void *)entry) + 4 * uargs;

	for (int i = 0; i < uargs; i++) {

		if (i)
			val += user_sizes[i - 1];

		/* Store the offset and the size into the meta data */
		*(int *)ptr = val | (user_sizes[i] << 16);

		/* Skip the meta data */
		ptr += 4;
	}

	for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		/* Nothing to do if the user space was empty or faulted */
		if (!user_sizes[i])
			continue;

		memcpy(ptr, buf, user_sizes[i]);
		ptr += user_sizes[i];
	}
}

/*
 * sys_enter tracepoint probe for the ftrace syscall enter events.
 * @data is the trace_array the probe was registered with.
 *
 * Records the syscall number, its arguments and, for syscalls with a
 * user_mask, the user space data they point to.
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	unsigned long args[6];
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int syscall_nr;
	int size = 0;
	int uargs = 0;
	bool mayfault;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	/* Preemption stays disabled from here until the function returns */
	guard(preempt_notrace)();

	syscall_get_arguments(current, regs, args);

	if (mayfault) {
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
			return;
	}

	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;

	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);

	if (mayfault)
		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);

	trace_event_buffer_commit(&fbuffer);
}

/*
 * sys_exit tracepoint probe for the ftrace syscall exit events.
 * Records the syscall number and its return value.
 */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	int syscall_nr;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	trace_event_buffer_commit(&fbuffer);
}

/*
 * Enable the enter event of one syscall for file->tr.
 *
 * Enables the fault buffer when the syscall has a user_mask, registers
 * the sys_enter probe for the first event of the trace_array and
 * publishes @file in enter_syscall_files[]. Returns 0 or a negative
 * error code.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
				   struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	guard(mutex)(&syscall_trace_lock);
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!tr->sys_refcount_enter) {
		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
		if (ret < 0) {
			/* Undo the buffer enable done above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	WRITE_ONCE(tr->enter_syscall_files[num], file);
	tr->sys_refcount_enter++;
	return 0;
}

/*
 * Disable the enter event of one syscall for file->tr; the reverse of
 * reg_event_syscall_enter().
 */
static void unreg_event_syscall_enter(struct trace_event_file *file,
				      struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	guard(mutex)(&syscall_trace_lock);
	tr->sys_refcount_enter--;
	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
	if (!tr->sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}

/*
 * Enable the exit event of one syscall for file->tr. The sys_exit probe
 * is registered for the first event of the trace_array. Returns 0 or the
 * error from the registration.
 */
static int reg_event_syscall_exit(struct trace_event_file *file,
				  struct trace_event_call *call)
{
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!tr->sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
	if (!ret) {
		WRITE_ONCE(tr->exit_syscall_files[num], file);
		tr->sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

/*
 * Disable the exit event of one syscall for file->tr; the probe is
 * unregistered when the last exit event of the trace_array goes away.
 */
static void unreg_event_syscall_exit(struct trace_event_file *file,
				     struct trace_event_call *call)
{
	struct trace_array *tr = file->tr;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	tr->sys_refcount_exit--;
	WRITE_ONCE(tr->exit_syscall_files[num], NULL);
	if (!tr->sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
	mutex_unlock(&syscall_trace_lock);
}

/*
 * For system calls that reference user space memory that can
 * be recorded into the event, set the system call meta data's user_mask
 * to the "args" index that points to the user space memory to retrieve.
*/ static void check_faultable_syscall(struct trace_event_call *call, int nr) { struct syscall_metadata *sys_data = call->data; unsigned long mask; /* Only work on entry */ if (sys_data->enter_event != call) return; sys_data->user_arg_size = -1; switch (nr) { /* user arg 1 with size arg at 2 */ case __NR_write: #ifdef __NR_mq_timedsend case __NR_mq_timedsend: #endif case __NR_pwrite64: sys_data->user_mask = BIT(1); sys_data->user_arg_size = 2; break; /* user arg 0 with size arg at 1 as string */ case __NR_setdomainname: case __NR_sethostname: sys_data->user_mask = BIT(0); sys_data->user_arg_size = 1; sys_data->user_arg_is_str = 1; break; #ifdef __NR_kexec_file_load /* user arg 4 with size arg at 3 as string */ case __NR_kexec_file_load: sys_data->user_mask = BIT(4); sys_data->user_arg_size = 3; sys_data->user_arg_is_str = 1; break; #endif /* user arg at position 0 */ #ifdef __NR_access case __NR_access: #endif case __NR_acct: case __NR_chdir: #ifdef __NR_chown case __NR_chown: #endif #ifdef __NR_chmod case __NR_chmod: #endif case __NR_chroot: #ifdef __NR_creat case __NR_creat: #endif case __NR_delete_module: case __NR_execve: case __NR_fsopen: #ifdef __NR_lchown case __NR_lchown: #endif #ifdef __NR_open case __NR_open: #endif case __NR_memfd_create: #ifdef __NR_mkdir case __NR_mkdir: #endif #ifdef __NR_mknod case __NR_mknod: #endif case __NR_mq_open: case __NR_mq_unlink: #ifdef __NR_readlink case __NR_readlink: #endif #ifdef __NR_rmdir case __NR_rmdir: #endif case __NR_shmdt: #ifdef __NR_statfs case __NR_statfs: #endif case __NR_swapon: case __NR_swapoff: #ifdef __NR_truncate case __NR_truncate: #endif #ifdef __NR_unlink case __NR_unlink: #endif case __NR_umount2: #ifdef __NR_utime case __NR_utime: #endif #ifdef __NR_utimes case __NR_utimes: #endif sys_data->user_mask = BIT(0); break; /* user arg at position 1 */ case __NR_execveat: case __NR_faccessat: case __NR_faccessat2: case __NR_finit_module: case __NR_fchmodat: case __NR_fchmodat2: case __NR_fchownat: case 
__NR_fgetxattr: case __NR_flistxattr: case __NR_fsetxattr: case __NR_fspick: case __NR_fremovexattr: #ifdef __NR_futimesat case __NR_futimesat: #endif case __NR_inotify_add_watch: case __NR_mkdirat: case __NR_mknodat: case __NR_mount_setattr: case __NR_name_to_handle_at: #ifdef __NR_newfstatat case __NR_newfstatat: #endif case __NR_openat: case __NR_openat2: case __NR_open_tree: case __NR_open_tree_attr: case __NR_readlinkat: case __NR_quotactl: case __NR_syslog: case __NR_statx: case __NR_unlinkat: #ifdef __NR_utimensat case __NR_utimensat: #endif sys_data->user_mask = BIT(1); break; /* user arg at position 2 */ case __NR_init_module: case __NR_fsconfig: sys_data->user_mask = BIT(2); break; /* user arg at position 4 */ case __NR_fanotify_mark: sys_data->user_mask = BIT(4); break; /* 2 user args, 0 and 1 */ case __NR_add_key: case __NR_getxattr: case __NR_lgetxattr: case __NR_lremovexattr: #ifdef __NR_link case __NR_link: #endif case __NR_listxattr: case __NR_llistxattr: case __NR_lsetxattr: case __NR_pivot_root: case __NR_removexattr: #ifdef __NR_rename case __NR_rename: #endif case __NR_request_key: case __NR_setxattr: #ifdef __NR_symlink case __NR_symlink: #endif sys_data->user_mask = BIT(0) | BIT(1); break; /* 2 user args, 0 and 2 */ case __NR_symlinkat: sys_data->user_mask = BIT(0) | BIT(2); break; /* 2 user args, 1 and 3 */ case __NR_getxattrat: case __NR_linkat: case __NR_listxattrat: case __NR_move_mount: #ifdef __NR_renameat case __NR_renameat: #endif case __NR_renameat2: case __NR_removexattrat: case __NR_setxattrat: sys_data->user_mask = BIT(1) | BIT(3); break; case __NR_mount: /* Just dev_name and dir_name, TODO add type */ sys_data->user_mask = BIT(0) | BIT(1) | BIT(2); break; default: sys_data->user_mask = 0; return; } if (sys_data->user_arg_size < 0) return; /* * The user_arg_size can only be used when the system call * is reading only a single address from user space. 
*/ mask = sys_data->user_mask; if (WARN_ON(mask & (mask - 1))) sys_data->user_arg_size = -1; } static int __init init_syscall_trace(struct trace_event_call *call) { int id; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) { pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", ((struct syscall_metadata *)call->data)->name); return -ENOSYS; } check_faultable_syscall(call, num); if (set_syscall_print_fmt(call) < 0) return -ENOMEM; id = trace_event_raw_init(call); if (id < 0) { free_syscall_print_fmt(call); return id; } return id; } static struct trace_event_fields __refdata syscall_enter_fields_array[] = { SYSCALL_FIELD(int, __syscall_nr), { .type = TRACE_FUNCTION_TYPE, .define_fields = syscall_enter_define_fields }, {} }; struct trace_event_functions enter_syscall_print_funcs = { .trace = print_syscall_enter, }; struct trace_event_functions exit_syscall_print_funcs = { .trace = print_syscall_exit, }; struct trace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, .fields_array = syscall_enter_fields_array, .get_fields = syscall_get_enter_fields, .raw_init = init_syscall_trace, }; struct trace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, .fields_array = (struct trace_event_fields[]){ SYSCALL_FIELD(int, __syscall_nr), SYSCALL_FIELD(long, ret), {} }, .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), .raw_init = init_syscall_trace, }; unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; } void __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; int i; void *ret; if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); return; } } for (i = 0; i < NR_syscalls; i++) { addr = arch_syscall_addr(i); 
meta = find_syscall_meta(addr); if (!meta) continue; meta->syscall_nr = i; if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { syscalls_metadata[i] = meta; } else { ret = xa_store(&syscalls_metadata_sparse, i, meta, GFP_KERNEL); WARN(xa_is_err(ret), "Syscall memory allocation failed\n"); } } } #ifdef CONFIG_PERF_EVENTS static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, struct syscall_metadata *sys_data, struct syscall_trace_enter *rec) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; } __aligned(8) param; int i; BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ perf_fetch_caller_regs(regs); *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) param.args[i] = rec->args[i]; return trace_call_bpf(call, ¶m); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; bool mayfault; char *user_ptr; int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; int syscall_nr; int rctx; int size = 0; int uargs = 0; /* * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. 
*/ might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; syscall_get_arguments(current, regs, args); /* Check if this syscall event faults in user space memory */ mayfault = sys_data->user_mask != 0; if (mayfault) { if (syscall_get_data(sys_data, args, &user_ptr, &size, user_sizes, &uargs, buf_size) < 0) return; } head = this_cpu_ptr(sys_data->enter_event->perf_events); valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); if (!rec) return; rec->nr = syscall_nr; memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); if (mayfault) syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) { struct syscall_metadata *sys_data = call->data; int num; int ret; num = sys_data->syscall_nr; guard(mutex)(&syscall_trace_lock); if (sys_data->user_mask) { ret = syscall_fault_buffer_enable(); if (ret < 0) return ret; } if (!sys_perf_refcount_enter) { ret = register_trace_sys_enter(perf_syscall_enter, NULL); if (ret) { pr_info("event trace: Could not activate syscall entry trace point"); if (sys_data->user_mask) syscall_fault_buffer_disable(); return ret; } } set_bit(num, enabled_perf_enter_syscalls); 
sys_perf_refcount_enter++; return 0; } static void perf_sysenter_disable(struct trace_event_call *call) { struct syscall_metadata *sys_data = call->data; int num; num = sys_data->syscall_nr; guard(mutex)(&syscall_trace_lock); sys_perf_refcount_enter--; clear_bit(num, enabled_perf_enter_syscalls); if (!sys_perf_refcount_enter) unregister_trace_sys_enter(perf_syscall_enter, NULL); if (sys_data->user_mask) syscall_fault_buffer_disable(); } static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, struct syscall_trace_exit *rec) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long ret; } __aligned(8) param; /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ perf_fetch_caller_regs(regs); *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; return trace_call_bpf(call, ¶m); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; int rctx; int size; /* * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. 
*/ might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; head = this_cpu_ptr(sys_data->exit_event->perf_events); valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); if (!valid_prog_array && hlist_empty(head)) return; /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); if ((valid_prog_array && !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; guard(mutex)(&syscall_trace_lock); if (!sys_perf_refcount_exit) { int ret = register_trace_sys_exit(perf_syscall_exit, NULL); if (ret) { pr_info("event trace: Could not activate syscall exit trace point"); return ret; } } set_bit(num, enabled_perf_exit_syscalls); sys_perf_refcount_exit++; return 0; } static void perf_sysexit_disable(struct trace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; guard(mutex)(&syscall_trace_lock); sys_perf_refcount_exit--; clear_bit(num, enabled_perf_exit_syscalls); if (!sys_perf_refcount_exit) unregister_trace_sys_exit(perf_syscall_exit, NULL); } #endif /* CONFIG_PERF_EVENTS */ static int syscall_enter_register(struct trace_event_call *event, enum trace_reg type, void *data) { struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: return reg_event_syscall_enter(file, event); case 
TRACE_REG_UNREGISTER: unreg_event_syscall_enter(file, event); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: return perf_sysenter_enable(event); case TRACE_REG_PERF_UNREGISTER: perf_sysenter_disable(event); return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: case TRACE_REG_PERF_DEL: return 0; #endif } return 0; } static int syscall_exit_register(struct trace_event_call *event, enum trace_reg type, void *data) { struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: return reg_event_syscall_exit(file, event); case TRACE_REG_UNREGISTER: unreg_event_syscall_exit(file, event); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: return perf_sysexit_enable(event); case TRACE_REG_PERF_UNREGISTER: perf_sysexit_disable(event); return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: case TRACE_REG_PERF_DEL: return 0; #endif } return 0; }