linux/kernel/trace/trace_syscalls.c

1504 lines
36 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/kernel_stat.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <linux/xarray.h>
#include <asm/syscall.h>
#include "trace_output.h"
#include "trace.h"
/*
 * Held while syscall events are registered or unregistered: it covers the
 * enter/exit refcounts, the per-syscall enable tables and the setup and
 * teardown of syscall_buffer.
 */
static DEFINE_MUTEX(syscall_trace_lock);
/*
 * The .reg callbacks are referenced by the event classes before they are
 * defined further down in this file.
 */
static int syscall_enter_register(struct trace_event_call *event,
enum trace_reg type, void *data);
static int syscall_exit_register(struct trace_event_call *event,
enum trace_reg type, void *data);
/*
 * .get_fields callback of the syscall enter event class: the field list
 * is kept in the syscall metadata that @call->data points to.
 */
static struct list_head *
syscall_get_enter_fields(struct trace_event_call *call)
{
	return &((struct syscall_metadata *)call->data)->enter_fields;
}
/* Bounds of the array of syscall metadata pointers scanned by find_syscall_meta() */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];
/*
 * Syscall number to metadata mapping. The xarray is used when
 * CONFIG_HAVE_SPARSE_SYSCALL_NR is enabled, otherwise the flat array
 * allocated by init_ftrace_syscalls() is used.
 */
static DEFINE_XARRAY(syscalls_metadata_sparse);
static struct syscall_metadata **syscalls_metadata;
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Generic comparison of a kallsyms symbol name against a metadata name.
 *
 * The first three characters of both strings are skipped: archs that use
 * syscall wrappers may have symbol aliases prefixed with ".SyS" or ".sys"
 * instead of "sys", and comparing the prefix would produce an unwanted
 * mismatch.
 *
 * Returns true when the remainders of both names are identical.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	const char *sym_tail = sym + 3;
	const char *name_tail = name + 3;

	return strcmp(sym_tail, name_tail) == 0;
}
#endif
/*
 * Some architectures that run 32bit applications on a 64bit kernel do
 * not number the syscalls of 32bit tasks the same way as those of
 * 64bit tasks (x86 being the usual suspect).
 *
 * Instead of reporting the wrong syscall in that case, an arch can
 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS together with the function
 * arch_trace_is_compat_syscall(). Compat syscalls are then reported as
 * -1, which the callers in this file treat as "nothing to trace".
 */
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
	if (unlikely(arch_trace_is_compat_syscall(regs)))
		return -1;
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
	return syscall_get_nr(task, regs);
}
/*
 * find_syscall_meta - find the metadata that describes a syscall handler
 * @syscall: address of the handler, as returned by arch_syscall_addr()
 *
 * Resolves @syscall to a symbol name and scans the metadata pointers
 * between __start_syscalls_metadata and __stop_syscalls_metadata for an
 * entry whose name matches it.
 *
 * Returns the matching metadata, or NULL when the address cannot be
 * resolved to a symbol, resolves to sys_ni_syscall, or has no metadata.
 */
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];

	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;

	/*
	 * Without a resolved symbol there is nothing to match against, and
	 * the name comparison skips a prefix of the symbol, which would
	 * step over the terminator of an empty name.
	 */
	if (!kallsyms_lookup(syscall, NULL, NULL, NULL, str))
		return NULL;

	/* Unimplemented syscalls have no metadata of their own */
	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
		return NULL;

	for ( ; start < stop; start++) {
		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
			return *start;
	}
	return NULL;
}
/*
 * Map a syscall number to its metadata.
 *
 * Returns NULL when @nr is out of range, when the flat table has not been
 * allocated, or when nothing is stored for @nr.
 */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
		if (syscalls_metadata && nr >= 0 && nr < NR_syscalls)
			return syscalls_metadata[nr];
		return NULL;
	}

	return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
}
/*
 * get_syscall_name - name of the syscall with number @syscall
 *
 * Returns the name stored in the syscall's metadata, or NULL when no
 * metadata is known for @syscall.
 */
const char *get_syscall_name(int syscall)
{
	struct syscall_metadata *meta = syscall_nr_to_meta(syscall);

	return meta ? meta->name : NULL;
}
/*
 * Marker for truncated user space data: appended to recorded strings by
 * sys_fault_user() and printed by print_syscall_enter() when a string or
 * array hit the max limit.
 */
#define EXTRA "..."
/*
 * print_syscall_enter - text output handler for syscall enter events
 * @iter: trace iterator supplying the entry, the output sequence and the
 *        trace_array
 * @flags: not used by this handler
 * @event: not used by this handler
 *
 * Emits "name(arg: value, ...)" for the recorded syscall. Values below 10
 * are printed in decimal, all others in hex, and each argument is preceded
 * by its type when the instance has TRACE_ITER_VERBOSE set.
 *
 * Arguments set in the metadata's user_mask are followed by the user space
 * data stored in the event: either as a quoted string, or (for sized data
 * that is not flagged as a string) as a ':' separated hex dump followed by
 * an ASCII rendering if at least one byte is printable.
 *
 * When no metadata exists for the syscall number, or the event type does
 * not match the metadata's enter event, only a newline is emitted.
 *
 * Returns the result of trace_handle_return() for the output sequence.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
int i, syscall, val, len;
unsigned char *ptr;
/* Byte offset of the next 4 byte meta word behind the static args */
int offset = 0;
trace = (typeof(trace))ent;
syscall = trace->nr;
entry = syscall_nr_to_meta(syscall);
if (!entry)
goto end;
if (entry->enter_event->event.type != ent->type) {
WARN_ON_ONCE(1);
goto end;
}
trace_seq_printf(s, "%s(", entry->name);
for (i = 0; i < entry->nb_args; i++) {
bool printable = false;
char *str;
if (trace_seq_has_overflowed(s))
goto end;
if (i)
trace_seq_puts(s, ", ");
/* parameter types */
if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
if (trace->args[i] < 10)
trace_seq_printf(s, "%s: %lu", entry->args[i],
trace->args[i]);
else
trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
trace->args[i]);
/* Only args flagged in user_mask have user space data recorded */
if (!(BIT(i) & entry->user_mask))
continue;
/* This arg points to a user space string */
ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
val = *(int *)ptr;
/* The value is a dynamic string (len << 16 | offset) */
ptr = (void *)ent + (val & 0xffff);
len = val >> 16;
offset += 4;
if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
trace_seq_printf(s, " \"%.*s\"", len, ptr);
continue;
}
/*
 * The value of the syscall's size argument; it is compared against the
 * recorded length below to tell whether the data was cut short.
 */
val = trace->args[entry->user_arg_size];
str = ptr;
trace_seq_puts(s, " (");
for (int x = 0; x < len; x++, ptr++) {
if (isascii(*ptr) && isprint(*ptr))
printable = true;
if (x)
trace_seq_putc(s, ':');
trace_seq_printf(s, "%02x", *ptr);
}
if (len < val)
trace_seq_printf(s, ", %s", EXTRA);
trace_seq_putc(s, ')');
/* If nothing is printable, don't bother printing anything */
if (!printable)
continue;
/* Second pass: the same bytes as text, '.' for anything unprintable */
trace_seq_puts(s, " \"");
for (int x = 0; x < len; x++) {
if (isascii(str[x]) && isprint(str[x]))
trace_seq_putc(s, str[x]);
else
trace_seq_putc(s, '.');
}
if (len < val)
trace_seq_printf(s, "\"%s", EXTRA);
else
trace_seq_putc(s, '"');
}
trace_seq_putc(s, ')');
end:
trace_seq_putc(s, '\n');
return trace_handle_return(s);
}
static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
struct syscall_trace_exit *trace;
int syscall;
struct syscall_metadata *entry;
trace = (typeof(trace))ent;
syscall = trace->nr;
entry = syscall_nr_to_meta(syscall);
if (!entry) {
trace_seq_putc(s, '\n');
goto out;
}
if (entry->exit_event->event.type != ent->type) {
WARN_ON_ONCE(1);
return TRACE_TYPE_UNHANDLED;
}
trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
trace->ret);
out:
return trace_handle_return(s);
}
/*
 * Initializer for one statically described event field: @_type and @_name
 * are stringified, while size, alignment and signedness are derived from
 * @_type. The filter type is always FILTER_OTHER.
 */
#define SYSCALL_FIELD(_type, _name) { \
.type = #_type, .name = #_name, \
.size = sizeof(_type), .align = __alignof__(_type), \
.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }
/*
 * __set_enter_print_fmt - build the print_fmt of a syscall enter event
 * @entry: syscall metadata supplying the argument names and the user_mask
 * @buf: destination buffer, may be NULL when @len is 0
 * @len: size of @buf, or 0 to only calculate the needed length
 *
 * The string consists of a quoted format with one "<arg>: 0x%0<N>lx" per
 * argument (<N> being sizeof(unsigned long)), followed by one
 * "((unsigned long)(REC-><arg>))" per argument. Arguments set in user_mask
 * additionally get a %s conversion in the format, fed by __get_str() for
 * strings or __print_dynamic_array() for arrays on "__<arg>_val".
 *
 * Returns the length of the complete string, not counting the
 * terminating nul.
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
bool is_string = entry->user_arg_is_str;
int i;
int pos = 0;
/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)
/* First half: the quoted format string */
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
for (i = 0; i < entry->nb_args; i++) {
if (i)
pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
entry->args[i], sizeof(unsigned long));
if (!(BIT(i) & entry->user_mask))
continue;
/* Add the format for the user space string or array */
if (entry->user_arg_size < 0 || is_string)
pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
else
pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
/* Second half: the arguments consumed by the format above, in order */
for (i = 0; i < entry->nb_args; i++) {
pos += snprintf(buf + pos, LEN_OR_ZERO,
", ((unsigned long)(REC->%s))", entry->args[i]);
if (!(BIT(i) & entry->user_mask))
continue;
/* The user space data for arg has name __<arg>_val */
if (entry->user_arg_size < 0 || is_string) {
pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
entry->args[i]);
} else {
pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
entry->args[i]);
}
}
#undef LEN_OR_ZERO
/* return the length of print_fmt */
return pos;
}
/*
 * set_syscall_print_fmt - install the print_fmt of a syscall event
 * @call: the syscall enter or exit event
 *
 * Exit events share one constant format. The format of an enter event
 * depends on the syscall's arguments and is built into a freshly
 * allocated buffer (released by free_syscall_print_fmt()).
 *
 * Returns 0 on success or -ENOMEM if the buffer cannot be allocated.
 */
static int __init set_syscall_print_fmt(struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	char *fmt;
	int needed;

	if (call != meta->enter_event) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* A dry run without a buffer yields the length; add the nul byte */
	needed = __set_enter_print_fmt(meta, NULL, 0) + 1;

	fmt = kmalloc(needed, GFP_KERNEL);
	if (!fmt)
		return -ENOMEM;

	/* Now write the format for real */
	__set_enter_print_fmt(meta, fmt, needed);
	call->print_fmt = fmt;

	return 0;
}
/*
 * Release the print_fmt installed by set_syscall_print_fmt(). Only enter
 * events own an allocated format; the exit format is a string constant.
 */
static void __init free_syscall_print_fmt(struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	if (call != meta->enter_event)
		return;

	kfree(call->print_fmt);
}
/*
 * syscall_enter_define_fields - define the per-syscall fields of an enter event
 * @call: the syscall enter event
 *
 * Defines one field of sizeof(unsigned long) per syscall argument,
 * starting at the offset of the args[] member of struct
 * syscall_trace_enter, and then one 4 byte "__data_loc char[]" field named
 * "__<arg>_val" for every argument set in the metadata's user_mask.
 *
 * Returns 0 on success, the error of trace_define_field(), or -ENOMEM when
 * a field name cannot be allocated (user_mask is cleared in that case).
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
unsigned long mask;
char *arg;
int offset = offsetof(typeof(trace), args);
int ret = 0;
int len;
int i;
for (i = 0; i < meta->nb_args; i++) {
ret = trace_define_field(call, meta->types[i],
meta->args[i], offset,
sizeof(unsigned long), 0,
FILTER_OTHER);
if (ret)
break;
offset += sizeof(unsigned long);
}
if (ret || !meta->user_mask)
return ret;
mask = meta->user_mask;
/* Walk the set bits of user_mask from lowest to highest */
while (mask) {
int idx = ffs(mask) - 1;
mask &= ~BIT(idx);
/*
* User space data is faulted into a temporary buffer and then
* added as a dynamic string or array to the end of the event.
* The user space data name for the arg pointer is
* "__<arg>_val".
*/
/* sizeof("___val") covers the "__" prefix, the "_val" suffix and the nul */
len = strlen(meta->args[idx]) + sizeof("___val");
arg = kmalloc(len, GFP_KERNEL);
if (WARN_ON_ONCE(!arg)) {
meta->user_mask = 0;
return -ENOMEM;
}
snprintf(arg, len, "__%s_val", meta->args[idx]);
/*
 * NOTE(review): @arg is only freed when the definition fails;
 * presumably the defined field keeps referencing it - confirm.
 */
ret = trace_define_field(call, "__data_loc char[]",
arg, offset, sizeof(int), 0,
FILTER_OTHER);
if (ret) {
kfree(arg);
break;
}
offset += 4;
}
return ret;
}
/*
 * Sizes of the per CPU temporary buffer that user space memory is
 * copied into.
 *
 * SYSCALL_FAULT_USER_MAX is the maximum amount to copy from user space
 * for one argument (defined in kernel/trace/trace.h).
 *
 * SYSCALL_FAULT_ARG_SZ is the space reserved for one argument: the data
 * copied from user space plus the terminating nul byte and a possibly
 * appended EXTRA (4 bytes).
 *
 * SYSCALL_FAULT_MAX_CNT is the maximum number of arguments of a single
 * system call that may be copied from user space.
 *
 * SYSCALL_FAULT_BUF_SZ is the size of the per CPU buffer: one
 * SYSCALL_FAULT_ARG_SZ slot for each of the SYSCALL_FAULT_MAX_CNT
 * arguments.
 */
#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
#define SYSCALL_FAULT_MAX_CNT 3
#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
/* Use the tracing per CPU buffer infrastructure to copy from user space */
struct syscall_user_buffer {
/* State handed to the trace_user_fault_*() helpers */
struct trace_user_buf_info buf;
/* Used by syscall_fault_buffer_disable() to defer freeing via call_rcu_tasks_trace() */
struct rcu_head rcu;
};
/*
 * The buffer shared by all syscall events that read user space memory.
 * It is written under syscall_trace_lock and read locklessly with
 * READ_ONCE() by syscall_get_data(); NULL means no buffer is available.
 */
static struct syscall_user_buffer *syscall_buffer;
/*
 * syscall_fault_buffer_enable - take a reference on the user fault buffer
 *
 * The first user allocates and initializes the buffer and publishes it in
 * syscall_buffer; later users only take another reference on it.
 *
 * Must be called with syscall_trace_lock held.
 *
 * Returns 0 on success, -ENOMEM or the error of trace_user_fault_init().
 */
static int syscall_fault_buffer_enable(void)
{
	struct syscall_user_buffer *new_buf;
	int err;

	lockdep_assert_held(&syscall_trace_lock);

	if (!syscall_buffer) {
		new_buf = kmalloc(sizeof(*new_buf), GFP_KERNEL);
		if (!new_buf)
			return -ENOMEM;

		err = trace_user_fault_init(&new_buf->buf, SYSCALL_FAULT_BUF_SZ);
		if (err < 0) {
			kfree(new_buf);
			return err;
		}

		/* Publish only once it is fully set up */
		WRITE_ONCE(syscall_buffer, new_buf);
		return 0;
	}

	trace_user_fault_get(&syscall_buffer->buf);
	return 0;
}
/*
 * RCU callback queued by syscall_fault_buffer_disable(): tear down and
 * free the syscall_user_buffer that embeds @rcu.
 */
static void rcu_free_syscall_buffer(struct rcu_head *rcu)
{
	struct syscall_user_buffer *dead;

	dead = container_of(rcu, struct syscall_user_buffer, rcu);

	trace_user_fault_destroy(&dead->buf);
	kfree(dead);
}
/*
 * syscall_fault_buffer_disable - drop a reference on the user fault buffer
 *
 * When trace_user_fault_put() returns zero, syscall_buffer is cleared and
 * the buffer is handed to call_rcu_tasks_trace() to be freed. A non-zero
 * return leaves the buffer in place.
 *
 * Must be called with syscall_trace_lock held.
 */
static void syscall_fault_buffer_disable(void)
{
	struct syscall_user_buffer *old;

	lockdep_assert_held(&syscall_trace_lock);

	old = syscall_buffer;
	if (!trace_user_fault_put(&old->buf)) {
		WRITE_ONCE(syscall_buffer, NULL);
		call_rcu_tasks_trace(&old->rcu, rcu_free_syscall_buffer);
	}
}
/* Passed by sys_fault_user() to the copy callbacks through their @data pointer */
struct syscall_args {
/* User space addresses to read, one per used slot */
char *ptr_array[SYSCALL_FAULT_MAX_CNT];
/* Per slot result stored by the copy callback; negative means the read failed */
int read[SYSCALL_FAULT_MAX_CNT];
/* Number of slots in use */
int uargs;
};
/*
 * Copy callback for string arguments.
 * @buf: start of the temporary buffer
 * @ptr: ignored, the user addresses are taken from @data
 * @size: maximum number of bytes to read per string
 * @data: the struct syscall_args describing what to read
 *
 * Every user pointer is read with strncpy_from_user() into its own
 * SYSCALL_FAULT_ARG_SZ sized slot of @buf, and the result of each read is
 * recorded in the read[] array. Always returns 0.
 */
static int syscall_copy_user(char *buf, const char __user *ptr,
			     size_t size, void *data)
{
	struct syscall_args *sargs = data;
	int idx;

	for (idx = 0; idx < sargs->uargs; idx++) {
		char *slot = buf + idx * SYSCALL_FAULT_ARG_SZ;
		int copied;

		ptr = (char __user *)sargs->ptr_array[idx];
		copied = strncpy_from_user(slot, ptr, size);
		sargs->read[idx] = copied;
	}

	return 0;
}
/*
 * Copy callback for sized (array) arguments.
 * @buf: start of the temporary buffer
 * @ptr: ignored, the user addresses are taken from @data
 * @size: number of bytes to read per array
 * @data: the struct syscall_args describing what to read
 *
 * Every user pointer is read with __copy_from_user() into its own
 * SYSCALL_FAULT_ARG_SZ sized slot of @buf. The read[] entry of a slot is
 * set to @size if the copy succeeded and to -1 otherwise.
 * Always returns 0.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
				   size_t size, void *data)
{
	struct syscall_args *sargs = data;
	int idx;

	for (idx = 0; idx < sargs->uargs; idx++) {
		char *slot = buf + idx * SYSCALL_FAULT_ARG_SZ;
		int failed;

		ptr = (char __user *)sargs->ptr_array[idx];
		failed = __copy_from_user(slot, ptr, size);
		if (failed)
			sargs->read[idx] = -1;
		else
			sargs->read[idx] = size;
	}

	return 0;
}
/*
 * sys_fault_user - read the user space data of a syscall into the buffer
 * @buf_size: maximum number of bytes to record per argument, 0 means do
 *            not read anything
 * @sys_data: syscall metadata (user_mask and user_arg_size are used)
 * @sbuf: the user fault buffer to read into
 * @args: the syscall arguments
 * @data_size: set to the number of bytes to record for each user argument
 *             that was read; entries beyond the last user argument are set
 *             to -1, entries of failed reads are left untouched
 *
 * Each user argument gets its own SYSCALL_FAULT_ARG_SZ sized slot in the
 * returned buffer. Without a size argument the data is read as a string,
 * non-printable characters are replaced by '.', and EXTRA is appended if
 * the string is longer than the allowed size. With a size argument the
 * data is read as an array of that size, capped at the slot size and then
 * at @buf_size.
 *
 * Returns the buffer holding the data, or NULL if @buf_size is 0 or
 * trace_user_fault_read() returned NULL.
 */
static char *sys_fault_user(unsigned int buf_size,
struct syscall_metadata *sys_data,
struct syscall_user_buffer *sbuf,
unsigned long *args,
unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
trace_user_buf_copy syscall_copy = syscall_copy_user;
unsigned long mask = sys_data->user_mask;
unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
struct syscall_args sargs;
bool array = false;
char *buffer;
char *buf;
int ret;
int i = 0;
/* The extra is appended to the user data in the buffer */
BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
SYSCALL_FAULT_ARG_SZ);
/*
* If this system call event has a size argument, use
* it to define how much of user space memory to read,
* and read it as an array and not a string.
*/
if (sys_data->user_arg_size >= 0) {
array = true;
size = args[sys_data->user_arg_size];
if (size > SYSCALL_FAULT_ARG_SZ - 1)
size = SYSCALL_FAULT_ARG_SZ - 1;
syscall_copy = syscall_copy_user_array;
}
/* Collect the user pointers in the order of their bits in user_mask */
while (mask) {
int idx = ffs(mask) - 1;
mask &= ~BIT(idx);
if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
break;
/* Get the pointer to user space memory to read */
sargs.ptr_array[i++] = (char *)args[idx];
}
sargs.uargs = i;
/* Clear the values that are not used */
for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
data_size[i] = -1; /* Denotes no pointer */
}
/* A zero size means do not even try */
if (!buf_size)
return NULL;
buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
syscall_copy, &sargs);
if (!buffer)
return NULL;
buf = buffer;
for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
ret = sargs.read[i];
/* A failed read leaves data_size[i] as the caller set it */
if (ret < 0)
continue;
buf[ret] = '\0';
/* For strings, replace any non-printable characters with '.' */
if (!array) {
for (int x = 0; x < ret; x++) {
if (!isprint(buf[x]))
buf[x] = '.';
}
size = min(buf_size, SYSCALL_FAULT_USER_MAX);
/*
* If the text was truncated due to our max limit,
* add "..." to the string.
*/
if (ret > size) {
strscpy(buf + size, EXTRA, sizeof(EXTRA));
/* sizeof(EXTRA) includes the terminating nul */
ret = size + sizeof(EXTRA);
} else {
/* Record the terminating nul as part of the data */
buf[ret++] = '\0';
}
} else {
ret = min((unsigned int)ret, buf_size);
}
data_size[i] = ret;
}
return buffer;
}
/*
 * syscall_get_data - read the user space data for a syscall enter event
 * @sys_data: metadata of the syscall
 * @args: the syscall arguments
 * @buffer: set to the buffer holding the data (may be set to NULL)
 * @size: increased by the space the user data needs in the event
 * @user_sizes: filled with the number of bytes to record per user argument
 * @uargs: set to the number of user arguments of this syscall
 * @buf_size: maximum number of bytes to record per argument
 *
 * Returns 0 on success, or -1 if there is no syscall_buffer, which
 * means that tracing is being shut down.
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
		 char **buffer, int *size, int *user_sizes, int *uargs,
		 int buf_size)
{
	struct syscall_user_buffer *sbuf = READ_ONCE(syscall_buffer);
	int cnt = 0;

	if (!sbuf)
		return -1;

	*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);

	/*
	 * Every user argument appends its data to the event, plus a
	 * 4 byte meta field that holds the location and the size of that
	 * data. A negative size marks the first unused slot.
	 */
	while (cnt < SYSCALL_FAULT_MAX_CNT && user_sizes[cnt] >= 0) {
		*size += user_sizes[cnt] + 4;
		cnt++;
	}

	/* Save the number of user read arguments of this syscall */
	*uargs = cnt;

	return 0;
}
/*
 * syscall_put_data - append the user space data to a syscall enter event
 * @sys_data: metadata of the syscall
 * @entry: the event to fill; its args[] are followed by the meta fields
 * @buffer: the buffer filled by sys_fault_user(), one SYSCALL_FAULT_ARG_SZ
 *          sized slot per user argument
 * @size: not used by this function
 * @user_sizes: number of bytes to copy for each user argument
 * @uargs: number of user arguments
 *
 * Behind the static arguments the event holds one 4 byte meta field per
 * user argument, followed by the data of all user arguments back to back.
 * Each meta field stores the offset of its data from the start of the
 * event in the low 16 bits and the size of the data in the bits above.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
struct syscall_trace_enter *entry,
char *buffer, int size, int *user_sizes, int uargs)
{
char *buf = buffer;
void *ptr;
int val;
/*
* Set the pointer to point to the meta data of the event
* that has information about the stored user space memory.
*/
ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
/*
* The meta data will store the offset of the user data from
* the beginning of the event. That is after the static arguments
* and the meta data fields.
*/
val = (ptr - (void *)entry) + 4 * uargs;
for (int i = 0; i < uargs; i++) {
/* The data of this argument starts where the previous one ends */
if (i)
val += user_sizes[i - 1];
/* Store the offset and the size into the meta data */
*(int *)ptr = val | (user_sizes[i] << 16);
/* Skip the meta data */
ptr += 4;
}
/* ptr now points right behind the meta fields, where the data goes */
for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
/* Nothing to do if the user space was empty or faulted */
if (!user_sizes[i])
continue;
memcpy(ptr, buf, user_sizes[i]);
ptr += user_sizes[i];
}
}
/*
 * ftrace_syscall_enter - sys_enter tracepoint probe of a trace instance
 * @data: the trace_array the probe was registered with
 * @regs: registers of the task entering the syscall
 * @id: not used, the syscall number is taken from @regs
 *
 * Records a syscall enter event in the ring buffer of @data if the event
 * of this syscall is enabled there. The event holds the syscall number,
 * the syscall arguments and, for syscalls with a user_mask, the user space
 * data those arguments point to.
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
struct trace_event_file *trace_file;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct trace_event_buffer fbuffer;
unsigned long args[6];
char *user_ptr;
/* Zeroed so that user args that fail to be read are recorded with size 0 */
int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
int syscall_nr;
int size = 0;
int uargs = 0;
bool mayfault;
/*
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
might_fault();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
if (!trace_file)
return;
if (trace_trigger_soft_disabled(trace_file))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
/* Check if this syscall event faults in user space memory */
mayfault = sys_data->user_mask != 0;
/* Preemption stays disabled from here until the function returns */
guard(preempt_notrace)();
syscall_get_arguments(current, regs, args);
if (mayfault) {
if (syscall_get_data(sys_data, args, &user_ptr,
&size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
return;
}
/* size already holds the space needed for the user data, if any */
size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
return;
entry = ring_buffer_event_data(fbuffer.event);
entry->nr = syscall_nr;
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
if (mayfault)
syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);
trace_event_buffer_commit(&fbuffer);
}
/*
 * ftrace_syscall_exit - sys_exit tracepoint probe of a trace instance
 * @data: the trace_array the probe was registered with
 * @regs: registers of the task leaving the syscall
 * @ret: not used, the return value is taken from @regs
 *
 * Records the syscall number and its return value in the ring buffer of
 * @data if the exit event of this syscall is enabled there.
 */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
	struct trace_array *tr = data;
	struct trace_event_buffer fbuffer;
	struct trace_event_file *file;
	struct syscall_trace_exit *rec;
	int nr;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	nr = trace_get_syscall_nr(current, regs);
	if (nr < 0 || nr >= NR_syscalls)
		return;

	file = READ_ONCE(tr->exit_syscall_files[nr]);
	if (!file || trace_trigger_soft_disabled(file))
		return;

	/* Syscalls without metadata are not recorded */
	if (!syscall_nr_to_meta(nr))
		return;

	if (!trace_event_buffer_reserve(&fbuffer, file, sizeof(*rec)))
		return;

	rec = ring_buffer_event_data(fbuffer.event);
	rec->nr = nr;
	rec->ret = syscall_get_return_value(current, regs);

	trace_event_buffer_commit(&fbuffer);
}
/*
 * reg_event_syscall_enter - enable a syscall enter event in a trace instance
 * @file: the event file of the instance
 * @call: the syscall enter event
 *
 * Takes a reference on the user fault buffer if the syscall reads user
 * space memory, registers the sys_enter probe for the first enabled enter
 * event of the instance and publishes @file for the syscall.
 *
 * Returns 0 on success, -ENOSYS for an invalid syscall number, or the
 * error of the buffer setup or of the probe registration.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
				   struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	struct trace_array *tr = file->tr;
	int nr = meta->syscall_nr;
	int err;

	if (WARN_ON_ONCE(nr < 0 || nr >= NR_syscalls))
		return -ENOSYS;

	guard(mutex)(&syscall_trace_lock);

	if (meta->user_mask) {
		err = syscall_fault_buffer_enable();
		if (err < 0)
			return err;
	}

	/* Only the first enter event of this instance registers the probe */
	err = tr->sys_refcount_enter ?
		0 : register_trace_sys_enter(ftrace_syscall_enter, tr);
	if (err < 0) {
		/* Give back the buffer reference taken above */
		if (meta->user_mask)
			syscall_fault_buffer_disable();
		return err;
	}

	WRITE_ONCE(tr->enter_syscall_files[nr], file);
	tr->sys_refcount_enter++;

	return 0;
}
/*
 * unreg_event_syscall_enter - disable a syscall enter event in a trace instance
 * @file: the event file of the instance
 * @call: the syscall enter event
 *
 * Undoes reg_event_syscall_enter(): clears the file of the syscall,
 * unregisters the sys_enter probe when the last enter event of the
 * instance goes away, and drops the reference on the user fault buffer if
 * the syscall reads user space memory.
 */
static void unreg_event_syscall_enter(struct trace_event_file *file,
				      struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	struct trace_array *tr = file->tr;
	int nr = meta->syscall_nr;

	if (WARN_ON_ONCE(nr < 0 || nr >= NR_syscalls))
		return;

	guard(mutex)(&syscall_trace_lock);

	tr->sys_refcount_enter--;
	WRITE_ONCE(tr->enter_syscall_files[nr], NULL);

	if (tr->sys_refcount_enter == 0)
		unregister_trace_sys_enter(ftrace_syscall_enter, tr);

	if (meta->user_mask)
		syscall_fault_buffer_disable();
}
static int reg_event_syscall_exit(struct trace_event_file *file,
struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int ret = 0;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!tr->sys_refcount_exit)
ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
if (!ret) {
WRITE_ONCE(tr->exit_syscall_files[num], file);
tr->sys_refcount_exit++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
static void unreg_event_syscall_exit(struct trace_event_file *file,
struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_exit--;
WRITE_ONCE(tr->exit_syscall_files[num], NULL);
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
}
/*
 * check_faultable_syscall - mark the user space pointers of a syscall
 * @call: the syscall event, only enter events are processed
 * @nr: the syscall number
 *
 * For system calls that reference user space memory that can be recorded
 * into the event, set the system call meta data's user_mask to the bitmask
 * of the "args" indexes that point to the user space memory to retrieve.
 *
 * user_arg_size is set to the index of the argument holding the size of
 * the user space memory, or to -1 if there is none, in which case the
 * memory is read as a string. user_arg_is_str is set for sized memory
 * that is still to be treated as a string.
 *
 * Syscalls that are not listed get a user_mask of 0.
 */
static void check_faultable_syscall(struct trace_event_call *call, int nr)
{
struct syscall_metadata *sys_data = call->data;
unsigned long mask;
/* Only work on entry */
if (sys_data->enter_event != call)
return;
sys_data->user_arg_size = -1;
switch (nr) {
/* user arg 1 with size arg at 2 */
case __NR_write:
case __NR_mq_timedsend:
case __NR_pwrite64:
sys_data->user_mask = BIT(1);
sys_data->user_arg_size = 2;
break;
/* user arg 0 with size arg at 1 as string */
case __NR_setdomainname:
case __NR_sethostname:
sys_data->user_mask = BIT(0);
sys_data->user_arg_size = 1;
sys_data->user_arg_is_str = 1;
break;
#ifdef __NR_kexec_file_load
/* user arg 4 with size arg at 3 as string */
case __NR_kexec_file_load:
sys_data->user_mask = BIT(4);
sys_data->user_arg_size = 3;
sys_data->user_arg_is_str = 1;
break;
#endif
/* user arg at position 0 */
#ifdef __NR_access
case __NR_access:
#endif
case __NR_acct:
case __NR_chdir:
#ifdef __NR_chown
case __NR_chown:
#endif
#ifdef __NR_chmod
case __NR_chmod:
#endif
case __NR_chroot:
#ifdef __NR_creat
case __NR_creat:
#endif
case __NR_delete_module:
case __NR_execve:
case __NR_fsopen:
#ifdef __NR_lchown
case __NR_lchown:
#endif
#ifdef __NR_open
case __NR_open:
#endif
case __NR_memfd_create:
#ifdef __NR_mkdir
case __NR_mkdir:
#endif
#ifdef __NR_mknod
case __NR_mknod:
#endif
case __NR_mq_open:
case __NR_mq_unlink:
#ifdef __NR_readlink
case __NR_readlink:
#endif
#ifdef __NR_rmdir
case __NR_rmdir:
#endif
case __NR_shmdt:
#ifdef __NR_statfs
case __NR_statfs:
#endif
case __NR_swapon:
case __NR_swapoff:
#ifdef __NR_truncate
case __NR_truncate:
#endif
#ifdef __NR_unlink
case __NR_unlink:
#endif
case __NR_umount2:
#ifdef __NR_utime
case __NR_utime:
#endif
#ifdef __NR_utimes
case __NR_utimes:
#endif
sys_data->user_mask = BIT(0);
break;
/* user arg at position 1 */
case __NR_execveat:
case __NR_faccessat:
case __NR_faccessat2:
case __NR_finit_module:
case __NR_fchmodat:
case __NR_fchmodat2:
case __NR_fchownat:
case __NR_fgetxattr:
case __NR_flistxattr:
case __NR_fsetxattr:
case __NR_fspick:
case __NR_fremovexattr:
#ifdef __NR_futimesat
case __NR_futimesat:
#endif
case __NR_inotify_add_watch:
case __NR_mkdirat:
case __NR_mknodat:
case __NR_mount_setattr:
case __NR_name_to_handle_at:
#ifdef __NR_newfstatat
case __NR_newfstatat:
#endif
case __NR_openat:
case __NR_openat2:
case __NR_open_tree:
case __NR_open_tree_attr:
case __NR_readlinkat:
case __NR_quotactl:
case __NR_syslog:
case __NR_statx:
case __NR_unlinkat:
case __NR_utimensat:
sys_data->user_mask = BIT(1);
break;
/* user arg at position 2 */
case __NR_init_module:
case __NR_fsconfig:
sys_data->user_mask = BIT(2);
break;
/* user arg at position 4 */
case __NR_fanotify_mark:
sys_data->user_mask = BIT(4);
break;
/* 2 user args, 0 and 1 */
case __NR_add_key:
case __NR_getxattr:
case __NR_lgetxattr:
case __NR_lremovexattr:
#ifdef __NR_link
case __NR_link:
#endif
case __NR_listxattr:
case __NR_llistxattr:
case __NR_lsetxattr:
case __NR_pivot_root:
case __NR_removexattr:
#ifdef __NR_rename
case __NR_rename:
#endif
case __NR_request_key:
case __NR_setxattr:
#ifdef __NR_symlink
case __NR_symlink:
#endif
sys_data->user_mask = BIT(0) | BIT(1);
break;
/* 2 user args, 0 and 2 */
case __NR_symlinkat:
sys_data->user_mask = BIT(0) | BIT(2);
break;
/* 2 user args, 1 and 3 */
case __NR_getxattrat:
case __NR_linkat:
case __NR_listxattrat:
case __NR_move_mount:
#ifdef __NR_renameat
case __NR_renameat:
#endif
case __NR_renameat2:
case __NR_removexattrat:
case __NR_setxattrat:
sys_data->user_mask = BIT(1) | BIT(3);
break;
/*
 * 3 user args, 0, 1 and 2.
 * NOTE(review): this used to say "Just dev_name and dir_name, TODO add
 * type", but the mask below selects three args - presumably arg 2 is the
 * type and the TODO is already done; confirm.
 */
case __NR_mount:
sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
break;
default:
sys_data->user_mask = 0;
return;
}
if (sys_data->user_arg_size < 0)
return;
/*
* The user_arg_size can only be used when the system call
* is reading only a single address from user space.
*/
mask = sys_data->user_mask;
/* mask & (mask - 1) is non-zero when more than one bit is set */
if (WARN_ON(mask & (mask - 1)))
sys_data->user_arg_size = -1;
}
/*
 * init_syscall_trace - .raw_init callback of the syscall event classes
 * @call: the syscall enter or exit event to initialize
 *
 * Determines which arguments of the syscall point to user space memory,
 * sets up the print format and initializes the event.
 *
 * Returns the result of trace_event_raw_init(), -ENOSYS if the syscall
 * has no valid syscall number, or -ENOMEM if the print format cannot be
 * set up.
 */
static int __init init_syscall_trace(struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	int nr = meta->syscall_nr;
	int id;

	if (nr < 0 || nr >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
			 meta->name);
		return -ENOSYS;
	}

	check_faultable_syscall(call, nr);

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);
	/* The print format is of no use if the event failed to initialize */
	if (id < 0)
		free_syscall_print_fmt(call);

	return id;
}
/*
 * Fields of syscall enter events: the syscall number, followed by the
 * fields that syscall_enter_define_fields() defines per syscall.
 */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
SYSCALL_FIELD(int, __syscall_nr),
{ .type = TRACE_FUNCTION_TYPE,
.define_fields = syscall_enter_define_fields },
{}
};
/* Text output handlers of the syscall enter and exit events */
struct trace_event_functions enter_syscall_print_funcs = {
.trace = print_syscall_enter,
};
struct trace_event_functions exit_syscall_print_funcs = {
.trace = print_syscall_exit,
};
/*
 * Event class of all syscall enter events. The fields differ per syscall,
 * so they are looked up in the syscall metadata through .get_fields.
 */
struct trace_event_class __refdata event_class_syscall_enter = {
.system = "syscalls",
.reg = syscall_enter_register,
.fields_array = syscall_enter_fields_array,
.get_fields = syscall_get_enter_fields,
.raw_init = init_syscall_trace,
};
/*
 * Event class of all syscall exit events. Every exit event has the same
 * two fields: the syscall number and the return value.
 */
struct trace_event_class __refdata event_class_syscall_exit = {
.system = "syscalls",
.reg = syscall_exit_register,
.fields_array = (struct trace_event_fields[]){
SYSCALL_FIELD(int, __syscall_nr),
SYSCALL_FIELD(long, ret),
{}
},
.fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
.raw_init = init_syscall_trace,
};
/*
 * arch_syscall_addr - address of the handler of syscall @nr
 *
 * Default (weak) implementation that reads the entry from sys_call_table.
 * @nr is not range checked here; init_ftrace_syscalls() only passes
 * values in [0, NR_syscalls).
 */
unsigned long __init __weak arch_syscall_addr(int nr)
{
return (unsigned long)sys_call_table[nr];
}
/*
 * init_ftrace_syscalls - build the syscall number to metadata mapping
 *
 * Looks up the metadata of every syscall number by the address of its
 * handler, stores the number in the metadata and records the metadata
 * either in the flat syscalls_metadata array or, with
 * CONFIG_HAVE_SPARSE_SYSCALL_NR, in the syscalls_metadata_sparse xarray.
 * Syscalls without metadata are skipped.
 */
void __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	void *old;
	int nr;

	if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
		syscalls_metadata = kcalloc(NR_syscalls,
					    sizeof(*syscalls_metadata),
					    GFP_KERNEL);
		if (!syscalls_metadata) {
			WARN_ON(1);
			return;
		}
	}

	for (nr = 0; nr < NR_syscalls; nr++) {
		meta = find_syscall_meta(arch_syscall_addr(nr));
		if (!meta)
			continue;

		meta->syscall_nr = nr;

		if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
			old = xa_store(&syscalls_metadata_sparse, nr, meta,
				       GFP_KERNEL);
			WARN(xa_is_err(old),
			     "Syscall memory allocation failed\n");
		} else {
			syscalls_metadata[nr] = meta;
		}
	}
}
#ifdef CONFIG_PERF_EVENTS
/* One bit per syscall number: set while perf has the enter/exit event enabled */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
/*
 * Number of perf enabled enter/exit events; the perf probes stay
 * registered while these are non-zero. Modified under syscall_trace_lock.
 */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
/*
 * perf_call_bpf_enter - run the BPF programs attached to a syscall enter event
 * @call: the syscall enter event
 * @regs: registers to hand to the programs
 * @sys_data: metadata of the syscall
 * @rec: the recorded event, source of the syscall number and arguments
 *
 * Builds the context for the programs on the stack: the pointer to @regs
 * in place of the start of the trace entry header, followed by the
 * syscall number and the syscall arguments.
 *
 * Returns the result of trace_call_bpf().
 */
static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
struct syscall_metadata *sys_data,
struct syscall_trace_enter *rec)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
int i;
/* The header must be large enough to be overlaid with the regs pointer */
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
/* NOTE(review): args beyond nb_args are left uninitialized */
for (i = 0; i < sys_data->nb_args; i++)
param.args[i] = rec->args[i];
return trace_call_bpf(call, &param);
}
/*
 * perf_syscall_enter - sys_enter tracepoint probe used by perf
 * @ignore: not used, the probe is registered with NULL data
 * @regs: registers of the task entering the syscall
 * @id: not used, the syscall number is taken from @regs
 *
 * If perf has the enter event of this syscall enabled, builds the event
 * (syscall number, arguments and, for syscalls with a user_mask, the user
 * space data) in a perf trace buffer, runs the attached BPF programs and
 * submits the event to the perf events of this CPU.
 */
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
bool mayfault;
char *user_ptr;
/* Zeroed so that user args that fail to be read are recorded with size 0 */
int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
int syscall_nr;
int rctx;
int size = 0;
int uargs = 0;
/*
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
might_fault();
guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
syscall_get_arguments(current, regs, args);
/* Check if this syscall event faults in user space memory */
mayfault = sys_data->user_mask != 0;
if (mayfault) {
if (syscall_get_data(sys_data, args, &user_ptr,
&size, user_sizes, &uargs, buf_size) < 0)
return;
}
head = this_cpu_ptr(sys_data->enter_event->perf_events);
valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
/* Nothing to do without BPF programs and without perf events on this CPU */
if (!valid_prog_array && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if (mayfault)
syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
/*
 * Drop the event if the BPF programs return 0 or if no perf event is
 * attached on this CPU.
 */
if ((valid_prog_array &&
!perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL);
}
/*
 * perf_sysenter_enable - enable a syscall enter event for perf
 * @call: the syscall enter event
 *
 * Takes a reference on the user fault buffer if the syscall reads user
 * space memory, registers the perf sys_enter probe for the first enabled
 * enter event and marks the syscall in enabled_perf_enter_syscalls.
 *
 * Returns 0 on success, or the error of the buffer setup or of the probe
 * registration.
 */
static int perf_sysenter_enable(struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	int num;
	int ret;

	num = sys_data->syscall_nr;

	guard(mutex)(&syscall_trace_lock);

	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}

	/* Only the first perf enter event registers the probe */
	if (!sys_perf_refcount_enter) {
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
		if (ret) {
			/* Terminate the message so it is emitted as a complete line */
			pr_info("event trace: Could not activate syscall entry trace point\n");
			/* Give back the buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}

	set_bit(num, enabled_perf_enter_syscalls);
	sys_perf_refcount_enter++;

	return 0;
}
/*
 * perf_sysenter_disable - disable a syscall enter event for perf
 * @call: the syscall enter event
 *
 * Undoes perf_sysenter_enable(): clears the syscall's bit, unregisters
 * the perf sys_enter probe when the last enter event goes away, and drops
 * the reference on the user fault buffer if the syscall reads user space
 * memory.
 */
static void perf_sysenter_disable(struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	int nr = meta->syscall_nr;

	guard(mutex)(&syscall_trace_lock);

	sys_perf_refcount_enter--;
	clear_bit(nr, enabled_perf_enter_syscalls);

	if (sys_perf_refcount_enter == 0)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);

	if (meta->user_mask)
		syscall_fault_buffer_disable();
}
/*
 * perf_call_bpf_exit - run the BPF programs attached to a syscall exit event
 * @call: the syscall exit event
 * @regs: registers to hand to the programs
 * @rec: the recorded event, source of the syscall number and return value
 *
 * Builds the context for the programs on the stack: the pointer to @regs
 * in place of the start of the trace entry header, followed by the
 * syscall number and the return value.
 *
 * Returns the result of trace_call_bpf().
 */
static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
struct syscall_trace_exit *rec)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long ret;
} __aligned(8) param;
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
return trace_call_bpf(call, &param);
}
/*
 * perf_syscall_exit - sys_exit tracepoint probe used by perf
 * @ignore: not used, the probe is registered with NULL data
 * @regs: registers of the task leaving the syscall
 * @ret: not used, the return value is taken from @regs
 *
 * If perf has the exit event of this syscall enabled, builds the event
 * (syscall number and return value) in a perf trace buffer, runs the
 * attached BPF programs and submits the event to the perf events of this
 * CPU.
 */
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
/*
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
might_fault();
guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
head = this_cpu_ptr(sys_data->exit_event->perf_events);
valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
/* Nothing to do without BPF programs and without perf events on this CPU */
if (!valid_prog_array && hlist_empty(head))
return;
/* We can probably do that at build time */
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
/*
 * Drop the event if the BPF programs return 0 or if no perf event is
 * attached on this CPU.
 */
if ((valid_prog_array &&
!perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL);
}
/*
 * perf_sysexit_enable - enable a syscall exit event for perf
 * @call: the syscall exit event
 *
 * Registers the perf sys_exit probe for the first enabled exit event and
 * marks the syscall in enabled_perf_exit_syscalls.
 *
 * Returns 0 on success or the error of the probe registration.
 */
static int perf_sysexit_enable(struct trace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	guard(mutex)(&syscall_trace_lock);

	/* Only the first perf exit event registers the probe */
	if (!sys_perf_refcount_exit) {
		int ret = register_trace_sys_exit(perf_syscall_exit, NULL);

		if (ret) {
			/* Terminate the message so it is emitted as a complete line */
			pr_info("event trace: Could not activate syscall exit trace point\n");
			return ret;
		}
	}

	set_bit(num, enabled_perf_exit_syscalls);
	sys_perf_refcount_exit++;

	return 0;
}
static void perf_sysexit_disable(struct trace_event_call *call)
{
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
guard(mutex)(&syscall_trace_lock);
sys_perf_refcount_exit--;
clear_bit(num, enabled_perf_exit_syscalls);
if (!sys_perf_refcount_exit)
unregister_trace_sys_exit(perf_syscall_exit, NULL);
}
#endif /* CONFIG_PERF_EVENTS */
/*
 * syscall_enter_register - .reg callback of the syscall enter event class
 * @event: the syscall enter event
 * @type: the requested operation
 * @data: the trace_event_file for TRACE_REG_REGISTER/UNREGISTER
 *
 * Dispatches (un)registration requests of the tracing and the perf side
 * to their handlers. All other operations need no work here.
 *
 * Returns the result of the register handlers, 0 for everything else.
 */
static int syscall_enter_register(struct trace_event_call *event,
				  enum trace_reg type, void *data)
{
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(file, event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(file, event);
		break;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		break;
	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		break;
#endif
	}

	return 0;
}
/*
 * syscall_exit_register - .reg callback of the syscall exit event class
 * @event: the syscall exit event
 * @type: the requested operation
 * @data: the trace_event_file for TRACE_REG_REGISTER/UNREGISTER
 *
 * Dispatches (un)registration requests of the tracing and the perf side
 * to their handlers. All other operations need no work here.
 *
 * Returns the result of the register handlers, 0 for everything else.
 */
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data)
{
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(file, event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(file, event);
		break;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		break;
	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		break;
#endif
	}

	return 0;
}