mirror of https://github.com/torvalds/linux.git
1504 lines
36 KiB
C
1504 lines
36 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <trace/syscall.h>
|
|
#include <trace/events/syscalls.h>
|
|
#include <linux/kernel_stat.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
|
|
#include <linux/ftrace.h>
|
|
#include <linux/perf_event.h>
|
|
#include <linux/xarray.h>
|
|
#include <asm/syscall.h>
|
|
|
|
#include "trace_output.h"
|
|
#include "trace.h"
|
|
|
|
/*
 * Serializes event registration and unregistration: the reg/unreg helpers
 * in this file take it, and the user fault buffer enable/disable helpers
 * assert that it is held.
 */
static DEFINE_MUTEX(syscall_trace_lock);

/* Callbacks installed as the ->reg hook of the two syscall event classes. */
static int syscall_enter_register(struct trace_event_call *event,
				  enum trace_reg type, void *data);
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data);
|
|
|
|
static struct list_head *
|
|
syscall_get_enter_fields(struct trace_event_call *call)
|
|
{
|
|
struct syscall_metadata *entry = call->data;
|
|
|
|
return &entry->enter_fields;
|
|
}
|
|
|
|
/* Bounds of the array of metadata pointers walked by find_syscall_meta() */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/*
 * Syscall number -> metadata lookup. The xarray is used when
 * CONFIG_HAVE_SPARSE_SYSCALL_NR is enabled, otherwise the flat array
 * (allocated with NR_syscalls entries by init_ftrace_syscalls()) is used.
 */
static DEFINE_XARRAY(syscalls_metadata_sparse);
static struct syscall_metadata **syscalls_metadata;
|
|
|
|
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Default symbol/name matcher.
 *
 * The first three characters of both strings are skipped and only the
 * remainder is compared. Archs that use syscall wrappers may have syscall
 * symbol aliases prefixed with ".SyS" or ".sys" instead of "sys", and
 * comparing the prefix would lead to an unwanted mismatch.
 *
 * Both strings must be at least three characters long.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	const char *sym_tail = sym + 3;
	const char *name_tail = name + 3;

	return strcmp(sym_tail, name_tail) == 0;
}
#endif
|
|
|
|
#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that let 32bit applications run on a 64bit kernel
 * do not map the syscalls of 32bit tasks the same way as those of 64bit
 * tasks.
 *
 *     *cough*x86*cough*
 *
 * Rather than report the wrong syscall for such tasks, ignore them.
 *
 * An arch opts in by defining ARCH_TRACE_IGNORE_COMPAT_SYSCALLS and by
 * providing arch_trace_is_compat_syscall(), which tells the tracing
 * system which syscalls to ignore.
 *
 * Returns the syscall number, or -1 for a compat syscall.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	if (likely(!arch_trace_is_compat_syscall(regs)))
		return syscall_get_nr(task, regs);

	return -1;
}
#else
/* No compat filtering: report whatever syscall_get_nr() says. */
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
|
|
|
|
static __init struct syscall_metadata *
|
|
find_syscall_meta(unsigned long syscall)
|
|
{
|
|
struct syscall_metadata **start;
|
|
struct syscall_metadata **stop;
|
|
char str[KSYM_SYMBOL_LEN];
|
|
|
|
|
|
start = __start_syscalls_metadata;
|
|
stop = __stop_syscalls_metadata;
|
|
kallsyms_lookup(syscall, NULL, NULL, NULL, str);
|
|
|
|
if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
|
|
return NULL;
|
|
|
|
for ( ; start < stop; start++) {
|
|
if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
|
|
return *start;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static struct syscall_metadata *syscall_nr_to_meta(int nr)
|
|
{
|
|
if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
|
|
return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
|
|
|
|
if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
|
|
return NULL;
|
|
|
|
return syscalls_metadata[nr];
|
|
}
|
|
|
|
const char *get_syscall_name(int syscall)
|
|
{
|
|
struct syscall_metadata *entry;
|
|
|
|
entry = syscall_nr_to_meta(syscall);
|
|
if (!entry)
|
|
return NULL;
|
|
|
|
return entry->name;
|
|
}
|
|
|
|
/* Added to user strings or arrays when max limit is reached */
|
|
#define EXTRA "..."
|
|
|
|
static enum print_line_t
|
|
print_syscall_enter(struct trace_iterator *iter, int flags,
|
|
struct trace_event *event)
|
|
{
|
|
struct trace_array *tr = iter->tr;
|
|
struct trace_seq *s = &iter->seq;
|
|
struct trace_entry *ent = iter->ent;
|
|
struct syscall_trace_enter *trace;
|
|
struct syscall_metadata *entry;
|
|
int i, syscall, val, len;
|
|
unsigned char *ptr;
|
|
int offset = 0;
|
|
|
|
trace = (typeof(trace))ent;
|
|
syscall = trace->nr;
|
|
entry = syscall_nr_to_meta(syscall);
|
|
|
|
if (!entry)
|
|
goto end;
|
|
|
|
if (entry->enter_event->event.type != ent->type) {
|
|
WARN_ON_ONCE(1);
|
|
goto end;
|
|
}
|
|
|
|
trace_seq_printf(s, "%s(", entry->name);
|
|
|
|
for (i = 0; i < entry->nb_args; i++) {
|
|
bool printable = false;
|
|
char *str;
|
|
|
|
if (trace_seq_has_overflowed(s))
|
|
goto end;
|
|
|
|
if (i)
|
|
trace_seq_puts(s, ", ");
|
|
|
|
/* parameter types */
|
|
if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
|
|
trace_seq_printf(s, "%s ", entry->types[i]);
|
|
|
|
/* parameter values */
|
|
if (trace->args[i] < 10)
|
|
trace_seq_printf(s, "%s: %lu", entry->args[i],
|
|
trace->args[i]);
|
|
else
|
|
trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
|
|
trace->args[i]);
|
|
|
|
if (!(BIT(i) & entry->user_mask))
|
|
continue;
|
|
|
|
/* This arg points to a user space string */
|
|
ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
|
|
val = *(int *)ptr;
|
|
|
|
/* The value is a dynamic string (len << 16 | offset) */
|
|
ptr = (void *)ent + (val & 0xffff);
|
|
len = val >> 16;
|
|
offset += 4;
|
|
|
|
if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
|
|
trace_seq_printf(s, " \"%.*s\"", len, ptr);
|
|
continue;
|
|
}
|
|
|
|
val = trace->args[entry->user_arg_size];
|
|
|
|
str = ptr;
|
|
trace_seq_puts(s, " (");
|
|
for (int x = 0; x < len; x++, ptr++) {
|
|
if (isascii(*ptr) && isprint(*ptr))
|
|
printable = true;
|
|
if (x)
|
|
trace_seq_putc(s, ':');
|
|
trace_seq_printf(s, "%02x", *ptr);
|
|
}
|
|
if (len < val)
|
|
trace_seq_printf(s, ", %s", EXTRA);
|
|
|
|
trace_seq_putc(s, ')');
|
|
|
|
/* If nothing is printable, don't bother printing anything */
|
|
if (!printable)
|
|
continue;
|
|
|
|
trace_seq_puts(s, " \"");
|
|
for (int x = 0; x < len; x++) {
|
|
if (isascii(str[x]) && isprint(str[x]))
|
|
trace_seq_putc(s, str[x]);
|
|
else
|
|
trace_seq_putc(s, '.');
|
|
}
|
|
if (len < val)
|
|
trace_seq_printf(s, "\"%s", EXTRA);
|
|
else
|
|
trace_seq_putc(s, '"');
|
|
}
|
|
|
|
trace_seq_putc(s, ')');
|
|
end:
|
|
trace_seq_putc(s, '\n');
|
|
|
|
return trace_handle_return(s);
|
|
}
|
|
|
|
static enum print_line_t
|
|
print_syscall_exit(struct trace_iterator *iter, int flags,
|
|
struct trace_event *event)
|
|
{
|
|
struct trace_seq *s = &iter->seq;
|
|
struct trace_entry *ent = iter->ent;
|
|
struct syscall_trace_exit *trace;
|
|
int syscall;
|
|
struct syscall_metadata *entry;
|
|
|
|
trace = (typeof(trace))ent;
|
|
syscall = trace->nr;
|
|
entry = syscall_nr_to_meta(syscall);
|
|
|
|
if (!entry) {
|
|
trace_seq_putc(s, '\n');
|
|
goto out;
|
|
}
|
|
|
|
if (entry->exit_event->event.type != ent->type) {
|
|
WARN_ON_ONCE(1);
|
|
return TRACE_TYPE_UNHANDLED;
|
|
}
|
|
|
|
trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
|
|
trace->ret);
|
|
|
|
out:
|
|
return trace_handle_return(s);
|
|
}
|
|
|
|
/*
 * Initializer for one statically described event field: the type and name
 * are stringified, and size, alignment and signedness are derived from
 * _type.
 */
#define SYSCALL_FIELD(_type, _name) {					\
	.type		= #_type,					\
	.name		= #_name,					\
	.size		= sizeof(_type),				\
	.align		= __alignof__(_type),				\
	.is_signed	= is_signed_type(_type),			\
	.filter_type	= FILTER_OTHER }
|
|
|
|
static int __init
|
|
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
|
|
{
|
|
bool is_string = entry->user_arg_is_str;
|
|
int i;
|
|
int pos = 0;
|
|
|
|
/* When len=0, we just calculate the needed length */
|
|
#define LEN_OR_ZERO (len ? len - pos : 0)
|
|
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
|
|
for (i = 0; i < entry->nb_args; i++) {
|
|
if (i)
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
|
|
entry->args[i], sizeof(unsigned long));
|
|
|
|
if (!(BIT(i) & entry->user_mask))
|
|
continue;
|
|
|
|
/* Add the format for the user space string or array */
|
|
if (entry->user_arg_size < 0 || is_string)
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
|
|
else
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
|
|
}
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
|
|
|
|
for (i = 0; i < entry->nb_args; i++) {
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO,
|
|
", ((unsigned long)(REC->%s))", entry->args[i]);
|
|
if (!(BIT(i) & entry->user_mask))
|
|
continue;
|
|
/* The user space data for arg has name __<arg>_val */
|
|
if (entry->user_arg_size < 0 || is_string) {
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
|
|
entry->args[i]);
|
|
} else {
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
|
|
entry->args[i]);
|
|
}
|
|
}
|
|
|
|
#undef LEN_OR_ZERO
|
|
|
|
/* return the length of print_fmt */
|
|
return pos;
|
|
}
|
|
|
|
/*
 * Install the print_fmt of @call.
 *
 * Exit events share one constant format. Enter events get a format that is
 * allocated and generated from the syscall metadata.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int __init set_syscall_print_fmt(struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	char *fmt;
	int size;

	if (meta->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* Dry run with no buffer to learn the length, plus one for the nul */
	size = __set_enter_print_fmt(meta, NULL, 0) + 1;

	fmt = kmalloc(size, GFP_KERNEL);
	if (!fmt)
		return -ENOMEM;

	/* Now generate the format for real */
	__set_enter_print_fmt(meta, fmt, size);
	call->print_fmt = fmt;

	return 0;
}
|
|
|
|
/*
 * Release the print_fmt of @call. Only enter events own an allocated
 * format; for any other event this does nothing.
 */
static void __init free_syscall_print_fmt(struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	if (meta->enter_event != call)
		return;

	kfree(call->print_fmt);
}
|
|
|
|
/*
 * Define the dynamic fields of a syscall enter event.
 *
 * One unsigned long sized field is defined per syscall argument, starting
 * at the offset of the args array in struct syscall_trace_enter. For every
 * argument flagged in the metadata's user_mask, an extra 4 byte
 * "__data_loc char[]" field named "__<arg>_val" follows.
 *
 * Returns 0 on success, the error from trace_define_field(), or -ENOMEM if
 * a field name cannot be allocated (user_mask is cleared in that case).
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	int offset = offsetof(struct syscall_trace_enter, args);
	unsigned long pending;
	int ret = 0;
	int i;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		if (ret)
			return ret;
		offset += sizeof(unsigned long);
	}

	/*
	 * User space data is faulted into a temporary buffer and then
	 * added as a dynamic string or array to the end of the event.
	 * The user space data name for the arg pointer is
	 * "__<arg>_val".
	 */
	for (pending = meta->user_mask; pending; ) {
		int idx = ffs(pending) - 1;
		char *field_name;
		int name_len;

		pending &= ~BIT(idx);

		/* sizeof() also accounts for the terminating nul */
		name_len = strlen(meta->args[idx]) + sizeof("___val");
		field_name = kmalloc(name_len, GFP_KERNEL);
		if (WARN_ON_ONCE(!field_name)) {
			meta->user_mask = 0;
			return -ENOMEM;
		}

		snprintf(field_name, name_len, "__%s_val", meta->args[idx]);

		ret = trace_define_field(call, "__data_loc char[]",
					 field_name, offset, sizeof(int), 0,
					 FILTER_OTHER);
		if (ret) {
			kfree(field_name);
			return ret;
		}
		offset += 4;
	}

	return ret;
}
|
|
|
|
/*
|
|
* Create a per CPU temporary buffer to copy user space pointers into.
|
|
*
|
|
* SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
|
|
* (defined in kernel/trace/trace.h)
|
|
|
|
* SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
|
|
* nul terminating byte and possibly appended EXTRA (4 bytes).
|
|
*
|
|
* SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
|
|
* to copy memory from user space addresses into that will hold
|
|
* 3 args as only 3 args are allowed to be copied from system calls.
|
|
*/
|
|
#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
|
|
#define SYSCALL_FAULT_MAX_CNT 3
|
|
#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
|
|
|
|
/* Use the tracing per CPU buffer infrastructure to copy from user space */
|
|
struct syscall_user_buffer {
|
|
struct trace_user_buf_info buf;
|
|
struct rcu_head rcu;
|
|
};
|
|
|
|
static struct syscall_user_buffer *syscall_buffer;
|
|
|
|
static int syscall_fault_buffer_enable(void)
|
|
{
|
|
struct syscall_user_buffer *sbuf;
|
|
int ret;
|
|
|
|
lockdep_assert_held(&syscall_trace_lock);
|
|
|
|
if (syscall_buffer) {
|
|
trace_user_fault_get(&syscall_buffer->buf);
|
|
return 0;
|
|
}
|
|
|
|
sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL);
|
|
if (!sbuf)
|
|
return -ENOMEM;
|
|
|
|
ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
|
|
if (ret < 0) {
|
|
kfree(sbuf);
|
|
return ret;
|
|
}
|
|
|
|
WRITE_ONCE(syscall_buffer, sbuf);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void rcu_free_syscall_buffer(struct rcu_head *rcu)
|
|
{
|
|
struct syscall_user_buffer *sbuf =
|
|
container_of(rcu, struct syscall_user_buffer, rcu);
|
|
|
|
trace_user_fault_destroy(&sbuf->buf);
|
|
kfree(sbuf);
|
|
}
|
|
|
|
|
|
static void syscall_fault_buffer_disable(void)
|
|
{
|
|
struct syscall_user_buffer *sbuf = syscall_buffer;
|
|
|
|
lockdep_assert_held(&syscall_trace_lock);
|
|
|
|
if (trace_user_fault_put(&sbuf->buf))
|
|
return;
|
|
|
|
WRITE_ONCE(syscall_buffer, NULL);
|
|
call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
|
|
}
|
|
|
|
struct syscall_args {
|
|
char *ptr_array[SYSCALL_FAULT_MAX_CNT];
|
|
int read[SYSCALL_FAULT_MAX_CNT];
|
|
int uargs;
|
|
};
|
|
|
|
static int syscall_copy_user(char *buf, const char __user *ptr,
|
|
size_t size, void *data)
|
|
{
|
|
struct syscall_args *args = data;
|
|
int ret;
|
|
|
|
for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
|
|
ptr = (char __user *)args->ptr_array[i];
|
|
ret = strncpy_from_user(buf, ptr, size);
|
|
args->read[i] = ret;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Copy callback for arrays.
 *
 * Copies exactly @size bytes from each user address in @data's ptr_array
 * into its own SYSCALL_FAULT_ARG_SZ sized slot of @buf. read[] is set to
 * @size on success and to -1 if the copy failed. The @ptr argument is not
 * used as input. Always returns 0.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
				   size_t size, void *data)
{
	struct syscall_args *args = data;

	for (int i = 0; i < args->uargs; i++) {
		char *slot = buf + i * SYSCALL_FAULT_ARG_SZ;

		ptr = (char __user *)args->ptr_array[i];
		if (__copy_from_user(slot, ptr, size))
			args->read[i] = -1;
		else
			args->read[i] = size;
	}

	return 0;
}
|
|
|
|
static char *sys_fault_user(unsigned int buf_size,
|
|
struct syscall_metadata *sys_data,
|
|
struct syscall_user_buffer *sbuf,
|
|
unsigned long *args,
|
|
unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
|
|
{
|
|
trace_user_buf_copy syscall_copy = syscall_copy_user;
|
|
unsigned long mask = sys_data->user_mask;
|
|
unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
|
|
struct syscall_args sargs;
|
|
bool array = false;
|
|
char *buffer;
|
|
char *buf;
|
|
int ret;
|
|
int i = 0;
|
|
|
|
/* The extra is appended to the user data in the buffer */
|
|
BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
|
|
SYSCALL_FAULT_ARG_SZ);
|
|
|
|
/*
|
|
* If this system call event has a size argument, use
|
|
* it to define how much of user space memory to read,
|
|
* and read it as an array and not a string.
|
|
*/
|
|
if (sys_data->user_arg_size >= 0) {
|
|
array = true;
|
|
size = args[sys_data->user_arg_size];
|
|
if (size > SYSCALL_FAULT_ARG_SZ - 1)
|
|
size = SYSCALL_FAULT_ARG_SZ - 1;
|
|
syscall_copy = syscall_copy_user_array;
|
|
}
|
|
|
|
while (mask) {
|
|
int idx = ffs(mask) - 1;
|
|
mask &= ~BIT(idx);
|
|
|
|
if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
|
|
break;
|
|
|
|
/* Get the pointer to user space memory to read */
|
|
sargs.ptr_array[i++] = (char *)args[idx];
|
|
}
|
|
|
|
sargs.uargs = i;
|
|
|
|
/* Clear the values that are not used */
|
|
for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
|
|
data_size[i] = -1; /* Denotes no pointer */
|
|
}
|
|
|
|
/* A zero size means do not even try */
|
|
if (!buf_size)
|
|
return NULL;
|
|
|
|
buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
|
|
syscall_copy, &sargs);
|
|
if (!buffer)
|
|
return NULL;
|
|
|
|
buf = buffer;
|
|
for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
|
|
|
|
ret = sargs.read[i];
|
|
if (ret < 0)
|
|
continue;
|
|
buf[ret] = '\0';
|
|
|
|
/* For strings, replace any non-printable characters with '.' */
|
|
if (!array) {
|
|
for (int x = 0; x < ret; x++) {
|
|
if (!isprint(buf[x]))
|
|
buf[x] = '.';
|
|
}
|
|
|
|
size = min(buf_size, SYSCALL_FAULT_USER_MAX);
|
|
|
|
/*
|
|
* If the text was truncated due to our max limit,
|
|
* add "..." to the string.
|
|
*/
|
|
if (ret > size) {
|
|
strscpy(buf + size, EXTRA, sizeof(EXTRA));
|
|
ret = size + sizeof(EXTRA);
|
|
} else {
|
|
buf[ret++] = '\0';
|
|
}
|
|
} else {
|
|
ret = min((unsigned int)ret, buf_size);
|
|
}
|
|
data_size[i] = ret;
|
|
}
|
|
|
|
return buffer;
|
|
}
|
|
|
|
static int
|
|
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
|
|
char **buffer, int *size, int *user_sizes, int *uargs,
|
|
int buf_size)
|
|
{
|
|
struct syscall_user_buffer *sbuf;
|
|
int i;
|
|
|
|
/* If the syscall_buffer is NULL, tracing is being shutdown */
|
|
sbuf = READ_ONCE(syscall_buffer);
|
|
if (!sbuf)
|
|
return -1;
|
|
|
|
*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
|
|
/*
|
|
* user_size is the amount of data to append.
|
|
* Need to add 4 for the meta field that points to
|
|
* the user memory at the end of the event and also
|
|
* stores its size.
|
|
*/
|
|
for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
|
|
if (user_sizes[i] < 0)
|
|
break;
|
|
*size += user_sizes[i] + 4;
|
|
}
|
|
/* Save the number of user read arguments of this syscall */
|
|
*uargs = i;
|
|
return 0;
|
|
}
|
|
|
|
static void syscall_put_data(struct syscall_metadata *sys_data,
|
|
struct syscall_trace_enter *entry,
|
|
char *buffer, int size, int *user_sizes, int uargs)
|
|
{
|
|
char *buf = buffer;
|
|
void *ptr;
|
|
int val;
|
|
|
|
/*
|
|
* Set the pointer to point to the meta data of the event
|
|
* that has information about the stored user space memory.
|
|
*/
|
|
ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
|
|
|
|
/*
|
|
* The meta data will store the offset of the user data from
|
|
* the beginning of the event. That is after the static arguments
|
|
* and the meta data fields.
|
|
*/
|
|
val = (ptr - (void *)entry) + 4 * uargs;
|
|
|
|
for (int i = 0; i < uargs; i++) {
|
|
|
|
if (i)
|
|
val += user_sizes[i - 1];
|
|
|
|
/* Store the offset and the size into the meta data */
|
|
*(int *)ptr = val | (user_sizes[i] << 16);
|
|
|
|
/* Skip the meta data */
|
|
ptr += 4;
|
|
}
|
|
|
|
for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
|
|
/* Nothing to do if the user space was empty or faulted */
|
|
if (!user_sizes[i])
|
|
continue;
|
|
|
|
memcpy(ptr, buf, user_sizes[i]);
|
|
ptr += user_sizes[i];
|
|
}
|
|
}
|
|
|
|
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
|
|
{
|
|
struct trace_array *tr = data;
|
|
struct trace_event_file *trace_file;
|
|
struct syscall_trace_enter *entry;
|
|
struct syscall_metadata *sys_data;
|
|
struct trace_event_buffer fbuffer;
|
|
unsigned long args[6];
|
|
char *user_ptr;
|
|
int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
|
|
int syscall_nr;
|
|
int size = 0;
|
|
int uargs = 0;
|
|
bool mayfault;
|
|
|
|
/*
|
|
* Syscall probe called with preemption enabled, but the ring
|
|
* buffer and per-cpu data require preemption to be disabled.
|
|
*/
|
|
might_fault();
|
|
|
|
syscall_nr = trace_get_syscall_nr(current, regs);
|
|
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
|
|
return;
|
|
|
|
trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
|
|
if (!trace_file)
|
|
return;
|
|
|
|
if (trace_trigger_soft_disabled(trace_file))
|
|
return;
|
|
|
|
sys_data = syscall_nr_to_meta(syscall_nr);
|
|
if (!sys_data)
|
|
return;
|
|
|
|
/* Check if this syscall event faults in user space memory */
|
|
mayfault = sys_data->user_mask != 0;
|
|
|
|
guard(preempt_notrace)();
|
|
|
|
syscall_get_arguments(current, regs, args);
|
|
|
|
if (mayfault) {
|
|
if (syscall_get_data(sys_data, args, &user_ptr,
|
|
&size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
|
|
return;
|
|
}
|
|
|
|
size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
|
|
|
|
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
|
|
if (!entry)
|
|
return;
|
|
|
|
entry = ring_buffer_event_data(fbuffer.event);
|
|
entry->nr = syscall_nr;
|
|
|
|
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
|
|
|
|
if (mayfault)
|
|
syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);
|
|
|
|
trace_event_buffer_commit(&fbuffer);
|
|
}
|
|
|
|
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
|
|
{
|
|
struct trace_array *tr = data;
|
|
struct trace_event_file *trace_file;
|
|
struct syscall_trace_exit *entry;
|
|
struct syscall_metadata *sys_data;
|
|
struct trace_event_buffer fbuffer;
|
|
int syscall_nr;
|
|
|
|
/*
|
|
* Syscall probe called with preemption enabled, but the ring
|
|
* buffer and per-cpu data require preemption to be disabled.
|
|
*/
|
|
might_fault();
|
|
guard(preempt_notrace)();
|
|
|
|
syscall_nr = trace_get_syscall_nr(current, regs);
|
|
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
|
|
return;
|
|
|
|
trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
|
|
if (!trace_file)
|
|
return;
|
|
|
|
if (trace_trigger_soft_disabled(trace_file))
|
|
return;
|
|
|
|
sys_data = syscall_nr_to_meta(syscall_nr);
|
|
if (!sys_data)
|
|
return;
|
|
|
|
entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
|
|
if (!entry)
|
|
return;
|
|
|
|
entry = ring_buffer_event_data(fbuffer.event);
|
|
entry->nr = syscall_nr;
|
|
entry->ret = syscall_get_return_value(current, regs);
|
|
|
|
trace_event_buffer_commit(&fbuffer);
|
|
}
|
|
|
|
static int reg_event_syscall_enter(struct trace_event_file *file,
|
|
struct trace_event_call *call)
|
|
{
|
|
struct syscall_metadata *sys_data = call->data;
|
|
struct trace_array *tr = file->tr;
|
|
int ret = 0;
|
|
int num;
|
|
|
|
num = sys_data->syscall_nr;
|
|
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
|
|
return -ENOSYS;
|
|
guard(mutex)(&syscall_trace_lock);
|
|
if (sys_data->user_mask) {
|
|
ret = syscall_fault_buffer_enable();
|
|
if (ret < 0)
|
|
return ret;
|
|
}
|
|
if (!tr->sys_refcount_enter) {
|
|
ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
|
|
if (ret < 0) {
|
|
if (sys_data->user_mask)
|
|
syscall_fault_buffer_disable();
|
|
return ret;
|
|
}
|
|
}
|
|
WRITE_ONCE(tr->enter_syscall_files[num], file);
|
|
tr->sys_refcount_enter++;
|
|
return 0;
|
|
}
|
|
|
|
static void unreg_event_syscall_enter(struct trace_event_file *file,
|
|
struct trace_event_call *call)
|
|
{
|
|
struct syscall_metadata *sys_data = call->data;
|
|
struct trace_array *tr = file->tr;
|
|
int num;
|
|
|
|
num = sys_data->syscall_nr;
|
|
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
|
|
return;
|
|
guard(mutex)(&syscall_trace_lock);
|
|
tr->sys_refcount_enter--;
|
|
WRITE_ONCE(tr->enter_syscall_files[num], NULL);
|
|
if (!tr->sys_refcount_enter)
|
|
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
|
|
if (sys_data->user_mask)
|
|
syscall_fault_buffer_disable();
|
|
}
|
|
|
|
static int reg_event_syscall_exit(struct trace_event_file *file,
|
|
struct trace_event_call *call)
|
|
{
|
|
struct trace_array *tr = file->tr;
|
|
int ret = 0;
|
|
int num;
|
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
|
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
|
|
return -ENOSYS;
|
|
mutex_lock(&syscall_trace_lock);
|
|
if (!tr->sys_refcount_exit)
|
|
ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
|
|
if (!ret) {
|
|
WRITE_ONCE(tr->exit_syscall_files[num], file);
|
|
tr->sys_refcount_exit++;
|
|
}
|
|
mutex_unlock(&syscall_trace_lock);
|
|
return ret;
|
|
}
|
|
|
|
static void unreg_event_syscall_exit(struct trace_event_file *file,
|
|
struct trace_event_call *call)
|
|
{
|
|
struct trace_array *tr = file->tr;
|
|
int num;
|
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
|
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
|
|
return;
|
|
mutex_lock(&syscall_trace_lock);
|
|
tr->sys_refcount_exit--;
|
|
WRITE_ONCE(tr->exit_syscall_files[num], NULL);
|
|
if (!tr->sys_refcount_exit)
|
|
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
|
|
mutex_unlock(&syscall_trace_lock);
|
|
}
|
|
|
|
/*
|
|
* For system calls that reference user space memory that can
|
|
* be recorded into the event, set the system call meta data's user_mask
|
|
* to the "args" index that points to the user space memory to retrieve.
|
|
*/
|
|
static void check_faultable_syscall(struct trace_event_call *call, int nr)
|
|
{
|
|
struct syscall_metadata *sys_data = call->data;
|
|
unsigned long mask;
|
|
|
|
/* Only work on entry */
|
|
if (sys_data->enter_event != call)
|
|
return;
|
|
|
|
sys_data->user_arg_size = -1;
|
|
|
|
switch (nr) {
|
|
/* user arg 1 with size arg at 2 */
|
|
case __NR_write:
|
|
case __NR_mq_timedsend:
|
|
case __NR_pwrite64:
|
|
sys_data->user_mask = BIT(1);
|
|
sys_data->user_arg_size = 2;
|
|
break;
|
|
/* user arg 0 with size arg at 1 as string */
|
|
case __NR_setdomainname:
|
|
case __NR_sethostname:
|
|
sys_data->user_mask = BIT(0);
|
|
sys_data->user_arg_size = 1;
|
|
sys_data->user_arg_is_str = 1;
|
|
break;
|
|
#ifdef __NR_kexec_file_load
|
|
/* user arg 4 with size arg at 3 as string */
|
|
case __NR_kexec_file_load:
|
|
sys_data->user_mask = BIT(4);
|
|
sys_data->user_arg_size = 3;
|
|
sys_data->user_arg_is_str = 1;
|
|
break;
|
|
#endif
|
|
/* user arg at position 0 */
|
|
#ifdef __NR_access
|
|
case __NR_access:
|
|
#endif
|
|
case __NR_acct:
|
|
case __NR_chdir:
|
|
#ifdef __NR_chown
|
|
case __NR_chown:
|
|
#endif
|
|
#ifdef __NR_chmod
|
|
case __NR_chmod:
|
|
#endif
|
|
case __NR_chroot:
|
|
#ifdef __NR_creat
|
|
case __NR_creat:
|
|
#endif
|
|
case __NR_delete_module:
|
|
case __NR_execve:
|
|
case __NR_fsopen:
|
|
#ifdef __NR_lchown
|
|
case __NR_lchown:
|
|
#endif
|
|
#ifdef __NR_open
|
|
case __NR_open:
|
|
#endif
|
|
case __NR_memfd_create:
|
|
#ifdef __NR_mkdir
|
|
case __NR_mkdir:
|
|
#endif
|
|
#ifdef __NR_mknod
|
|
case __NR_mknod:
|
|
#endif
|
|
case __NR_mq_open:
|
|
case __NR_mq_unlink:
|
|
#ifdef __NR_readlink
|
|
case __NR_readlink:
|
|
#endif
|
|
#ifdef __NR_rmdir
|
|
case __NR_rmdir:
|
|
#endif
|
|
case __NR_shmdt:
|
|
#ifdef __NR_statfs
|
|
case __NR_statfs:
|
|
#endif
|
|
case __NR_swapon:
|
|
case __NR_swapoff:
|
|
#ifdef __NR_truncate
|
|
case __NR_truncate:
|
|
#endif
|
|
#ifdef __NR_unlink
|
|
case __NR_unlink:
|
|
#endif
|
|
case __NR_umount2:
|
|
#ifdef __NR_utime
|
|
case __NR_utime:
|
|
#endif
|
|
#ifdef __NR_utimes
|
|
case __NR_utimes:
|
|
#endif
|
|
sys_data->user_mask = BIT(0);
|
|
break;
|
|
/* user arg at position 1 */
|
|
case __NR_execveat:
|
|
case __NR_faccessat:
|
|
case __NR_faccessat2:
|
|
case __NR_finit_module:
|
|
case __NR_fchmodat:
|
|
case __NR_fchmodat2:
|
|
case __NR_fchownat:
|
|
case __NR_fgetxattr:
|
|
case __NR_flistxattr:
|
|
case __NR_fsetxattr:
|
|
case __NR_fspick:
|
|
case __NR_fremovexattr:
|
|
#ifdef __NR_futimesat
|
|
case __NR_futimesat:
|
|
#endif
|
|
case __NR_inotify_add_watch:
|
|
case __NR_mkdirat:
|
|
case __NR_mknodat:
|
|
case __NR_mount_setattr:
|
|
case __NR_name_to_handle_at:
|
|
#ifdef __NR_newfstatat
|
|
case __NR_newfstatat:
|
|
#endif
|
|
case __NR_openat:
|
|
case __NR_openat2:
|
|
case __NR_open_tree:
|
|
case __NR_open_tree_attr:
|
|
case __NR_readlinkat:
|
|
case __NR_quotactl:
|
|
case __NR_syslog:
|
|
case __NR_statx:
|
|
case __NR_unlinkat:
|
|
case __NR_utimensat:
|
|
sys_data->user_mask = BIT(1);
|
|
break;
|
|
/* user arg at position 2 */
|
|
case __NR_init_module:
|
|
case __NR_fsconfig:
|
|
sys_data->user_mask = BIT(2);
|
|
break;
|
|
/* user arg at position 4 */
|
|
case __NR_fanotify_mark:
|
|
sys_data->user_mask = BIT(4);
|
|
break;
|
|
/* 2 user args, 0 and 1 */
|
|
case __NR_add_key:
|
|
case __NR_getxattr:
|
|
case __NR_lgetxattr:
|
|
case __NR_lremovexattr:
|
|
#ifdef __NR_link
|
|
case __NR_link:
|
|
#endif
|
|
case __NR_listxattr:
|
|
case __NR_llistxattr:
|
|
case __NR_lsetxattr:
|
|
case __NR_pivot_root:
|
|
case __NR_removexattr:
|
|
#ifdef __NR_rename
|
|
case __NR_rename:
|
|
#endif
|
|
case __NR_request_key:
|
|
case __NR_setxattr:
|
|
#ifdef __NR_symlink
|
|
case __NR_symlink:
|
|
#endif
|
|
sys_data->user_mask = BIT(0) | BIT(1);
|
|
break;
|
|
/* 2 user args, 0 and 2 */
|
|
case __NR_symlinkat:
|
|
sys_data->user_mask = BIT(0) | BIT(2);
|
|
break;
|
|
/* 2 user args, 1 and 3 */
|
|
case __NR_getxattrat:
|
|
case __NR_linkat:
|
|
case __NR_listxattrat:
|
|
case __NR_move_mount:
|
|
#ifdef __NR_renameat
|
|
case __NR_renameat:
|
|
#endif
|
|
case __NR_renameat2:
|
|
case __NR_removexattrat:
|
|
case __NR_setxattrat:
|
|
sys_data->user_mask = BIT(1) | BIT(3);
|
|
break;
|
|
case __NR_mount: /* Just dev_name and dir_name, TODO add type */
|
|
sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
|
|
break;
|
|
default:
|
|
sys_data->user_mask = 0;
|
|
return;
|
|
}
|
|
|
|
if (sys_data->user_arg_size < 0)
|
|
return;
|
|
|
|
/*
|
|
* The user_arg_size can only be used when the system call
|
|
* is reading only a single address from user space.
|
|
*/
|
|
mask = sys_data->user_mask;
|
|
if (WARN_ON(mask & (mask - 1)))
|
|
sys_data->user_arg_size = -1;
|
|
}
|
|
|
|
/*
 * ->raw_init hook shared by the enter and exit event classes.
 *
 * Sets up the user space read information and the print format of @call,
 * then hands it to trace_event_raw_init().
 *
 * Returns the value of trace_event_raw_init(), -ENOSYS if the syscall has
 * no valid number, or -ENOMEM if the print format cannot be set up.
 */
static int __init init_syscall_trace(struct trace_event_call *call)
{
	struct syscall_metadata *meta = call->data;
	int nr = meta->syscall_nr;
	int id;

	if (nr < 0 || nr >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
			 meta->name);
		return -ENOSYS;
	}

	check_faultable_syscall(call, nr);

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);
	if (id < 0)
		free_syscall_print_fmt(call);

	return id;
}
|
|
|
|
/*
 * Fields of the enter events: the syscall number is described statically,
 * the remaining fields are defined by syscall_enter_define_fields().
 */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{
		.type		= TRACE_FUNCTION_TYPE,
		.define_fields	= syscall_enter_define_fields,
	},
	{}
};
|
|
|
|
struct trace_event_functions enter_syscall_print_funcs = {
|
|
.trace = print_syscall_enter,
|
|
};
|
|
|
|
struct trace_event_functions exit_syscall_print_funcs = {
|
|
.trace = print_syscall_exit,
|
|
};
|
|
|
|
struct trace_event_class __refdata event_class_syscall_enter = {
|
|
.system = "syscalls",
|
|
.reg = syscall_enter_register,
|
|
.fields_array = syscall_enter_fields_array,
|
|
.get_fields = syscall_get_enter_fields,
|
|
.raw_init = init_syscall_trace,
|
|
};
|
|
|
|
struct trace_event_class __refdata event_class_syscall_exit = {
|
|
.system = "syscalls",
|
|
.reg = syscall_exit_register,
|
|
.fields_array = (struct trace_event_fields[]){
|
|
SYSCALL_FIELD(int, __syscall_nr),
|
|
SYSCALL_FIELD(long, ret),
|
|
{}
|
|
},
|
|
.fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
|
|
.raw_init = init_syscall_trace,
|
|
};
|
|
|
|
/*
 * Default (weak) lookup of the handler address of syscall @nr: the entry
 * of sys_call_table at that index. @nr is not range checked here.
 */
unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)(sys_call_table[nr]);
}
|
|
|
|
/*
 * Build the syscall number -> metadata lookup.
 *
 * For every syscall number below NR_syscalls, the handler address is
 * matched to a metadata entry. The number is recorded in the entry and the
 * entry is stored in the flat table, or in the xarray when
 * CONFIG_HAVE_SPARSE_SYSCALL_NR is enabled. Syscalls without a matching
 * entry are skipped.
 */
void __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	void *old;
	int nr;

	if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
		syscalls_metadata = kcalloc(NR_syscalls,
					    sizeof(*syscalls_metadata),
					    GFP_KERNEL);
		if (!syscalls_metadata) {
			WARN_ON(1);
			return;
		}
	}

	for (nr = 0; nr < NR_syscalls; nr++) {
		meta = find_syscall_meta(arch_syscall_addr(nr));
		if (!meta)
			continue;

		meta->syscall_nr = nr;

		if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
			old = xa_store(&syscalls_metadata_sparse, nr, meta,
				       GFP_KERNEL);
			WARN(xa_is_err(old),
			     "Syscall memory allocation failed\n");
		} else {
			syscalls_metadata[nr] = meta;
		}
	}
}
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
|
|
/* One bit per syscall number; the perf enter probe tests its bit */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);

/* NOTE(review): presumably counts of enabled perf syscall events - confirm */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
|
|
|
|
static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
|
|
struct syscall_metadata *sys_data,
|
|
struct syscall_trace_enter *rec)
|
|
{
|
|
struct syscall_tp_t {
|
|
struct trace_entry ent;
|
|
int syscall_nr;
|
|
unsigned long args[SYSCALL_DEFINE_MAXARGS];
|
|
} __aligned(8) param;
|
|
int i;
|
|
|
|
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
|
|
|
|
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */
|
|
perf_fetch_caller_regs(regs);
|
|
*(struct pt_regs **)¶m = regs;
|
|
param.syscall_nr = rec->nr;
|
|
for (i = 0; i < sys_data->nb_args; i++)
|
|
param.args[i] = rec->args[i];
|
|
return trace_call_bpf(call, ¶m);
|
|
}
|
|
|
|
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
|
|
{
|
|
struct syscall_metadata *sys_data;
|
|
struct syscall_trace_enter *rec;
|
|
struct pt_regs *fake_regs;
|
|
struct hlist_head *head;
|
|
unsigned long args[6];
|
|
bool valid_prog_array;
|
|
bool mayfault;
|
|
char *user_ptr;
|
|
int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
|
|
int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
|
|
int syscall_nr;
|
|
int rctx;
|
|
int size = 0;
|
|
int uargs = 0;
|
|
|
|
/*
|
|
* Syscall probe called with preemption enabled, but the ring
|
|
* buffer and per-cpu data require preemption to be disabled.
|
|
*/
|
|
might_fault();
|
|
guard(preempt_notrace)();
|
|
|
|
syscall_nr = trace_get_syscall_nr(current, regs);
|
|
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
|
|
return;
|
|
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
|
|
return;
|
|
|
|
sys_data = syscall_nr_to_meta(syscall_nr);
|
|
if (!sys_data)
|
|
return;
|
|
|
|
syscall_get_arguments(current, regs, args);
|
|
|
|
/* Check if this syscall event faults in user space memory */
|
|
mayfault = sys_data->user_mask != 0;
|
|
|
|
if (mayfault) {
|
|
if (syscall_get_data(sys_data, args, &user_ptr,
|
|
&size, user_sizes, &uargs, buf_size) < 0)
|
|
return;
|
|
}
|
|
|
|
head = this_cpu_ptr(sys_data->enter_event->perf_events);
|
|
valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
|
|
if (!valid_prog_array && hlist_empty(head))
|
|
return;
|
|
|
|
/* get the size after alignment with the u32 buffer size field */
|
|
size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
|
|
size = ALIGN(size + sizeof(u32), sizeof(u64));
|
|
size -= sizeof(u32);
|
|
|
|
rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
|
|
if (!rec)
|
|
return;
|
|
|
|
rec->nr = syscall_nr;
|
|
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
|
|
|
|
if (mayfault)
|
|
syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
|
|
|
|
if ((valid_prog_array &&
|
|
!perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
|
|
hlist_empty(head)) {
|
|
perf_swevent_put_recursion_context(rctx);
|
|
return;
|
|
}
|
|
|
|
perf_trace_buf_submit(rec, size, rctx,
|
|
sys_data->enter_event->event.type, 1, regs,
|
|
head, NULL);
|
|
}
|
|
|
|
static int perf_sysenter_enable(struct trace_event_call *call)
|
|
{
|
|
struct syscall_metadata *sys_data = call->data;
|
|
int num;
|
|
int ret;
|
|
|
|
num = sys_data->syscall_nr;
|
|
|
|
guard(mutex)(&syscall_trace_lock);
|
|
if (sys_data->user_mask) {
|
|
ret = syscall_fault_buffer_enable();
|
|
if (ret < 0)
|
|
return ret;
|
|
}
|
|
if (!sys_perf_refcount_enter) {
|
|
ret = register_trace_sys_enter(perf_syscall_enter, NULL);
|
|
if (ret) {
|
|
pr_info("event trace: Could not activate syscall entry trace point");
|
|
if (sys_data->user_mask)
|
|
syscall_fault_buffer_disable();
|
|
return ret;
|
|
}
|
|
}
|
|
set_bit(num, enabled_perf_enter_syscalls);
|
|
sys_perf_refcount_enter++;
|
|
return 0;
|
|
}
|
|
|
|
static void perf_sysenter_disable(struct trace_event_call *call)
|
|
{
|
|
struct syscall_metadata *sys_data = call->data;
|
|
int num;
|
|
|
|
num = sys_data->syscall_nr;
|
|
|
|
guard(mutex)(&syscall_trace_lock);
|
|
sys_perf_refcount_enter--;
|
|
clear_bit(num, enabled_perf_enter_syscalls);
|
|
if (!sys_perf_refcount_enter)
|
|
unregister_trace_sys_enter(perf_syscall_enter, NULL);
|
|
if (sys_data->user_mask)
|
|
syscall_fault_buffer_disable();
|
|
}
|
|
|
|
static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
|
|
struct syscall_trace_exit *rec)
|
|
{
|
|
struct syscall_tp_t {
|
|
struct trace_entry ent;
|
|
int syscall_nr;
|
|
unsigned long ret;
|
|
} __aligned(8) param;
|
|
|
|
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */
|
|
perf_fetch_caller_regs(regs);
|
|
*(struct pt_regs **)¶m = regs;
|
|
param.syscall_nr = rec->nr;
|
|
param.ret = rec->ret;
|
|
return trace_call_bpf(call, ¶m);
|
|
}
|
|
|
|
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
|
|
{
|
|
struct syscall_metadata *sys_data;
|
|
struct syscall_trace_exit *rec;
|
|
struct pt_regs *fake_regs;
|
|
struct hlist_head *head;
|
|
bool valid_prog_array;
|
|
int syscall_nr;
|
|
int rctx;
|
|
int size;
|
|
|
|
/*
|
|
* Syscall probe called with preemption enabled, but the ring
|
|
* buffer and per-cpu data require preemption to be disabled.
|
|
*/
|
|
might_fault();
|
|
guard(preempt_notrace)();
|
|
|
|
syscall_nr = trace_get_syscall_nr(current, regs);
|
|
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
|
|
return;
|
|
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
|
|
return;
|
|
|
|
sys_data = syscall_nr_to_meta(syscall_nr);
|
|
if (!sys_data)
|
|
return;
|
|
|
|
head = this_cpu_ptr(sys_data->exit_event->perf_events);
|
|
valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
|
|
if (!valid_prog_array && hlist_empty(head))
|
|
return;
|
|
|
|
/* We can probably do that at build time */
|
|
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
|
|
size -= sizeof(u32);
|
|
|
|
rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
|
|
if (!rec)
|
|
return;
|
|
|
|
rec->nr = syscall_nr;
|
|
rec->ret = syscall_get_return_value(current, regs);
|
|
|
|
if ((valid_prog_array &&
|
|
!perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
|
|
hlist_empty(head)) {
|
|
perf_swevent_put_recursion_context(rctx);
|
|
return;
|
|
}
|
|
|
|
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
|
|
1, regs, head, NULL);
|
|
}
|
|
|
|
static int perf_sysexit_enable(struct trace_event_call *call)
|
|
{
|
|
int num;
|
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
|
|
|
guard(mutex)(&syscall_trace_lock);
|
|
if (!sys_perf_refcount_exit) {
|
|
int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
|
|
if (ret) {
|
|
pr_info("event trace: Could not activate syscall exit trace point");
|
|
return ret;
|
|
}
|
|
}
|
|
set_bit(num, enabled_perf_exit_syscalls);
|
|
sys_perf_refcount_exit++;
|
|
return 0;
|
|
}
|
|
|
|
static void perf_sysexit_disable(struct trace_event_call *call)
|
|
{
|
|
int num;
|
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
|
|
|
guard(mutex)(&syscall_trace_lock);
|
|
sys_perf_refcount_exit--;
|
|
clear_bit(num, enabled_perf_exit_syscalls);
|
|
if (!sys_perf_refcount_exit)
|
|
unregister_trace_sys_exit(perf_syscall_exit, NULL);
|
|
}
|
|
|
|
#endif /* CONFIG_PERF_EVENTS */
|
|
|
|
static int syscall_enter_register(struct trace_event_call *event,
|
|
enum trace_reg type, void *data)
|
|
{
|
|
struct trace_event_file *file = data;
|
|
|
|
switch (type) {
|
|
case TRACE_REG_REGISTER:
|
|
return reg_event_syscall_enter(file, event);
|
|
case TRACE_REG_UNREGISTER:
|
|
unreg_event_syscall_enter(file, event);
|
|
return 0;
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
case TRACE_REG_PERF_REGISTER:
|
|
return perf_sysenter_enable(event);
|
|
case TRACE_REG_PERF_UNREGISTER:
|
|
perf_sysenter_disable(event);
|
|
return 0;
|
|
case TRACE_REG_PERF_OPEN:
|
|
case TRACE_REG_PERF_CLOSE:
|
|
case TRACE_REG_PERF_ADD:
|
|
case TRACE_REG_PERF_DEL:
|
|
return 0;
|
|
#endif
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int syscall_exit_register(struct trace_event_call *event,
|
|
enum trace_reg type, void *data)
|
|
{
|
|
struct trace_event_file *file = data;
|
|
|
|
switch (type) {
|
|
case TRACE_REG_REGISTER:
|
|
return reg_event_syscall_exit(file, event);
|
|
case TRACE_REG_UNREGISTER:
|
|
unreg_event_syscall_exit(file, event);
|
|
return 0;
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
case TRACE_REG_PERF_REGISTER:
|
|
return perf_sysexit_enable(event);
|
|
case TRACE_REG_PERF_UNREGISTER:
|
|
perf_sysexit_disable(event);
|
|
return 0;
|
|
case TRACE_REG_PERF_OPEN:
|
|
case TRACE_REG_PERF_CLOSE:
|
|
case TRACE_REG_PERF_ADD:
|
|
case TRACE_REG_PERF_DEL:
|
|
return 0;
|
|
#endif
|
|
}
|
|
return 0;
|
|
}
|