mirror of https://github.com/torvalds/linux.git
182 lines
4.5 KiB
C
182 lines
4.5 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
|
|
*/
|
|
|
|
#include <sysdep/stub.h>
|
|
|
|
#include <linux/futex.h>
|
|
#include <sys/socket.h>
|
|
#include <errno.h>
|
|
|
|
/*
|
|
* Known security issues
|
|
*
|
|
* Userspace can jump to this address to execute *any* syscall that is
|
|
* permitted by the stub. As we will return afterwards, it can do
|
|
* whatever it likes, including:
|
|
* - Tricking the kernel into handing out the memory FD
|
|
* - Using this memory FD to read/write all physical memory
|
|
* - Running in parallel to the kernel processing a syscall
|
|
* (possibly creating data races?)
|
|
* - Blocking e.g. SIGALRM to avoid time based scheduling
|
|
*
|
|
* To avoid this, the permitted location for each syscall needs to be
|
|
* checked for in the SECCOMP filter (which is reasonably simple). Also,
|
|
* more care will need to go into considering how the code might be
|
|
* tricked by using a prepared stack (or even modifying the stack from
|
|
* another thread in case SMP support is added).
|
|
*
|
|
* As for the SIGALRM, the best countermeasure will be to check in the
|
|
* kernel that the process is reporting back the SIGALRM in a timely
|
|
* fashion.
|
|
*/
|
|
static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
|
|
{
|
|
struct stub_data *d = get_stub_data();
|
|
int i;
|
|
unsigned long res;
|
|
int fd;
|
|
|
|
for (i = 0; i < d->syscall_data_len; i++) {
|
|
struct stub_syscall *sc = &d->syscall_data[i];
|
|
|
|
switch (sc->syscall) {
|
|
case STUB_SYSCALL_MMAP:
|
|
if (fd_map)
|
|
fd = fd_map[sc->mem.fd];
|
|
else
|
|
fd = sc->mem.fd;
|
|
|
|
res = stub_syscall6(STUB_MMAP_NR,
|
|
sc->mem.addr, sc->mem.length,
|
|
sc->mem.prot,
|
|
MAP_SHARED | MAP_FIXED,
|
|
fd, sc->mem.offset);
|
|
if (res != sc->mem.addr) {
|
|
d->err = res;
|
|
d->syscall_data_len = i;
|
|
return -1;
|
|
}
|
|
break;
|
|
case STUB_SYSCALL_MUNMAP:
|
|
res = stub_syscall2(__NR_munmap,
|
|
sc->mem.addr, sc->mem.length);
|
|
if (res) {
|
|
d->err = res;
|
|
d->syscall_data_len = i;
|
|
return -1;
|
|
}
|
|
break;
|
|
default:
|
|
d->err = -95; /* EOPNOTSUPP */
|
|
d->syscall_data_len = i;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
d->err = 0;
|
|
d->syscall_data_len = 0;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Entry point placed in the .__syscall_stub section.
 *
 * Runs the queued requests with no FD lookup table (the fds stored in the
 * requests are used as-is). The return value is deliberately ignored: the
 * outcome is reported through the err and syscall_data_len fields of the
 * stub data. Afterwards trap_myself() is invoked (presumably to stop and
 * hand control back to whoever drives the stub - confirm against its
 * definition).
 */
void __section(".__syscall_stub")
stub_syscall_handler(void)
{
	syscall_handler(NULL);

	trap_myself();
}
|
|
|
|
/*
 * Signal handler placed in the .__syscall_stub section.
 *
 * @sig:  signal number, published in the stub data.
 * @info: siginfo pointer; its offset from the start of the stub data's
 *        sigstack is published as si_offset.
 * @p:    ucontext pointer; the offset of its uc_mcontext from the start of
 *        the sigstack is published as mctx_offset.
 *
 * After publishing the above, the handler sets the futex word to
 * FUTEX_IN_KERN, wakes one waiter and then waits until the word changes.
 * Once woken, any queued requests are executed; file descriptors for them
 * may arrive as control data on fd 0 and are closed again after the
 * requests have run. If running the requests failed, or restart_wait is
 * set, SIGSYS is reported and the wake/wait cycle is repeated. Otherwise
 * the arch specific state is restored and the handler returns.
 *
 * Unexpected futex or recvmsg errors terminate the process via exit_group.
 */
void __section(".__syscall_stub")
stub_signal_interrupt(int sig, siginfo_t *info, void *p)
{
	struct stub_data *d = get_stub_data();
	char rcv_data;
	/* The union forces cmsghdr alignment on the control buffer. */
	union {
		char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
		struct cmsghdr align;
	} ctrl = {};
	struct iovec iov = {
		.iov_base = &rcv_data,
		.iov_len = 1,
	};
	struct msghdr msghdr = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = &ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	ucontext_t *uc = p;
	struct cmsghdr *fd_msg;
	int *fd_map;
	int num_fds;
	long res;

	d->signal = sig;
	d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0];
	d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0];

restart_wait:
	d->futex = FUTEX_IN_KERN;
	do {
		res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
				    FUTEX_WAKE, 1);
	} while (res == -EINTR);

	/* Keep waiting until the futex word has actually been changed. */
	do {
		res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
				    FUTEX_WAIT, FUTEX_IN_KERN, 0);
	} while (res == -EINTR || d->futex == FUTEX_IN_KERN);

	if (res < 0 && res != -EAGAIN)
		stub_syscall1(__NR_exit_group, 1);

	if (d->syscall_data_len) {
		/*
		 * A successful recvmsg() overwrites msg_controllen with the
		 * length of the control data it returned. This code can be
		 * reached several times through restart_wait, so restore the
		 * full buffer size first; otherwise a later, larger batch of
		 * FDs would be truncated to the size of an earlier one.
		 */
		msghdr.msg_controllen = sizeof(ctrl);

		/* Read passed FDs (if any) */
		do {
			res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
		} while (res == -EINTR);

		/* We should never have a receive error (other than -EAGAIN) */
		if (res < 0 && res != -EAGAIN)
			stub_syscall1(__NR_exit_group, 1);

		/* Receive the FDs */
		num_fds = 0;
		fd_msg = msghdr.msg_control;
		fd_map = (void *)&CMSG_DATA(fd_msg);
		/* Only trust the control data if the one-byte payload arrived. */
		if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
			num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);

		/* Try running queued syscalls. */
		res = syscall_handler(fd_map);

		/* The received FDs are only needed while the requests run. */
		while (num_fds)
			stub_syscall2(__NR_close, fd_map[--num_fds], 0);
	} else {
		res = 0;
	}

	if (res < 0 || d->restart_wait) {
		/* Report SIGSYS if we restart. */
		d->signal = SIGSYS;
		d->restart_wait = 0;

		goto restart_wait;
	}

	/* Restore arch dependent state that is not part of the mcontext */
	stub_seccomp_restore_state(&d->arch_data);

	/* Return so that the host modified mcontext is restored. */
}
|
|
|
|
/*
 * Placed in the .__syscall_stub section; issues rt_sigreturn and nothing
 * else.
 *
 * Nothing may be on the stack when rt_sigreturn is invoked, so this
 * function must stay free of locals and of any other code.
 */
void __section(".__syscall_stub")
stub_signal_restorer(void)
{
	stub_syscall0(__NR_rt_sigreturn);
}
|