From c6269149cbf7053272d918101981869438ff7c1e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Jun 2024 16:11:39 +0200 Subject: [PATCH 1/5] file: add take_fd() cleanup helper Add a helper that returns the file descriptor and ensures that the old variable contains a negative value. This makes it easy to rely on CLASS(get_unused_fd). Link: https://lore.kernel.org/r/20240627-work-pidfs-v1-1-7e9ab6cc3bb1@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Josef Bacik Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/cleanup.h | 13 ++++++++----- include/linux/file.h | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index c2d09bc4f976..80c4181e194a 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -63,17 +63,20 @@ #define __free(_name) __cleanup(__free_##_name) -#define __get_and_null_ptr(p) \ - ({ __auto_type __ptr = &(p); \ - __auto_type __val = *__ptr; \ - *__ptr = NULL; __val; }) +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) static inline __must_check const volatile void * __must_check_fn(const volatile void *val) { return val; } #define no_free_ptr(p) \ - ((typeof(p)) __must_check_fn(__get_and_null_ptr(p))) + ((typeof(p)) __must_check_fn(__get_and_null(p, NULL))) #define return_ptr(p) return no_free_ptr(p) diff --git a/include/linux/file.h b/include/linux/file.h index 45d0f4800abd..237931f20739 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -97,6 +97,26 @@ extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) +/* + * take_fd() will take care to set @fd to -EBADF ensuring that + * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it + * easier to rely on CLASS(get_unused_fd): + * + * struct file *f; + * + * CLASS(get_unused_fd, fd)(O_CLOEXEC); + * if (fd < 0) + * return fd; + * + * f = dentry_open(&path, O_RDONLY, current_cred()); + * if (IS_ERR(f)) + * return PTR_ERR(fd); + * + * fd_install(fd, f); + * return take_fd(fd); + */ +#define take_fd(fd) __get_and_null(fd, -EBADF) + extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); From d057c108155ab8d02b603c7ee5da7df615c2f32f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Jun 2024 16:11:40 +0200 Subject: [PATCH 2/5] nsproxy: add a cleanup helper for nsproxy Add a simple cleanup helper for nsproxy. Link: https://lore.kernel.org/r/20240627-work-pidfs-v1-2-7e9ab6cc3bb1@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Josef Bacik Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/nsproxy.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 5601d14e2886..e6bec522b139 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -112,4 +112,6 @@ static inline void get_nsproxy(struct nsproxy *ns) refcount_inc(&ns->count); } +DEFINE_FREE(put_nsproxy, struct nsproxy *, if (_T) put_nsproxy(_T)) + #endif From 85e4daaeb70bc68ec920e0c268eb428113d31b21 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Jun 2024 10:05:19 +0200 Subject: [PATCH 3/5] nsproxy: add helper to go from arbitrary namespace to ns_common They all contains struct ns_common ns and if there ever is one where that isn't the case we'll catch it here at build time. Signed-off-by: Christian Brauner --- include/linux/nsproxy.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index e6bec522b139..dab6a1734a22 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,6 +42,17 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns->ns), \ + struct ipc_namespace *: &(__ns->ns), \ + struct net *: &(__ns->ns), \ + struct pid_namespace *: &(__ns->ns), \ + struct mnt_namespace *: &(__ns->ns), \ + struct time_namespace *: &(__ns->ns), \ + struct user_namespace *: &(__ns->ns), \ + struct uts_namespace *: &(__ns->ns)) + /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. From 460695a294e63de4e4f30ad1679ffddeec932bb8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Jun 2024 16:11:41 +0200 Subject: [PATCH 4/5] nsfs: add open_namespace() and call it from open_related_ns(). Link: https://lore.kernel.org/r/20240627-work-pidfs-v1-3-7e9ab6cc3bb1@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Josef Bacik Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/internal.h | 2 ++ fs/nsfs.c | 57 +++++++++++++++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index ab2225136f60..24346cf765dd 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -17,6 +17,7 @@ struct fs_context; struct pipe_inode_info; struct iov_iter; struct mnt_idmap; +struct ns_common; /* * block/bdev.c @@ -321,3 +322,4 @@ struct stashed_operations { int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +int open_namespace(struct ns_common *ns); diff --git a/fs/nsfs.c b/fs/nsfs.c index 07e22a15ef02..4c4dbdce7601 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -82,40 +82,47 @@ int ns_get_path(struct path *path, struct task_struct *task, return ns_get_path_cb(path, ns_get_path_task, &args); } -int open_related_ns(struct ns_common *ns, - struct ns_common *(*get_ns)(struct ns_common *ns)) +/** + * open_namespace - open a namespace + * @ns: the namespace to open + * + * This will consume a reference to @ns indendent of success or failure. + * + * Return: A file descriptor on success or a negative error code on failure. + */ +int open_namespace(struct ns_common *ns) { - struct path path = {}; - struct ns_common *relative; + struct path path __free(path_put) = {}; struct file *f; int err; - int fd; - fd = get_unused_fd_flags(O_CLOEXEC); + /* call first to consume reference */ + err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (err < 0) + return err; + + CLASS(get_unused_fd, fd)(O_CLOEXEC); if (fd < 0) return fd; - relative = get_ns(ns); - if (IS_ERR(relative)) { - put_unused_fd(fd); - return PTR_ERR(relative); - } - - err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path); - if (err < 0) { - put_unused_fd(fd); - return err; - } - f = dentry_open(&path, O_RDONLY, current_cred()); - path_put(&path); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else - fd_install(fd, f); + if (IS_ERR(f)) + return PTR_ERR(f); - return fd; + fd_install(fd, f); + return take_fd(fd); +} + +int open_related_ns(struct ns_common *ns, + struct ns_common *(*get_ns)(struct ns_common *ns)) +{ + struct ns_common *relative; + + relative = get_ns(ns); + if (IS_ERR(relative)) + return PTR_ERR(relative); + + return open_namespace(relative); } EXPORT_SYMBOL_GPL(open_related_ns); From 5b08bd408534bfb3a7cf5778da5b27d4e4fffe12 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Jun 2024 16:11:42 +0200 Subject: [PATCH 5/5] pidfs: allow retrieval of namespace file descriptors For users that hold a reference to a pidfd procfs might not even be available nor is it desirable to parse through procfs just for the sake of getting namespace file descriptors for a process. Make it possible to directly retrieve namespace file descriptors from a pidfd. Pidfds already can be used with setns() to change a set of namespaces atomically. Link: https://lore.kernel.org/r/20240627-work-pidfs-v1-4-7e9ab6cc3bb1@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Josef Bacik Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/pidfs.c | 90 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/pidfd.h | 14 ++++++ 2 files changed, 104 insertions(+) diff --git a/fs/pidfs.c b/fs/pidfs.c index dbb9d854d1c5..c9cb14181def 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -11,10 +11,16 @@ #include #include #include +#include #include #include +#include +#include +#include +#include #include "internal.h" +#include "mount.h" #ifdef CONFIG_PROC_FS /** @@ -108,11 +114,95 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) return poll_flags; } +static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct task_struct *task __free(put_task) = NULL; + struct nsproxy *nsp __free(put_nsproxy) = NULL; + struct pid *pid = pidfd_pid(file); + struct ns_common *ns_common; + + if (arg) + return -EINVAL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + scoped_guard(task_lock, task) { + nsp = task->nsproxy; + if (nsp) + get_nsproxy(nsp); + } + if (!nsp) + return -ESRCH; /* just pretend it didn't exist */ + + /* + * We're trying to open a file descriptor to the namespace so perform a + * filesystem cred ptrace check. Also, we mirror nsfs behavior. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + return -EACCES; + + switch (cmd) { + /* Namespaces that hang of nsproxy. */ + case PIDFD_GET_CGROUP_NAMESPACE: + get_cgroup_ns(nsp->cgroup_ns); + ns_common = to_ns_common(nsp->cgroup_ns); + break; + case PIDFD_GET_IPC_NAMESPACE: + get_ipc_ns(nsp->ipc_ns); + ns_common = to_ns_common(nsp->ipc_ns); + break; + case PIDFD_GET_MNT_NAMESPACE: + get_mnt_ns(nsp->mnt_ns); + ns_common = to_ns_common(nsp->mnt_ns); + break; + case PIDFD_GET_NET_NAMESPACE: + ns_common = to_ns_common(nsp->net_ns); + get_net_ns(ns_common); + break; + case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: + get_pid_ns(nsp->pid_ns_for_children); + ns_common = to_ns_common(nsp->pid_ns_for_children); + break; + case PIDFD_GET_TIME_NAMESPACE: + get_time_ns(nsp->time_ns); + ns_common = to_ns_common(nsp->time_ns); + break; + case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: + get_time_ns(nsp->time_ns_for_children); + ns_common = to_ns_common(nsp->time_ns_for_children); + break; + case PIDFD_GET_UTS_NAMESPACE: + get_uts_ns(nsp->uts_ns); + ns_common = to_ns_common(nsp->uts_ns); + break; + /* Namespaces that don't hang of nsproxy. */ + case PIDFD_GET_USER_NAMESPACE: + rcu_read_lock(); + ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); + rcu_read_unlock(); + break; + case PIDFD_GET_PID_NAMESPACE: + rcu_read_lock(); + ns_common = to_ns_common(get_pid_ns(task_active_pid_ns(task))); + rcu_read_unlock(); + break; + default: + return -ENOIOCTLCMD; + } + + /* open_namespace() unconditionally consumes the reference */ + return open_namespace(ns_common); +} + static const struct file_operations pidfs_file_operations = { .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif + .unlocked_ioctl = pidfd_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; struct pid *pidfd_pid(const struct file *file) diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 72ec000a97cd..565fc0629fff 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -5,6 +5,7 @@ #include #include +#include /* Flags for pidfd_open(). */ #define PIDFD_NONBLOCK O_NONBLOCK @@ -15,4 +16,17 @@ #define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1) #define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2) +#define PIDFS_IOCTL_MAGIC 0xFF + +#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1) +#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2) +#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3) +#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4) +#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5) +#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6) +#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7) +#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8) +#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9) +#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10) + #endif /* _UAPI_LINUX_PIDFD_H */