diff --git a/fs/nsfs.c b/fs/nsfs.c index 8b53fd361177..0c35e4e54711 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -58,6 +58,8 @@ const struct dentry_operations ns_dentry_operations = { static void nsfs_evict(struct inode *inode) { struct ns_common *ns = inode->i_private; + + __ns_ref_active_put(ns); clear_inode(inode); ns->ops->put(ns); } @@ -419,6 +421,16 @@ static int nsfs_init_inode(struct inode *inode, void *data) inode->i_mode |= S_IRUGO; inode->i_fop = &ns_file_operations; inode->i_ino = ns->inum; + + /* + * Bring the namespace subtree back to life if we have to. This + * can happen when e.g., all processes using a network namespace + * and all namespace files or namespace file bind-mounts have + * died but there are still sockets pinning it. The SIOCGSKNS + * ioctl on such a socket will resurrect the relevant namespace + * subtree. + */ + __ns_ref_active_resurrect(ns); return 0; } @@ -495,7 +507,17 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, if (ns->inum != fid->ns_inum) return NULL; - if (!__ns_ref_get(ns)) + /* + * This is racy because we're not actually taking an + * active reference. IOW, it could happen that the + * namespace becomes inactive after this check. + * We don't care because nsfs_init_inode() will just + * resurrect the relevant namespace tree for us. If it + * has been active here we just allow it's resurrection. + * We could try to take an active reference here and + * then drop it again. But really, why bother. + */ + if (!ns_get_unless_inactive(ns)) return NULL; } @@ -615,3 +637,27 @@ void __init nsfs_init(void) nsfs_root_path.mnt = nsfs_mnt; nsfs_root_path.dentry = nsfs_mnt->mnt_root; } + +void nsproxy_ns_active_get(struct nsproxy *ns) +{ + ns_ref_active_get(ns->mnt_ns); + ns_ref_active_get(ns->uts_ns); + ns_ref_active_get(ns->ipc_ns); + ns_ref_active_get(ns->pid_ns_for_children); + ns_ref_active_get(ns->cgroup_ns); + ns_ref_active_get(ns->net_ns); + ns_ref_active_get(ns->time_ns); + ns_ref_active_get(ns->time_ns_for_children); +} + +void nsproxy_ns_active_put(struct nsproxy *ns) +{ + ns_ref_active_put(ns->mnt_ns); + ns_ref_active_put(ns->uts_ns); + ns_ref_active_put(ns->ipc_ns); + ns_ref_active_put(ns->pid_ns_for_children); + ns_ref_active_put(ns->cgroup_ns); + ns_ref_active_put(ns->net_ns); + ns_ref_active_put(ns->time_ns); + ns_ref_active_put(ns->time_ns_for_children); +} diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 5e09facafd93..bdd0df15ad9c 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,7 +4,9 @@ #include #include +#include #include +#include struct proc_ns_operations; @@ -37,6 +39,67 @@ extern const struct proc_ns_operations cgroupns_operations; extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; +/* + * Namespace lifetimes are managed via a two-tier reference counting model: + * + * (1) __ns_ref (refcount_t): Main reference count tracking memory + * lifetime. Controls when the namespace structure itself is freed. + * It also pins the namespace on the namespace trees whereas (2) + * only regulates their visibility to userspace. + * + * (2) __ns_ref_active (atomic_t): Reference count tracking active users. + * Controls visibility of the namespace in the namespace trees. + * Any live task that uses the namespace (via nsproxy or cred) holds + * an active reference. Any open file descriptor or bind-mount of + * the namespace holds an active reference. Once all tasks have + * called exited their namespaces and all file descriptors and + * bind-mounts have been released the active reference count drops + * to zero and the namespace becomes inactive. IOW, the namespace + * cannot be listed or opened via file handles anymore. + * + * Note that it is valid to transition from active to inactive and + * back from inactive to active e.g., when resurrecting an inactive + * namespace tree via the SIOCGSKNS ioctl(). + * + * Relationship and lifecycle states: + * + * - Active (__ns_ref_active > 0): + * Namespace is actively used and visible to userspace. The namespace + * can be reopened via /proc//ns/, via namespace file + * handles, or discovered via listns(). + * + * - Inactive (__ns_ref_active == 0, __ns_ref > 0): + * No tasks are actively using the namespace and it isn't pinned by + * any bind-mounts or open file descriptors anymore. But the namespace + * is still kept alive by internal references. For example, the user + * namespace could be pinned by an open file through file->f_cred + * references when one of the now defunct tasks had opened a file and + * handed the file descriptor off to another process via a UNIX + * sockets. Such references keep the namespace structure alive through + * __ns_ref but will not hold an active reference. + * + * - Destroyed (__ns_ref == 0): + * No references remain. The namespace is removed from the tree and freed. + * + * State transitions: + * + * Active -> Inactive: + * When the last task using the namespace exits it drops its active + * references to all namespaces. However, user and pid namespaces + * remain accessible until the task has been reaped. + * + * Inactive -> Active: + * An inactive namespace tree might be resurrected due to e.g., the + * SIOCGSKNS ioctl() on a socket. + * + * Inactive -> Destroyed: + * When __ns_ref drops to zero the namespace is removed from the + * namespaces trees and the memory is freed (after RCU grace period). + * + * Initial namespaces: + * Boot-time namespaces (init_net, init_pid_ns, etc.) start with + * __ns_ref_active = 1 and remain active forever. + */ struct ns_common { u32 ns_type; struct dentry *stashed; @@ -48,6 +111,7 @@ struct ns_common { u64 ns_id; struct rb_node ns_tree_node; struct list_head ns_list_node; + atomic_t __ns_ref_active; /* do not use directly */ }; struct rcu_head ns_rcu; }; @@ -56,6 +120,13 @@ struct ns_common { int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); +static __always_inline bool is_initial_namespace(struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->inum == 0); + return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, + IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); +} + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ @@ -127,6 +198,7 @@ void __ns_common_free(struct ns_common *ns); .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ .__ns_ref = REFCOUNT_INIT(refs), \ + .__ns_ref_active = ATOMIC_INIT(1), \ .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ } @@ -144,14 +216,26 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) +{ + return atomic_read(&ns->__ns_ref_active); +} + static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->__ns_ref); + if (refcount_dec_and_test(&ns->__ns_ref)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return true; + } + return false; } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->__ns_ref); + if (refcount_inc_not_zero(&ns->__ns_ref)) + return true; + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return false; } static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) @@ -166,4 +250,57 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns #define ns_ref_put_and_lock(__ns, __lock) \ refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) +#define ns_ref_active_read(__ns) \ + ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) + +void __ns_ref_active_get_owner(struct ns_common *ns); + +static __always_inline void __ns_ref_active_get(struct ns_common *ns) +{ + WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); + VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); +} +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) + +static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) +{ + if (atomic_inc_not_zero(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + return true; + } + return false; +} + +#define ns_ref_active_get_owner(__ns) \ + do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) + +void __ns_ref_active_put_owner(struct ns_common *ns); + +static __always_inline void __ns_ref_active_put(struct ns_common *ns) +{ + if (atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(is_initial_namespace(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + __ns_ref_active_put_owner(ns); + } +} +#define ns_ref_active_put(__ns) \ + do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) + +static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); + if (!__ns_ref_active_read(ns)) + return NULL; + if (!__ns_ref_get(ns)) + return NULL; + return ns; +} + +void __ns_ref_active_resurrect(struct ns_common *ns); + +#define ns_ref_active_resurrect(__ns) \ + do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) + #endif diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index e5a5fa83d36b..731b67fc2fec 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -37,4 +37,7 @@ void nsfs_init(void); #define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) +void nsproxy_ns_active_get(struct nsproxy *ns); +void nsproxy_ns_active_put(struct nsproxy *ns); + #endif /* _LINUX_NSFS_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 538ba8dba184..ac825eddec59 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -93,7 +93,10 @@ static inline struct cred *nsset_cred(struct nsset *set) */ int copy_namespaces(u64 flags, struct task_struct *tsk); +void switch_cred_namespaces(const struct cred *old, const struct cred *new); void exit_nsproxy_namespaces(struct task_struct *tsk); +void get_cred_namespaces(struct task_struct *tsk); +void exit_cred_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); diff --git a/kernel/cred.c b/kernel/cred.c index dbf6b687dc5c..a6e7f580df14 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -306,6 +306,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags) kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); return 0; } @@ -343,6 +344,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); + return 0; error_put: @@ -435,10 +438,13 @@ int commit_creds(struct cred *new) */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); + if (new->user_ns != old->user_ns) + switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || diff --git a/kernel/exit.c b/kernel/exit.c index 825998103520..988e16efd66b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -291,6 +291,7 @@ void release_task(struct task_struct *p) write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); + exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); diff --git a/kernel/fork.c b/kernel/fork.c index 0926bfe4b8df..f1857672426e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2487,6 +2487,7 @@ __latent_entropy struct task_struct *copy_process( delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 238402b189f7..abd1ac1a2d02 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -3,6 +3,7 @@ #include #include +#include #include #ifdef CONFIG_DEBUG_VFS @@ -53,6 +54,8 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { + int ret; + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; @@ -69,10 +72,219 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope ns->inum = inum; return 0; } - return proc_alloc_inum(&ns->inum); + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + /* + * Tree ref starts at 0. It's incremented when namespace enters + * active use (installed in nsproxy) and decremented when all + * active uses are gone. Initial namespaces are always active. + */ + if (is_initial_namespace(ns)) + atomic_set(&ns->__ns_ref_active, 1); + else + atomic_set(&ns->__ns_ref_active, 0); + return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } + +static struct ns_common *ns_owner(struct ns_common *ns) +{ + struct user_namespace *owner; + + if (unlikely(!ns->ops)) + return NULL; + VFS_WARN_ON_ONCE(!ns->ops->owner); + owner = ns->ops->owner(ns); + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); + if (!owner) + return NULL; + /* Skip init_user_ns as it's always active */ + if (owner == &init_user_ns) + return NULL; + return to_ns_common(owner); +} + +void __ns_ref_active_get_owner(struct ns_common *ns) +{ + ns = ns_owner(ns); + if (ns) + WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * The iteration stops once we reach a namespace that still has active + * references. + */ +void __ns_ref_active_put_owner(struct ns_common *ns) +{ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + if (!atomic_dec_and_test(&ns->__ns_ref_active)) + return; + } +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. This makes it possible to efficiently + * resurrect a namespace tree: + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Assume the whole tree is dead but all namespaces are still active: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - - - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): + * + * net_ns pid_ns + * \ / + * + - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - + - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If net_ns had a zero reference count and we bumped it we also need to + * take another reference on its owning user namespace. Similarly, if + * pid_ns had a zero reference count it also needs to take another + * reference on its owning user namespace. So both net_ns and pid_ns + * will each have their own reference on the owning user namespace. + * + * If the owning user namespace user_ns1 had a zero reference count then + * it also needs to take another reference on its owning user namespace + * and so on. + */ +void __ns_ref_active_resurrect(struct ns_common *ns) +{ + /* If we didn't resurrect the namespace we're done. */ + if (atomic_fetch_add(1, &ns->__ns_ref_active)) + return; + + /* + * We did resurrect it. Walk the ownership hierarchy upwards + * until we found an owning user namespace that is active. + */ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + + if (atomic_fetch_add(1, &ns->__ns_ref_active)) + return; + } +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 6ce76a0278ab..94c2cfe0afa1 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include #include #include +#include static struct kmem_cache *nsproxy_cachep; @@ -179,12 +180,15 @@ int copy_namespaces(u64 flags, struct task_struct *tsk) if ((flags & CLONE_VM) == 0) timens_on_fork(new_ns, tsk); + nsproxy_ns_active_get(new_ns); tsk->nsproxy = new_ns; return 0; } void free_nsproxy(struct nsproxy *ns) { + nsproxy_ns_active_put(ns); + put_mnt_ns(ns->mnt_ns); put_uts_ns(ns->uts_ns); put_ipc_ns(ns->ipc_ns); @@ -232,6 +236,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + if (new) + nsproxy_ns_active_get(new); + task_lock(p); ns = p->nsproxy; p->nsproxy = new; @@ -246,6 +253,22 @@ void exit_nsproxy_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +void switch_cred_namespaces(const struct cred *old, const struct cred *new) +{ + ns_ref_active_get(new->user_ns); + ns_ref_active_put(old->user_ns); +} + +void get_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_get(tsk->real_cred->user_ns); +} + +void exit_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_put(tsk->real_cred->user_ns); +} + int exec_task_namespaces(void) { struct task_struct *tsk = current; diff --git a/kernel/nstree.c b/kernel/nstree.c index 4eabab5fceaf..e2a537785128 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -123,6 +123,14 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_sequnlock(&ns_tree->ns_tree_lock); VFS_WARN_ON_ONCE(node); + + /* + * Take an active reference on the owner namespace. This ensures + * that the owner remains visible while any of its child namespaces + * are active. For init namespaces this is a no-op as ns_owner() + * returns NULL for namespaces owned by init_user_ns. + */ + __ns_ref_active_get_owner(ns); } void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) diff --git a/kernel/pid.c b/kernel/pid.c index 19d4599c136c..a5a63dc0a491 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -112,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; + struct pid_namespace *active_ns; lockdep_assert_not_held(&tasklist_lock); + active_ns = pid->numbers[pid->level].ns; + ns_ref_active_put(active_ns); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; @@ -278,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } spin_unlock(&pidmap_lock); idr_preload_end(); + ns_ref_active_get(ns); return pid;