linux/kernel/nscommon.c

291 lines
8.3 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/ns_common.h>
#include <linux/proc_ns.h>
#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>
#ifdef CONFIG_DEBUG_VFS
/*
 * Debug-only sanity check: warn (once per namespace type) if the
 * proc_ns_operations handed to the initializer are not the ones that
 * belong to the namespace type recorded in @ns->ns_type.
 *
 * Types whose support is compiled out, and any unrecognized type, are
 * silently ignored.
 */
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
{
	switch (ns->ns_type) {
#ifdef CONFIG_CGROUPS
	case CLONE_NEWCGROUP:
		VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
		break;
#endif
#ifdef CONFIG_IPC_NS
	case CLONE_NEWIPC:
		VFS_WARN_ON_ONCE(ops != &ipcns_operations);
		break;
#endif
	case CLONE_NEWNS:
		VFS_WARN_ON_ONCE(ops != &mntns_operations);
		break;
#ifdef CONFIG_NET_NS
	case CLONE_NEWNET:
		VFS_WARN_ON_ONCE(ops != &netns_operations);
		break;
#endif
#ifdef CONFIG_PID_NS
	case CLONE_NEWPID:
		VFS_WARN_ON_ONCE(ops != &pidns_operations);
		break;
#endif
#ifdef CONFIG_TIME_NS
	case CLONE_NEWTIME:
		VFS_WARN_ON_ONCE(ops != &timens_operations);
		break;
#endif
#ifdef CONFIG_USER_NS
	case CLONE_NEWUSER:
		VFS_WARN_ON_ONCE(ops != &userns_operations);
		break;
#endif
#ifdef CONFIG_UTS_NS
	case CLONE_NEWUTS:
		VFS_WARN_ON_ONCE(ops != &utsns_operations);
		break;
#endif
	default:
		/* Nothing to verify for this type. */
		break;
	}
}
#endif
/**
 * __ns_common_init - initialize the common part of a namespace
 * @ns:      namespace to initialize
 * @ns_type: namespace type (one of the CLONE_NEW* values)
 * @ops:     proc namespace operations for this namespace
 * @inum:    inode number to use, or 0 to have one allocated
 *
 * Sets the reference count to 1, clears the stashed pointer and the
 * namespace id, records @ops and @ns_type, and initializes the tree and
 * list nodes. The inode number is either taken from @inum or allocated
 * via proc_alloc_inum(). Finally the active reference count is set up.
 *
 * Return: 0 on success, or the error returned by proc_alloc_inum().
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
	int ret = 0;

	refcount_set(&ns->__ns_ref, 1);
	ns->stashed = NULL;
	ns->ops = ops;
	ns->ns_id = 0;
	ns->ns_type = ns_type;
	RB_CLEAR_NODE(&ns->ns_tree_node);
	INIT_LIST_HEAD(&ns->ns_list_node);

#ifdef CONFIG_DEBUG_VFS
	ns_debug(ns, ops);
#endif

	/*
	 * Do not return early when the caller supplies the inode number:
	 * the active reference count below must be initialized on that
	 * path as well.
	 */
	if (inum)
		ns->inum = inum;
	else
		ret = proc_alloc_inum(&ns->inum);
	if (ret)
		return ret;

	/*
	 * Tree ref starts at 0. It's incremented when namespace enters
	 * active use (installed in nsproxy) and decremented when all
	 * active uses are gone. Initial namespaces are always active.
	 */
	if (is_initial_namespace(ns))
		atomic_set(&ns->__ns_ref_active, 1);
	else
		atomic_set(&ns->__ns_ref_active, 0);

	return 0;
}
/**
 * __ns_common_free - release the inode number held by a namespace
 * @ns: namespace whose inode number is handed to proc_free_inum()
 */
void __ns_common_free(struct ns_common *ns)
{
	proc_free_inum(ns->inum);
}
static struct ns_common *ns_owner(struct ns_common *ns)
{
struct user_namespace *owner;
if (unlikely(!ns->ops))
return NULL;
VFS_WARN_ON_ONCE(!ns->ops->owner);
owner = ns->ops->owner(ns);
VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
if (!owner)
return NULL;
/* Skip init_user_ns as it's always active */
if (owner == &init_user_ns)
return NULL;
return to_ns_common(owner);
}
/*
 * Take one active reference on the user namespace owning @ns.
 *
 * Nothing is done when ns_owner() yields no namespace to account on.
 * A warning is emitted (once) if the owner's active count is negative
 * after the increment.
 */
void __ns_ref_active_get_owner(struct ns_common *ns)
{
	struct ns_common *owner = ns_owner(ns);

	if (!owner)
		return;

	WARN_ON_ONCE(atomic_add_negative(1, &owner->__ns_ref_active));
}
/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
 * That single reference is only released once the child namespace's
 * own active count drops to zero.
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                ipc_ns      |     uts_ns
 *                      \     |    /
 *                       +    +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                ipc_ns      |     uts_ns
 *                      \     |    /
 *                       +    -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * The walk stops as soon as it reaches a namespace that still has
 * active references, or when there is no further owner to account on.
 */
void __ns_ref_active_put_owner(struct ns_common *ns)
{
	struct ns_common *owner = ns_owner(ns);

	/*
	 * Drop one active reference per level. Only continue upwards when
	 * the reference just dropped was the last one on that level.
	 */
	while (owner && atomic_dec_and_test(&owner->__ns_ref_active))
		owner = ns_owner(owner);
}
/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
 * That single reference is only released once the child namespace's
 * own active count drops to zero. This makes it possible to efficiently
 * resurrect a namespace tree.
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                ipc_ns      |     uts_ns
 *                      \     |    /
 *                       +    +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                ipc_ns      |     uts_ns
 *                      \     |    /
 *                       +    -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Assume every active reference in the tree has been dropped while the
 * namespaces themselves still exist:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                ipc_ns      |     uts_ns
 *                      \     |    /
 *                       -    -   -
 *                        user_ns2 (0)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      -
 *                        user_ns1 (1)
 *                            |
 *                ipc_ns      |     uts_ns
 *                      \     |    /
 *                       -    +   -
 *                        user_ns2 (1)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If net_ns had a zero active reference count and we bumped it, we also
 * need to take another reference on its owning user namespace.
 * Similarly, if pid_ns had a zero active reference count it also needs
 * to take another reference on its owning user namespace. So both
 * net_ns and pid_ns will each have their own reference on the owning
 * user namespace.
 *
 * If the owning user namespace user_ns1 had a zero active reference
 * count then it also needs to take another reference on its own owning
 * user namespace, and so on.
 */
void __ns_ref_active_resurrect(struct ns_common *ns)
{
	do {
		/*
		 * Bump this level. A nonzero previous value means it was
		 * already active, so nothing above it needs to change.
		 */
		if (atomic_fetch_add(1, &ns->__ns_ref_active))
			return;

		/*
		 * This level was just brought back from zero: it now owes
		 * its owning user namespace one active reference.
		 */
		ns = ns_owner(ns);
	} while (ns);
}