From 094e94d13b606b820e3d1383e3a361f680ff023a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Thu, 18 Sep 2025 12:04:34 +1000 Subject: [PATCH 1/4] memfd,selinux: call security_inode_init_security_anon() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prior to this change, no security hooks were called at the creation of a memfd file. It means that, for SELinux as an example, it will receive the default type of the filesystem that backs the in-memory inode. In most cases, that would be tmpfs, but if MFD_HUGETLB is passed, it will be hugetlbfs. Both can be considered implementation details of memfd. It also means that it is not possible to differentiate between a file coming from memfd_create and a file coming from a standard tmpfs mount point. Additionally, no permission is validated at creation, which differs from the similar memfd_secret syscall. Call security_inode_init_security_anon during creation. This ensures that the file is setup similarly to other anonymous inodes. On SELinux, it means that the file will receive the security context of its task. The ability to limit fexecve on memfd has been of interest to avoid potential pitfalls where /proc/self/exe or similar would be executed [1][2]. Reuse the "execute_no_trans" and "entrypoint" access vectors, similarly to the file class. These access vectors may not make sense for the existing "anon_inode" class. Therefore, define and assign a new class "memfd_file" to support such access vectors. Guard these changes behind a new policy capability named "memfd_class". [1] https://crbug.com/1305267 [2] https://lore.kernel.org/lkml/20221215001205.51969-1-jeffxu@google.com/ Signed-off-by: ThiƩbaud Weksteen Reviewed-by: Stephen Smalley Tested-by: Stephen Smalley Acked-by: Hugh Dickins [PM: subj tweak] Signed-off-by: Paul Moore --- include/linux/memfd.h | 2 ++ mm/memfd.c | 14 ++++++++++-- security/selinux/hooks.c | 26 +++++++++++++++++----- security/selinux/include/classmap.h | 2 ++ security/selinux/include/policycap.h | 1 + security/selinux/include/policycap_names.h | 1 + security/selinux/include/security.h | 5 +++++ 7 files changed, 44 insertions(+), 7 deletions(-) diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 6f606d9573c3..cc74de3dbcfe 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -4,6 +4,8 @@ #include +#define MEMFD_ANON_NAME "[memfd]" + #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); diff --git a/mm/memfd.c b/mm/memfd.c index 1d109c1acf21..a61acbe5ded3 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -433,6 +433,8 @@ static struct file *alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; + struct inode *inode; + int err = 0; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, @@ -444,12 +446,20 @@ static struct file *alloc_file(const char *name, unsigned int flags) } if (IS_ERR(file)) return file; + + inode = file_inode(file); + err = security_inode_init_security_anon(inode, + &QSTR(MEMFD_ANON_NAME), NULL); + if (err) { + fput(file); + file = ERR_PTR(err); + return file; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { - struct inode *inode = file_inode(file); - inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index dfc22da42f30..a22b1920242f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -93,6 +93,7 @@ #include #include #include +#include #include "avc.h" #include "objsec.h" @@ -2319,6 +2320,10 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) new_tsec = selinux_cred(bprm->cred); isec = inode_security(inode); + if (WARN_ON(isec->sclass != SECCLASS_FILE && + isec->sclass != SECCLASS_MEMFD_FILE)) + return -EACCES; + /* Default to the current task SID. */ new_tsec->sid = old_tsec->sid; new_tsec->osid = old_tsec->sid; @@ -2371,8 +2376,8 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) ad.u.file = bprm->file; if (new_tsec->sid == old_tsec->sid) { - rc = avc_has_perm(old_tsec->sid, isec->sid, - SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); + rc = avc_has_perm(old_tsec->sid, isec->sid, isec->sclass, + FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { @@ -2382,8 +2387,8 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) if (rc) return rc; - rc = avc_has_perm(new_tsec->sid, isec->sid, - SECCLASS_FILE, FILE__ENTRYPOINT, &ad); + rc = avc_has_perm(new_tsec->sid, isec->sid, isec->sclass, + FILE__ENTRYPOINT, &ad); if (rc) return rc; @@ -2978,10 +2983,18 @@ static int selinux_inode_init_security_anon(struct inode *inode, struct common_audit_data ad; struct inode_security_struct *isec; int rc; + bool is_memfd = false; if (unlikely(!selinux_initialized())) return 0; + if (name != NULL && name->name != NULL && + !strcmp(name->name, MEMFD_ANON_NAME)) { + if (!selinux_policycap_memfd_class()) + return 0; + is_memfd = true; + } + isec = selinux_inode(inode); /* @@ -3001,7 +3014,10 @@ static int selinux_inode_init_security_anon(struct inode *inode, isec->sclass = context_isec->sclass; isec->sid = context_isec->sid; } else { - isec->sclass = SECCLASS_ANON_INODE; + if (is_memfd) + isec->sclass = SECCLASS_MEMFD_FILE; + else + isec->sclass = SECCLASS_ANON_INODE; rc = security_transition_sid( sid, sid, isec->sclass, name, &isec->sid); diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 5665aa5e7853..3ec85142771f 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -179,6 +179,8 @@ const struct security_class_mapping secclass_map[] = { { "anon_inode", { COMMON_FILE_PERMS, NULL } }, { "io_uring", { "override_creds", "sqpoll", "cmd", "allowed", NULL } }, { "user_namespace", { "create", NULL } }, + { "memfd_file", + { COMMON_FILE_PERMS, "execute_no_trans", "entrypoint", NULL } }, /* last one */ { NULL, {} } }; diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h index 135a969f873c..231d02227e59 100644 --- a/security/selinux/include/policycap.h +++ b/security/selinux/include/policycap.h @@ -18,6 +18,7 @@ enum { POLICYDB_CAP_NETIF_WILDCARD, POLICYDB_CAP_GENFS_SECLABEL_WILDCARD, POLICYDB_CAP_FUNCTIONFS_SECLABEL, + POLICYDB_CAP_MEMFD_CLASS, __POLICYDB_CAP_MAX }; #define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1) diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h index ff8882887651..454dab37bda3 100644 --- a/security/selinux/include/policycap_names.h +++ b/security/selinux/include/policycap_names.h @@ -21,6 +21,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = { "netif_wildcard", "genfs_seclabel_wildcard", "functionfs_seclabel", + "memfd_class", }; /* clang-format on */ diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 0f954a40d3fc..5d1dad8058b1 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -209,6 +209,11 @@ static inline bool selinux_policycap_functionfs_seclabel(void) selinux_state.policycap[POLICYDB_CAP_FUNCTIONFS_SECLABEL]); } +static inline bool selinux_policycap_memfd_class(void) +{ + return READ_ONCE(selinux_state.policycap[POLICYDB_CAP_MEMFD_CLASS]); +} + struct selinux_policy_convert_data; struct selinux_load_state { From 641e0217586193bbd6dbc16ae73d0c9ecda535f1 Mon Sep 17 00:00:00 2001 From: Hongru Zhang Date: Thu, 23 Oct 2025 19:29:19 +0800 Subject: [PATCH 2/4] selinux: Introduce a new config to make avc cache slot size adjustable On mobile device high-load situations, permission check can happen more than 90,000/s (8 core system). With default 512 cache nodes configuration, avc cache miss happens more often and occasionally leads to long time (>2ms) irqs off on both big and little cores, which decreases system real-time capability. An actual call stack is as follows: => avc_compute_av => avc_perm_nonode => avc_has_perm_noaudit => selinux_capable => security_capable => capable => __sched_setscheduler => do_sched_setscheduler => __arm64_sys_sched_setscheduler => invoke_syscall => el0_svc_common => do_el0_svc => el0_svc => el0t_64_sync_handler => el0t_64_sync Although we can expand avc nodes through /sys/fs/selinux/cache_threshold to mitigate long time irqs off, hash conflicts make the bucket average length longer because of the fixed size of cache slots, leading to avc_search_node() latency increase. So introduce a new config to make avc cache slot size also configurable, and with fine tuning, we can mitigate long time irqs off with slightly avc_search_node() performance regression. Theoretically, the main overhead is memory consumption. Signed-off-by: Hongru Zhang Signed-off-by: Paul Moore --- security/selinux/Kconfig | 11 +++++++++++ security/selinux/avc.c | 6 +++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index 61abc1e094a8..5588c4d573f6 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig @@ -69,6 +69,17 @@ config SECURITY_SELINUX_SID2STR_CACHE_SIZE If unsure, keep the default value. +config SECURITY_SELINUX_AVC_HASH_BITS + int "SELinux avc hashtable size" + depends on SECURITY_SELINUX + range 9 14 + default 9 + help + This option sets the number of buckets used in the AVC hash table + to 2^SECURITY_SELINUX_AVC_HASH_BITS. A higher value helps maintain + shorter chain lengths especially when expanding AVC nodes via + /sys/fs/selinux/avc/cache_threshold. + config SECURITY_SELINUX_DEBUG bool "SELinux kernel debugging support" depends on SECURITY_SELINUX diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 430b0e23ee00..c12d45e46db6 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -34,9 +34,9 @@ #define CREATE_TRACE_POINTS #include -#define AVC_CACHE_SLOTS 512 -#define AVC_DEF_CACHE_THRESHOLD 512 -#define AVC_CACHE_RECLAIM 16 +#define AVC_CACHE_SLOTS (1 << CONFIG_SECURITY_SELINUX_AVC_HASH_BITS) +#define AVC_DEF_CACHE_THRESHOLD AVC_CACHE_SLOTS +#define AVC_CACHE_RECLAIM 16 #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS #define avc_cache_stats_incr(field) this_cpu_inc(avc_cache_stats.field) From 929126ef4a0e2723622eb3ba11017ca5fecd37d3 Mon Sep 17 00:00:00 2001 From: Hongru Zhang Date: Thu, 23 Oct 2025 19:29:54 +0800 Subject: [PATCH 3/4] selinux: Move avtab_hash() to a shared location for future reuse This is a preparation patch, no functional change. Signed-off-by: Hongru Zhang Signed-off-by: Paul Moore --- security/selinux/include/hash.h | 46 +++++++++++++++++++++++++++++++++ security/selinux/ss/avtab.c | 41 +---------------------------- 2 files changed, 47 insertions(+), 40 deletions(-) create mode 100644 security/selinux/include/hash.h diff --git a/security/selinux/include/hash.h b/security/selinux/include/hash.h new file mode 100644 index 000000000000..5b429a873eb6 --- /dev/null +++ b/security/selinux/include/hash.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _SELINUX_HASH_H_ +#define _SELINUX_HASH_H_ + +/* Based on MurmurHash3, written by Austin Appleby and placed in the + * public domain. + */ +static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask) +{ + static const u32 c1 = 0xcc9e2d51; + static const u32 c2 = 0x1b873593; + static const u32 r1 = 15; + static const u32 r2 = 13; + static const u32 m = 5; + static const u32 n = 0xe6546b64; + + u32 hash = 0; + +#define mix(input) \ + do { \ + u32 v = input; \ + v *= c1; \ + v = (v << r1) | (v >> (32 - r1)); \ + v *= c2; \ + hash ^= v; \ + hash = (hash << r2) | (hash >> (32 - r2)); \ + hash = hash * m + n; \ + } while (0) + + mix(keyp->target_class); + mix(keyp->target_type); + mix(keyp->source_type); + +#undef mix + + hash ^= hash >> 16; + hash *= 0x85ebca6b; + hash ^= hash >> 13; + hash *= 0xc2b2ae35; + hash ^= hash >> 16; + + return hash & mask; +} + +#endif /* _SELINUX_HASH_H_ */ diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index c2c31521cace..15e89d9b5d72 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -20,50 +20,11 @@ #include #include "avtab.h" #include "policydb.h" +#include "hash.h" static struct kmem_cache *avtab_node_cachep __ro_after_init; static struct kmem_cache *avtab_xperms_cachep __ro_after_init; -/* Based on MurmurHash3, written by Austin Appleby and placed in the - * public domain. - */ -static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask) -{ - static const u32 c1 = 0xcc9e2d51; - static const u32 c2 = 0x1b873593; - static const u32 r1 = 15; - static const u32 r2 = 13; - static const u32 m = 5; - static const u32 n = 0xe6546b64; - - u32 hash = 0; - -#define mix(input) \ - do { \ - u32 v = input; \ - v *= c1; \ - v = (v << r1) | (v >> (32 - r1)); \ - v *= c2; \ - hash ^= v; \ - hash = (hash << r2) | (hash >> (32 - r2)); \ - hash = hash * m + n; \ - } while (0) - - mix(keyp->target_class); - mix(keyp->target_type); - mix(keyp->source_type); - -#undef mix - - hash ^= hash >> 16; - hash *= 0x85ebca6b; - hash ^= hash >> 13; - hash *= 0xc2b2ae35; - hash ^= hash >> 16; - - return hash & mask; -} - static struct avtab_node *avtab_insert_node(struct avtab *h, struct avtab_node **dst, const struct avtab_key *key, From 20d387d7ceab95aade436c363927b3ab81b0be36 Mon Sep 17 00:00:00 2001 From: Hongru Zhang Date: Thu, 23 Oct 2025 19:30:18 +0800 Subject: [PATCH 4/4] selinux: improve bucket distribution uniformity of avc_hash() Reuse the already implemented MurmurHash3 algorithm. Under heavy stress testing (on an 8-core system sustaining over 50,000 authentication events per second), sample once per second and take the mean of 1800 samples: 1. Bucket utilization rate and length of longest chain +--------------------------+-----------------------------------------+ | | bucket utilization rate / longest chain | | +--------------------+--------------------+ | | no-patch | with-patch | +--------------------------+--------------------+--------------------+ | 512 nodes, 512 buckets | 52.5%/7.5 | 60.2%/5.7 | +--------------------------+--------------------+--------------------+ | 1024 nodes, 512 buckets | 68.9%/12.1 | 80.2%/9.7 | +--------------------------+--------------------+--------------------+ | 2048 nodes, 512 buckets | 83.7%/19.4 | 93.4%/16.3 | +--------------------------+--------------------+--------------------+ | 8192 nodes, 8192 buckets | 49.5%/11.4 | 60.3%/7.4 | +--------------------------+--------------------+--------------------+ 2. avc_search_node latency (total latency of hash operation and table lookup) +--------------------------+-----------------------------------------+ | | latency of function avc_search_node | | +--------------------+--------------------+ | | no-patch | with-patch | +--------------------------+--------------------+--------------------+ | 512 nodes, 512 buckets | 87ns | 84ns | +--------------------------+--------------------+--------------------+ | 1024 nodes, 512 buckets | 97ns | 96ns | +--------------------------+--------------------+--------------------+ | 2048 nodes, 512 buckets | 118ns | 113ns | +--------------------------+--------------------+--------------------+ | 8192 nodes, 8192 buckets | 106ns | 99ns | +--------------------------+--------------------+--------------------+ Although MurmurHash3 has higher overhead than the bitwise operations in the original algorithm, the data shows that the MurmurHash3 achieves better distribution, reducing average lookup time. Consequently, the total latency of hashing and table lookup is lower than before. Signed-off-by: Hongru Zhang [PM: whitespace fixes] Signed-off-by: Paul Moore --- security/selinux/avc.c | 3 ++- security/selinux/include/hash.h | 11 ++++++----- security/selinux/ss/avtab.c | 6 ++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index c12d45e46db6..8f77b9a732e1 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -30,6 +30,7 @@ #include "avc.h" #include "avc_ss.h" #include "classmap.h" +#include "hash.h" #define CREATE_TRACE_POINTS #include @@ -124,7 +125,7 @@ static struct kmem_cache *avc_xperms_cachep __ro_after_init; static inline u32 avc_hash(u32 ssid, u32 tsid, u16 tclass) { - return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1); + return av_hash(ssid, tsid, (u32)tclass, (u32)(AVC_CACHE_SLOTS - 1)); } /** diff --git a/security/selinux/include/hash.h b/security/selinux/include/hash.h index 5b429a873eb6..18956dbef8ff 100644 --- a/security/selinux/include/hash.h +++ b/security/selinux/include/hash.h @@ -3,10 +3,11 @@ #ifndef _SELINUX_HASH_H_ #define _SELINUX_HASH_H_ -/* Based on MurmurHash3, written by Austin Appleby and placed in the +/* + * Based on MurmurHash3, written by Austin Appleby and placed in the * public domain. */ -static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask) +static inline u32 av_hash(u32 key1, u32 key2, u32 key3, u32 mask) { static const u32 c1 = 0xcc9e2d51; static const u32 c2 = 0x1b873593; @@ -28,9 +29,9 @@ static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask) hash = hash * m + n; \ } while (0) - mix(keyp->target_class); - mix(keyp->target_type); - mix(keyp->source_type); + mix(key1); + mix(key2); + mix(key3); #undef mix diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index 15e89d9b5d72..d12ca337e649 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -25,6 +25,12 @@ static struct kmem_cache *avtab_node_cachep __ro_after_init; static struct kmem_cache *avtab_xperms_cachep __ro_after_init; +static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask) +{ + return av_hash((u32)keyp->target_class, (u32)keyp->target_type, + (u32)keyp->source_type, mask); +} + static struct avtab_node *avtab_insert_node(struct avtab *h, struct avtab_node **dst, const struct avtab_key *key,