From de7342228b7343774d6a9981c2ddbfb5e201044b Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 4 Oct 2025 22:23:29 +0800 Subject: [PATCH 1/6] bpf: Finish constification of 1st parameter of bpf_d_path() The commit 1b8abbb12128 ("bpf...d_path(): constify path argument") constified the first parameter of the bpf_d_path(), but failed to update it in all places. Finish constification. Otherwise the selftest fail to build: .../selftests/bpf/bpf_experimental.h:222:12: error: conflicting types for 'bpf_path_d_path' 222 | extern int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) __ksym; | ^ .../selftests/bpf/tools/include/vmlinux.h:153922:12: note: previous declaration is here 153922 | extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __weak __ksym; Fixes: 1b8abbb12128 ("bpf...d_path(): constify path argument") Signed-off-by: Rong Tao Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- scripts/bpf_doc.py | 1 + tools/include/uapi/linux/bpf.h | 2 +- tools/testing/selftests/bpf/progs/verifier_vfs_accept.c | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ae83d8649ef1..6829936d33f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4891,7 +4891,7 @@ union bpf_attr { * * **-ENOENT** if the bpf_local_storage cannot be found. * - * long bpf_d_path(struct path *path, char *buf, u32 sz) + * long bpf_d_path(const struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index c77dc40f7689..15d113a1bc1d 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -788,6 +788,7 @@ class PrinterHelpersHeader(Printer): 'struct task_struct', 'struct cgroup', 'struct path', + 'const struct path', 'struct btf_ptr', 'struct inode', 'struct socket', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ae83d8649ef1..6829936d33f5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4891,7 +4891,7 @@ union bpf_attr { * * **-ENOENT** if the bpf_local_storage cannot be found. * - * long bpf_d_path(struct path *path, char *buf, u32 sz) + * long bpf_d_path(const struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c index 3e2d76ee8050..55398c04290a 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -70,7 +70,7 @@ __success int BPF_PROG(path_d_path_from_file_argument, struct file *file) { int ret; - struct path *path; + const struct path *path; /* The f_path member is a path which is embedded directly within a * file. Therefore, a pointer to such embedded members are still From 4d920ed684392ae064af62957d6f5a90312dfaf6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 5 Oct 2025 18:20:37 -0700 Subject: [PATCH 2/6] libbpf: Fix undefined behavior in {get,put}_unaligned_be32() These violate aliasing rules and may be miscompiled unless -fno-strict-aliasing is used. Replace them with the standard memcpy() solution. Note that compilers know how to optimize this properly. Fixes: 4a1c9e544b8d ("libbpf: remove linux/unaligned.h dependency for libbpf_sha256()") Signed-off-by: Eric Biggers Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20251006012037.159295-1-ebiggers@kernel.org --- tools/lib/bpf/libbpf_utils.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/libbpf_utils.c b/tools/lib/bpf/libbpf_utils.c index 5d66bc6ff098..ac3beae54cf6 100644 --- a/tools/lib/bpf/libbpf_utils.c +++ b/tools/lib/bpf/libbpf_utils.c @@ -148,16 +148,20 @@ const char *libbpf_errstr(int err) } } -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wpacked" -#pragma GCC diagnostic ignored "-Wattributes" -struct __packed_u32 { __u32 __val; } __attribute__((packed)); -#pragma GCC diagnostic pop +static inline __u32 get_unaligned_be32(const void *p) +{ + __be32 val; -#define get_unaligned_be32(p) be32_to_cpu((((struct __packed_u32 *)(p))->__val)) -#define put_unaligned_be32(v, p) do { \ - ((struct __packed_u32 *)(p))->__val = cpu_to_be32(v); \ -} while (0) + memcpy(&val, p, sizeof(val)); + return be32_to_cpu(val); +} + +static inline void put_unaligned_be32(__u32 val, void *p) +{ + __be32 be_val = cpu_to_be32(val); + + memcpy(p, &be_val, sizeof(be_val)); +} #define SHA256_BLOCK_LENGTH 64 #define Ch(x, y, z) (((x) & (y)) ^ (~(x) & (z))) From 23f3770e1a53e6c7a553135011f547209e141e72 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Oct 2025 09:34:18 +0200 Subject: [PATCH 3/6] bpf: Fix metadata_dst leak __bpf_redirect_neigh_v{4,6} Cilium has a BPF egress gateway feature which forces outgoing K8s Pod traffic to pass through dedicated egress gateways which then SNAT the traffic in order to interact with stable IPs outside the cluster. The traffic is directed to the gateway via vxlan tunnel in collect md mode. A recent BPF change utilized the bpf_redirect_neigh() helper to forward packets after the arrival and decap on vxlan, which turned out over time that the kmalloc-256 slab usage in kernel was ever-increasing. The issue was that vxlan allocates the metadata_dst object and attaches it through a fake dst entry to the skb. The latter was never released though given bpf_redirect_neigh() was merely setting the new dst entry via skb_dst_set() without dropping an existing one first. Fixes: b4ab31414970 ("bpf: Add redirect_neigh helper as redirect drop-in") Reported-by: Yusuke Suzuki Reported-by: Julian Wiedmann Signed-off-by: Daniel Borkmann Cc: Martin KaFai Lau Cc: Jakub Kicinski Cc: Jordan Rife Reviewed-by: Simon Horman Reviewed-by: Jordan Rife Reviewed-by: Jakub Kicinski Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20251003073418.291171-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 5d1838ff1ab9..76628df1fc82 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2281,6 +2281,7 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(dst)) goto out_drop; + skb_dst_drop(skb); skb_dst_set(skb, dst); } else if (nh->nh_family != AF_INET6) { goto out_drop; @@ -2389,6 +2390,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, goto out_drop; } + skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } From 07ca98f906a403637fc5e513a872a50ef1247f3b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 8 Oct 2025 18:56:59 +0200 Subject: [PATCH 4/6] xsk: Harden userspace-supplied xdp_desc validation Turned out certain clearly invalid values passed in xdp_desc from userspace can pass xp_{,un}aligned_validate_desc() and then lead to UBs or just invalid frames to be queued for xmit. desc->len close to ``U32_MAX`` with a non-zero pool->tx_metadata_len can cause positive integer overflow and wraparound, the same way low enough desc->addr with a non-zero pool->tx_metadata_len can cause negative integer overflow. Both scenarios can then pass the validation successfully. This doesn't happen with valid XSk applications, but can be used to perform attacks. Always promote desc->len to ``u64`` first to exclude positive overflows of it. Use explicit check_{add,sub}_overflow() when validating desc->addr (which is ``u64`` already). bloat-o-meter reports a little growth of the code size: add/remove: 0/0 grow/shrink: 2/1 up/down: 60/-16 (44) Function old new delta xskq_cons_peek_desc 299 330 +31 xsk_tx_peek_release_desc_batch 973 1002 +29 xsk_generic_xmit 3148 3132 -16 but hopefully this doesn't hurt the performance much. Fixes: 341ac980eab9 ("xsk: Support tx_metadata_len") Cc: stable@vger.kernel.org # 6.8+ Signed-off-by: Alexander Lobakin Reviewed-by: Jason Xing Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20251008165659.4141318-1-aleksander.lobakin@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk_queue.h | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index f16f390370dc..1eb8d9f8b104 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -143,14 +143,24 @@ static inline bool xp_unused_options_set(u32 options) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = desc->addr - pool->tx_metadata_len; - u64 len = desc->len + pool->tx_metadata_len; - u64 offset = addr & (pool->chunk_size - 1); + u64 len = desc->len; + u64 addr, offset; - if (!desc->len) + if (!len) return false; - if (offset + len > pool->chunk_size) + /* Can overflow if desc->addr < pool->tx_metadata_len */ + if (check_sub_overflow(desc->addr, pool->tx_metadata_len, &addr)) + return false; + + offset = addr & (pool->chunk_size - 1); + + /* + * Can't overflow: @offset is guaranteed to be < ``U32_MAX`` + * (pool->chunk_size is ``u32``), @len is guaranteed + * to be <= ``U32_MAX``. + */ + if (offset + len + pool->tx_metadata_len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt) @@ -158,27 +168,42 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, if (xp_unused_options_set(desc->options)) return false; + return true; } static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; - u64 len = desc->len + pool->tx_metadata_len; + u64 len = desc->len; + u64 addr, end; - if (!desc->len) + if (!len) return false; + /* Can't overflow: @len is guaranteed to be <= ``U32_MAX`` */ + len += pool->tx_metadata_len; if (len > pool->chunk_size) return false; - if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, len)) + /* Can overflow if desc->addr is close to 0 */ + if (check_sub_overflow(xp_unaligned_add_offset_to_addr(desc->addr), + pool->tx_metadata_len, &addr)) + return false; + + if (addr >= pool->addrs_cnt) + return false; + + /* Can overflow if pool->addrs_cnt is high enough */ + if (check_add_overflow(addr, len, &end) || end > pool->addrs_cnt) + return false; + + if (xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) return false; + return true; } From 4f375ade6aa9f37fd72d7a78682f639772089eed Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Wed, 8 Oct 2025 18:26:26 +0800 Subject: [PATCH 5/6] bpf: Avoid RCU context warning when unpinning htab with internal structs When unpinning a BPF hash table (htab or htab_lru) that contains internal structures (timer, workqueue, or task_work) in its values, a BUG warning is triggered: BUG: sleeping function called from invalid context at kernel/bpf/hashtab.c:244 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 14, name: ksoftirqd/0 ... The issue arises from the interaction between BPF object unpinning and RCU callback mechanisms: 1. BPF object unpinning uses ->free_inode() which schedules cleanup via call_rcu(), deferring the actual freeing to an RCU callback that executes within the RCU_SOFTIRQ context. 2. During cleanup of hash tables containing internal structures, htab_map_free_internal_structs() is invoked, which includes cond_resched() or cond_resched_rcu() calls to yield the CPU during potentially long operations. However, cond_resched() or cond_resched_rcu() cannot be safely called from atomic RCU softirq context, leading to the BUG warning when attempting to reschedule. Fix this by changing from ->free_inode() to ->destroy_inode() and rename bpf_free_inode() to bpf_destroy_inode() for BPF objects (prog, map, link). This allows direct inode freeing without RCU callback scheduling, avoiding the invalid context warning. Reported-by: Le Chen Closes: https://lore.kernel.org/all/1444123482.1827743.1750996347470.JavaMail.zimbra@sjtu.edu.cn/ Fixes: 68134668c17f ("bpf: Add map side support for bpf timers.") Suggested-by: Alexei Starovoitov Signed-off-by: KaFai Wan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251008102628.808045-2-kafai.wan@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f90bdcc0a047..81780bcf8d25 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -775,7 +775,7 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_free_inode(struct inode *inode) +static void bpf_destroy_inode(struct inode *inode) { enum bpf_type type; @@ -790,7 +790,7 @@ const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .show_options = bpf_show_options, - .free_inode = bpf_free_inode, + .destroy_inode = bpf_destroy_inode, }; enum { From accb9a7e87f096a12eb21256107b9c8e343f8019 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Wed, 8 Oct 2025 18:26:27 +0800 Subject: [PATCH 6/6] selftests/bpf: Add test for unpinning htab with internal timer struct Add test to verify that unpinning hash tables containing internal timer structures does not trigger context warnings. Each subtest (timer_prealloc and timer_no_prealloc) can trigger the context warning when unpinning, but the warning cannot be triggered twice within a short time interval (a HZ), which is expected behavior. Signed-off-by: KaFai Wan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251008102628.808045-3-kafai.wan@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pinning_htab.c | 36 +++++++++++++++++++ .../selftests/bpf/progs/test_pinning_htab.c | 25 +++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/pinning_htab.c create mode 100644 tools/testing/selftests/bpf/progs/test_pinning_htab.c diff --git a/tools/testing/selftests/bpf/prog_tests/pinning_htab.c b/tools/testing/selftests/bpf/prog_tests/pinning_htab.c new file mode 100644 index 000000000000..16bd74be3dbe --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/pinning_htab.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "test_pinning_htab.skel.h" + +static void unpin_map(const char *map_name, const char *pin_path) +{ + struct test_pinning_htab *skel; + struct bpf_map *map; + int err; + + skel = test_pinning_htab__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open_and_load")) + return; + + map = bpf_object__find_map_by_name(skel->obj, map_name); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name")) + goto out; + + err = bpf_map__pin(map, pin_path); + if (!ASSERT_OK(err, "bpf_map__pin")) + goto out; + + err = bpf_map__unpin(map, pin_path); + ASSERT_OK(err, "bpf_map__unpin"); +out: + test_pinning_htab__destroy(skel); +} + +void test_pinning_htab(void) +{ + if (test__start_subtest("timer_prealloc")) + unpin_map("timer_prealloc", "/sys/fs/bpf/timer_prealloc"); + if (test__start_subtest("timer_no_prealloc")) + unpin_map("timer_no_prealloc", "/sys/fs/bpf/timer_no_prealloc"); +} diff --git a/tools/testing/selftests/bpf/progs/test_pinning_htab.c b/tools/testing/selftests/bpf/progs/test_pinning_htab.c new file mode 100644 index 000000000000..ae227930c73c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_pinning_htab.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct timer_val { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u32); + __type(value, struct timer_val); + __uint(max_entries, 1); +} timer_prealloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u32); + __type(value, struct timer_val); + __uint(max_entries, 1); + __uint(map_flags, BPF_F_NO_PREALLOC); +} timer_no_prealloc SEC(".maps");