net/smc: bpf: Introduce generic hook for handshake flow

The introduction of IPPROTO_SMC enables eBPF programs to determine
whether to use SMC based on the context of socket creation, such as
network namespaces, PID and comm name, etc.

As a subsequent enhancement, introduce a new generic hook that allows
deciding at runtime whether to use SMC or not, based on criteria
including but not limited to local/remote IP addresses or ports.

Users can now write their own implementation via bpf_struct_ops to choose
whether to use SMC or not before the third step of the TCP handshake completes.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Link: https://patch.msgid.link/20251107035632.115950-3-alibuda@linux.alibaba.com
This commit is contained in:
D. Wythe 2025-11-07 11:56:31 +08:00 committed by Martin KaFai Lau
parent 07c428ece3
commit 15f295f556
9 changed files with 355 additions and 14 deletions

View File

@ -17,6 +17,9 @@ struct netns_smc {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *smc_hdr;
#endif
#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
struct smc_hs_ctrl __rcu *hs_ctrl;
#endif /* CONFIG_SMC_HS_CTRL_BPF */
unsigned int sysctl_autocorking_size;
unsigned int sysctl_smcr_buf_type;
int sysctl_smcr_testlink_time;

View File

@ -17,6 +17,8 @@
#include <linux/wait.h>
#include <linux/dibs.h>
struct tcp_sock;
struct inet_request_sock;
struct sock;
#define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */
@ -50,4 +52,55 @@ struct smcd_dev {
u8 going_away : 1;
};
#define SMC_HS_CTRL_NAME_MAX 16
/* Flag bits stored in smc_hs_ctrl::flags. */
enum {
/* A ctrl attached to init_net that carries this flag is also attached
 * to a network namespace other than init_net when that namespace's SMC
 * sysctl state is initialized.
 */
SMC_HS_CTRL_FLAG_INHERITABLE = 0x1,
/* Mask of all currently defined flag bits; used to reject unknown bits. */
SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE,
};
/* struct smc_hs_ctrl - a named set of hooks that decide, per connection,
 * whether the SMC TCP option is used during the TCP handshake.
 */
struct smc_hs_ctrl {
/* private */
/* Linkage in the list of registered ctrls. */
struct list_head list;
/* Passed to bpf_try_module_get()/bpf_module_put() when a network
 * namespace takes or drops its reference on this ctrl.
 */
struct module *owner;
/* public */
/* Unique, NUL-terminated name; ctrls are looked up by this name. */
char name[SMC_HS_CTRL_NAME_MAX];
/* Bitmask of SMC_HS_CTRL_FLAG_* values. */
int flags;
/* Invoked before computing the SMC option for SYN packets.
 * The return value controls whether the SMC option is set:
 * return 0 to disable SMC, or any other value to enable it.
 */
int (*syn_option)(struct tcp_sock *tp);
/* Invoked before setting up the SMC option for SYN-ACK packets.
 * The return value controls whether the SMC option is sent in the
 * response: return 0 to disable SMC, or any other value to enable it.
 */
int (*synack_option)(const struct tcp_sock *tp,
struct inet_request_sock *ireq);
};
/* smc_call_hsbpf - call one hook of the hs_ctrl attached to a socket's netns.
 * @init_val: result when no ctrl is attached or the ctrl leaves @func NULL
 * @tp:       socket pointer; its network namespace selects the ctrl, and it
 *            is passed as the first argument of the hook
 * @func:     name of the struct smc_hs_ctrl member to call
 * @...:      extra arguments forwarded to the hook after @tp
 *
 * The ctrl pointer is fetched with rcu_dereference() and the hook runs
 * between rcu_read_lock() and rcu_read_unlock(). The expression evaluates to
 * the hook's return value, or to @init_val when no hook was called.
 *
 * When CONFIG_SMC_HS_CTRL_BPF is disabled the macro simply evaluates to
 * @init_val.
 */
#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
#define smc_call_hsbpf(init_val, tp, func, ...) ({ \
typeof(init_val) __ret = (init_val); \
struct smc_hs_ctrl *ctrl; \
rcu_read_lock(); \
ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl); \
if (ctrl && ctrl->func) \
__ret = ctrl->func(tp, ##__VA_ARGS__); \
rcu_read_unlock(); \
__ret; \
})
#else
#define smc_call_hsbpf(init_val, tp, ...) ({ (void)(tp); (init_val); })
#endif /* CONFIG_SMC_HS_CTRL_BPF */
#endif /* _SMC_H */

View File

@ -40,6 +40,7 @@
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/mptcp.h>
#include <net/smc.h>
#include <net/proto_memory.h>
#include <net/psp.h>
@ -802,34 +803,36 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
mptcp_options_write(th, ptr, tp, opts);
}
static void smc_set_option(const struct tcp_sock *tp,
static void smc_set_option(struct tcp_sock *tp,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
if (static_branch_unlikely(&tcp_have_smc)) {
if (tp->syn_smc) {
if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
opts->options |= OPTION_SMC;
*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) {
tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option);
/* re-check syn_smc */
if (tp->syn_smc &&
*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
opts->options |= OPTION_SMC;
*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
#endif
}
static void smc_set_option_cond(const struct tcp_sock *tp,
const struct inet_request_sock *ireq,
struct inet_request_sock *ireq,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
if (static_branch_unlikely(&tcp_have_smc)) {
if (tp->syn_smc && ireq->smc_ok) {
if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
opts->options |= OPTION_SMC;
*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) {
ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq);
/* re-check smc_ok */
if (ireq->smc_ok &&
*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
opts->options |= OPTION_SMC;
*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
#endif

View File

@ -19,3 +19,13 @@ config SMC_DIAG
smcss.
if unsure, say Y.
config SMC_HS_CTRL_BPF
bool "Generic eBPF hook for SMC handshake flow"
depends on SMC && BPF_SYSCALL
default y
help
SMC_HS_CTRL_BPF enables support for registering a generic eBPF hook for the
SMC handshake flow, which offers much greater flexibility in modifying the
behavior of the SMC protocol stack compared to a purely kernel-based approach.
Select this option if you want to filter the handshake process via eBPF programs.

View File

@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
smc-y += smc_tracepoint.o smc_inet.o
smc-$(CONFIG_SYSCTL) += smc_sysctl.o
smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o

View File

@ -58,6 +58,7 @@
#include "smc_tracepoint.h"
#include "smc_sysctl.h"
#include "smc_inet.h"
#include "smc_hs_bpf.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
@ -3600,8 +3601,16 @@ static int __init smc_init(void)
pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
goto out_ulp;
}
rc = bpf_smc_hs_ctrl_init();
if (rc) {
pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__,
rc);
goto out_inet;
}
static_branch_enable(&tcp_have_smc);
return 0;
out_inet:
smc_inet_exit();
out_ulp:
tcp_unregister_ulp(&smc_ulp_ops);
out_ib:

140
net/smc/smc_hs_bpf.c Normal file
View File

@ -0,0 +1,140 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* Generic hook for SMC handshake flow.
*
* Copyright IBM Corp. 2016
* Copyright (c) 2025, Alibaba Inc.
*
* Author: D. Wythe <alibuda@linux.alibaba.com>
*/
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/rculist.h>
#include "smc_hs_bpf.h"
/* Held while entries are added to or removed from smc_hs_ctrl_list. */
static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock);
/* All registered handshake ctrls; modified with the RCU list helpers. */
static LIST_HEAD(smc_hs_ctrl_list);
/* Append @ctrl to the list of registered handshake ctrls.
 *
 * Return: 0 on success, -EEXIST when a ctrl with the same name is already
 * on the list.
 */
static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl)
{
	int rc = -EEXIST;

	spin_lock(&smc_hs_ctrl_list_lock);
	/* names are unique: insert only if nobody registered this one yet */
	if (!smc_hs_ctrl_find_by_name(ctrl->name)) {
		list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list);
		rc = 0;
	}
	spin_unlock(&smc_hs_ctrl_list_lock);

	return rc;
}
/* Take @ctrl off the list of registered handshake ctrls and return only
 * after an RCU grace period has elapsed.
 */
static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl)
{
	spin_lock(&smc_hs_ctrl_list_lock);
	list_del_rcu(&ctrl->list);
	spin_unlock(&smc_hs_ctrl_list_lock);

	/* let every RCU reader that may still see the entry finish first */
	synchronize_rcu();
}
/* Walk the registered ctrls and return the first one whose name equals
 * @name, or NULL when there is none. The list is traversed with
 * list_for_each_entry_rcu().
 */
struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name)
{
	struct smc_hs_ctrl *pos;

	list_for_each_entry_rcu(pos, &smc_hs_ctrl_list, list) {
		if (!strcmp(pos->name, name))
			return pos;
	}

	return NULL;
}
/* Stub for smc_hs_ctrl::syn_option; always returns 1. */
static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp)
{
	return 1;
}
/* Stub for smc_hs_ctrl::synack_option; always returns 1. */
static int
__smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp,
				   struct inet_request_sock *ireq)
{
	return 1;
}
/* Stub ctrl whose hooks are the always-return-1 stubs above; referenced as
 * .cfi_stubs of bpf_smc_hs_ctrl_ops.
 */
static struct smc_hs_ctrl __smc_bpf_hs_ctrl = {
.syn_option = __smc_bpf_stub_set_tcp_option,
.synack_option = __smc_bpf_stub_set_tcp_option_cond,
};
/* .init callback of bpf_smc_hs_ctrl_ops; nothing to set up, returns 0. */
static int smc_bpf_hs_ctrl_init(struct btf *btf)
{
	return 0;
}
/* .reg callback of bpf_smc_hs_ctrl_ops.
 *
 * Return: -EOPNOTSUPP when @link is non-NULL, otherwise the result of
 * smc_hs_ctrl_reg() on @kdata.
 */
static int smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link)
{
	/* registration with a non-NULL link is rejected */
	return link ? -EOPNOTSUPP : smc_hs_ctrl_reg(kdata);
}
/* .unreg callback of bpf_smc_hs_ctrl_ops: unregister the ctrl in @kdata.
 * @link is not used.
 */
static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link)
{
	struct smc_hs_ctrl *ctrl = kdata;

	smc_hs_ctrl_unreg(ctrl);
}
/* .init_member callback of bpf_smc_hs_ctrl_ops.
 *
 * Copies the "name" and "flags" members from @udata into @kdata with
 * validation; every other member is left untouched here.
 *
 * Return: 1 when the member was copied by this function, 0 when the member
 * is not one handled here, -EINVAL when the name cannot be copied or the
 * flags contain bits outside SMC_HS_CTRL_ALL_FLAGS.
 */
static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t,
				       const struct btf_member *member,
				       void *kdata, const void *udata)
{
	const struct smc_hs_ctrl *src = udata;
	struct smc_hs_ctrl *dst = kdata;
	/* byte offset of the member inside struct smc_hs_ctrl */
	u32 off = __btf_member_bit_offset(t, member) / 8;

	if (off == offsetof(struct smc_hs_ctrl, name)) {
		if (bpf_obj_name_cpy(dst->name, src->name,
				     sizeof(src->name)) <= 0)
			return -EINVAL;
		return 1;
	}

	if (off == offsetof(struct smc_hs_ctrl, flags)) {
		/* refuse flag bits this kernel does not know about */
		if (src->flags & ~SMC_HS_CTRL_ALL_FLAGS)
			return -EINVAL;
		dst->flags = src->flags;
		return 1;
	}

	return 0;
}
/* .get_func_proto callback of smc_bpf_verifier_ops: defers entirely to
 * bpf_base_func_proto() for @func_id.
 */
static const struct bpf_func_proto *
bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	const struct bpf_func_proto *proto;

	proto = bpf_base_func_proto(func_id, prog);
	return proto;
}
/* Verifier callbacks for smc_hs_ctrl programs: helper lookup goes through
 * bpf_smc_hs_func_proto(), context access checks through
 * bpf_tracing_btf_ctx_access().
 */
static const struct bpf_verifier_ops smc_bpf_verifier_ops = {
.get_func_proto = bpf_smc_hs_func_proto,
.is_valid_access = bpf_tracing_btf_ctx_access,
};
/* struct_ops descriptor for "smc_hs_ctrl"; handed to
 * register_bpf_struct_ops() by bpf_smc_hs_ctrl_init().
 */
static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = {
.name = "smc_hs_ctrl",
.init = smc_bpf_hs_ctrl_init,
.reg = smc_bpf_hs_ctrl_reg,
.unreg = smc_bpf_hs_ctrl_unreg,
.cfi_stubs = &__smc_bpf_hs_ctrl,
.verifier_ops = &smc_bpf_verifier_ops,
.init_member = smc_bpf_hs_ctrl_init_member,
.owner = THIS_MODULE,
};
/* Register the smc_hs_ctrl struct_ops type.
 *
 * Return: the result of register_bpf_struct_ops().
 */
int bpf_smc_hs_ctrl_init(void)
{
	int rc;

	rc = register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl);
	return rc;
}

31
net/smc/smc_hs_bpf.h Normal file
View File

@ -0,0 +1,31 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* Generic hook for SMC handshake flow.
*
* Copyright IBM Corp. 2016
* Copyright (c) 2025, Alibaba Inc.
*
* Author: D. Wythe <alibuda@linux.alibaba.com>
*/
#ifndef __SMC_HS_CTRL
#define __SMC_HS_CTRL
#include <net/smc.h>
/* Find hs_ctrl by the target name, which is required to be a C string.
 * Return NULL if no such ctrl was found; otherwise, return a valid ctrl.
 *
 * Note: Caller MUST ensure it is invoked under rcu_read_lock.
 */
struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name);
#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
int bpf_smc_hs_ctrl_init(void);
#else
static inline int bpf_smc_hs_ctrl_init(void) { return 0; }
#endif /* CONFIG_SMC_HS_CTRL_BPF */
#endif /* __SMC_HS_CTRL */

View File

@ -12,12 +12,14 @@
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/bpf.h>
#include <net/net_namespace.h>
#include "smc.h"
#include "smc_core.h"
#include "smc_llc.h"
#include "smc_sysctl.h"
#include "smc_hs_bpf.h"
static int min_sndbuf = SMC_BUF_MIN_SIZE;
static int min_rcvbuf = SMC_BUF_MIN_SIZE;
@ -32,6 +34,69 @@ static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX;
static unsigned int smcr_max_wr_min = 2;
static unsigned int smcr_max_wr_max = 2048;
#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
/* Attach the ctrl called @name to @net, or detach the current one.
 *
 * A NULL or empty @name clears the ctrl of @net. Otherwise the ctrl is
 * looked up by name and a reference is taken with bpf_try_module_get()
 * before it is installed. The reference held on the previously attached
 * ctrl, if any, is dropped.
 *
 * Return: 0 on success (including "already attached"), -EINVAL when no ctrl
 * with that name is registered, -EBUSY when the reference cannot be taken.
 */
static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name)
{
	struct smc_hs_ctrl *new_ctrl = NULL;
	struct smc_hs_ctrl *old_ctrl;
	int rc = 0;

	rcu_read_lock();

	/* a NULL or empty name asks to clear the current ctrl */
	if (name && name[0]) {
		new_ctrl = smc_hs_ctrl_find_by_name(name);
		if (!new_ctrl) {
			rc = -EINVAL;
			goto out_unlock;
		}

		/* same ctrl as before: nothing to do */
		if (new_ctrl == rcu_dereference(net->smc.hs_ctrl))
			goto out_unlock;

		if (!bpf_try_module_get(new_ctrl, new_ctrl->owner)) {
			rc = -EBUSY;
			goto out_unlock;
		}
	}

	/* atomically exchange the old ctrl with the new one */
	old_ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl,
				      RCU_INITIALIZER(new_ctrl)));
	/* drop the reference that was held on the old ctrl */
	if (old_ctrl)
		bpf_module_put(old_ctrl, old_ctrl->owner);

out_unlock:
	rcu_read_unlock();
	return rc;
}
/* sysctl handler for "hs_ctrl".
 *
 * The name of the ctrl currently attached to the netns (or an empty string)
 * is placed in a local buffer, which proc_dostring() then reads from or
 * writes to. After a successful write the netns ctrl is replaced by the one
 * named in the buffer.
 *
 * Return: a non-zero proc_dostring() result as is; otherwise 0 for reads, or
 * the result of smc_net_replace_smc_hs_ctrl() for writes.
 */
static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write,
			    void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl);
	char val[SMC_HS_CTRL_NAME_MAX];
	/* temporary table so proc_dostring() operates on the local buffer */
	const struct ctl_table tbl = {
		.data = val,
		.maxlen = SMC_HS_CTRL_NAME_MAX,
	};
	struct smc_hs_ctrl *cur;
	int rc;

	/* snapshot the current name while the ctrl is protected by RCU */
	rcu_read_lock();
	cur = rcu_dereference(net->smc.hs_ctrl);
	if (!cur)
		val[0] = '\0';
	else
		memcpy(val, cur->name, sizeof(cur->name));
	rcu_read_unlock();

	rc = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (rc || !write)
		return rc;

	return smc_net_replace_smc_hs_ctrl(net, val);
}
#endif /* CONFIG_SMC_HS_CTRL_BPF */
static struct ctl_table smc_table[] = {
{
.procname = "autocorking_size",
@ -119,6 +184,15 @@ static struct ctl_table smc_table[] = {
.extra1 = &smcr_max_wr_min,
.extra2 = &smcr_max_wr_max,
},
#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
{
.procname = "hs_ctrl",
.data = &init_net.smc.hs_ctrl,
.mode = 0644,
.maxlen = SMC_HS_CTRL_NAME_MAX,
.proc_handler = proc_smc_hs_ctrl,
},
#endif /* CONFIG_SMC_HS_CTRL_BPF */
};
int __net_init smc_sysctl_net_init(struct net *net)
@ -129,6 +203,16 @@ int __net_init smc_sysctl_net_init(struct net *net)
table = smc_table;
if (!net_eq(net, &init_net)) {
int i;
#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
struct smc_hs_ctrl *ctrl;
rcu_read_lock();
ctrl = rcu_dereference(init_net.smc.hs_ctrl);
if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE &&
bpf_try_module_get(ctrl, ctrl->owner))
rcu_assign_pointer(net->smc.hs_ctrl, ctrl);
rcu_read_unlock();
#endif /* CONFIG_SMC_HS_CTRL_BPF */
table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
if (!table)
@ -161,6 +245,9 @@ int __net_init smc_sysctl_net_init(struct net *net)
if (!net_eq(net, &init_net))
kfree(table);
err_alloc:
#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
smc_net_replace_smc_hs_ctrl(net, NULL);
#endif /* CONFIG_SMC_HS_CTRL_BPF */
return -ENOMEM;
}
@ -170,6 +257,10 @@ void __net_exit smc_sysctl_net_exit(struct net *net)
table = net->smc.smc_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->smc.smc_hdr);
#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
smc_net_replace_smc_hs_ctrl(net, NULL);
#endif /* CONFIG_SMC_HS_CTRL_BPF */
if (!net_eq(net, &init_net))
kfree(table);
}