diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 596c306ce52b..b93beb247a11 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -339,6 +339,15 @@ attribute-sets: doc: XSK information for this queue, if any. type: nest nested-attributes: xsk-info + - + name: lease + doc: | + A queue from a virtual device can have a lease which refers to + another queue from a physical device. This is useful for memory + providers and AF_XDP operations which take an ifindex and queue id + to allow applications to bind against virtual devices in containers. + type: nest + nested-attributes: lease - name: qstats doc: | @@ -537,6 +546,26 @@ attribute-sets: name: id - name: type + - + name: lease + attributes: + - + name: ifindex + doc: The netdev ifindex to lease the queue from. + type: u32 + checks: + min: 1 + - + name: queue + doc: The netdev queue to lease from. + type: nest + nested-attributes: queue-id + - + name: netns-id + doc: The network namespace id of the netdev. + type: s32 + checks: + min: 0 - name: dmabuf attributes: @@ -686,6 +715,7 @@ operations: - dmabuf - io-uring - xsk + - lease dump: request: attributes: @@ -797,6 +827,22 @@ operations: reply: attributes: - id + - + name: queue-create + doc: | + Create a new queue for the given netdevice. Whether this operation + is supported depends on the device and the driver. + attribute-set: queue + flags: [admin-perm] + do: + request: + attributes: + - ifindex + - type + - lease + reply: &queue-create-op + attributes: + - id kernel-family: headers: ["net/netdev_netlink.h"] diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index df4b56beb818..fcb5aaf0926f 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -825,6 +825,13 @@ definitions: entries: - name: none - name: default + - + name: netkit-pairing + type: enum + enum-name: netkit-pairing + entries: + - name: pair + - name: single - name: ovpn-mode enum-name: ovpn-mode @@ -2299,6 +2306,10 @@ attribute-sets: - name: tailroom type: u16 + - + name: pairing + type: u32 + enum: netkit-pairing - name: linkinfo-ovpn-attrs name-prefix: ifla-ovpn- diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 35704d115312..83e28b96884f 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -329,6 +329,12 @@ by setting ``request_ops_lock`` to true. Code comments and docs refer to drivers which have ops called under the instance lock as "ops locked". See also the documentation of the ``lock`` member of struct net_device. +There is also a case of taking two per-netdev locks in sequence when netdev +queues are leased, that is, the netdev-scope lock is taken for both the +virtual and the physical device. To prevent deadlocks, the virtual device's +lock must always be acquired before the physical device's (see +``netdev_nl_queue_create_doit``). + In the future, there will be an option for individual drivers to opt out of using ``rtnl_lock`` and instead perform their control operations directly under the netdev instance lock. diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 5c0e01396e06..5619209329d5 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -9,11 +9,21 @@ #include #include +#include +#include +#include +#include #include #include #include -#define DRV_NAME "netkit" +#define NETKIT_DRV_NAME "netkit" + +#define NETKIT_NUM_RX_QUEUES_MAX 1024 +#define NETKIT_NUM_TX_QUEUES_MAX 1 + +#define NETKIT_NUM_RX_QUEUES_REAL 1 +#define NETKIT_NUM_TX_QUEUES_REAL 1 struct netkit { __cacheline_group_begin(netkit_fastpath); @@ -26,6 +36,7 @@ struct netkit { __cacheline_group_begin(netkit_slowpath); enum netkit_mode mode; + enum netkit_pairing pair; bool primary; u32 headroom; __cacheline_group_end(netkit_slowpath); @@ -36,6 +47,8 @@ struct netkit_link { struct net_device *dev; }; +static struct rtnl_link_ops netkit_link_ops; + static __always_inline int netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, enum netkit_action ret) @@ -135,6 +148,10 @@ static int netkit_open(struct net_device *dev) struct netkit *nk = netkit_priv(dev); struct net_device *peer = rtnl_dereference(nk->peer); + if (nk->pair == NETKIT_DEVICE_SINGLE) { + netif_carrier_on(dev); + return 0; + } if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { @@ -194,16 +211,17 @@ static void netkit_set_headroom(struct net_device *dev, int headroom) rcu_read_lock(); peer = rcu_dereference(nk->peer); - if (unlikely(!peer)) - goto out; + if (!peer) { + nk->headroom = headroom; + dev->needed_headroom = headroom; + } else { + nk2 = netkit_priv(peer); + nk->headroom = headroom; + headroom = max(nk->headroom, nk2->headroom); - nk2 = netkit_priv(peer); - nk->headroom = headroom; - headroom = max(nk->headroom, nk2->headroom); - - peer->needed_headroom = headroom; - dev->needed_headroom = headroom; -out: + peer->needed_headroom = headroom; + dev->needed_headroom = headroom; + } rcu_read_unlock(); } @@ -219,9 +237,96 @@ static void netkit_get_stats(struct net_device *dev, stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); } +static bool netkit_xsk_supported_at_phys(const struct net_device *dev) +{ + if (!dev->netdev_ops->ndo_bpf || + !dev->netdev_ops->ndo_xdp_xmit || + !dev->netdev_ops->ndo_xsk_wakeup) + return false; + return true; +} + +static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp) +{ + struct netkit *nk = netkit_priv(dev); + struct netdev_bpf xdp_lower; + struct netdev_rx_queue *rxq; + struct net_device *phys; + bool create = false; + int ret = -EBUSY; + + switch (xdp->command) { + case XDP_SETUP_XSK_POOL: + if (nk->pair == NETKIT_DEVICE_PAIR) + return -EOPNOTSUPP; + if (xdp->xsk.queue_id >= dev->real_num_rx_queues) + return -EINVAL; + + rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id); + if (!rxq->lease) + return -EOPNOTSUPP; + + phys = rxq->lease->dev; + if (!netkit_xsk_supported_at_phys(phys)) + return -EOPNOTSUPP; + + create = xdp->xsk.pool; + memcpy(&xdp_lower, xdp, sizeof(xdp_lower)); + xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease); + break; + case XDP_SETUP_PROG: + return -EOPNOTSUPP; + default: + return -EINVAL; + } + + netdev_lock(phys); + if (create && + (phys->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) { + ret = -EOPNOTSUPP; + goto out; + } + if (!create || !dev_get_min_mp_channel_count(phys)) + ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower); +out: + netdev_unlock(phys); + return ret; +} + +static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) +{ + struct netdev_rx_queue *rxq, *rxq_lease; + struct net_device *phys; + + if (queue_id >= dev->real_num_rx_queues) + return -EINVAL; + + rxq = __netif_get_rx_queue(dev, queue_id); + rxq_lease = READ_ONCE(rxq->lease); + if (unlikely(!rxq_lease)) + return -EOPNOTSUPP; + + /* netkit_xsk already validated full xsk support, hence it's + * fine to call into ndo_xsk_wakeup right away given this + * was a prerequisite to get here in the first place. The + * phys xsk support cannot change without tearing down the + * device (which clears the lease first). + */ + phys = rxq_lease->dev; + return phys->netdev_ops->ndo_xsk_wakeup(phys, + get_netdev_rx_queue_index(rxq_lease), flags); +} + +static int netkit_init(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + return 0; +} + static void netkit_uninit(struct net_device *dev); static const struct net_device_ops netkit_netdev_ops = { + .ndo_init = netkit_init, .ndo_open = netkit_open, .ndo_stop = netkit_close, .ndo_start_xmit = netkit_xmit, @@ -232,19 +337,104 @@ static const struct net_device_ops netkit_netdev_ops = { .ndo_get_peer_dev = netkit_peer_dev, .ndo_get_stats64 = netkit_get_stats, .ndo_uninit = netkit_uninit, + .ndo_bpf = netkit_xsk, + .ndo_xsk_wakeup = netkit_xsk_wakeup, .ndo_features_check = passthru_features_check, }; static void netkit_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strscpy(info->driver, DRV_NAME, sizeof(info->driver)); + strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver)); } static const struct ethtool_ops netkit_ethtool_ops = { .get_drvinfo = netkit_get_drvinfo, }; +static int netkit_queue_create(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netkit *nk = netkit_priv(dev); + u32 rxq_count_old, rxq_count_new; + int err; + + rxq_count_old = dev->real_num_rx_queues; + rxq_count_new = rxq_count_old + 1; + + /* In paired mode, only the non-primary (peer) device can + * create leased queues since the primary is the management + * side. In single device mode, leasing is always allowed. + */ + if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary) { + NL_SET_ERR_MSG(extack, + "netkit can only lease against the peer device"); + return -EOPNOTSUPP; + } + + err = netif_set_real_num_rx_queues(dev, rxq_count_new); + if (err) { + if (rxq_count_new > dev->num_rx_queues) + NL_SET_ERR_MSG(extack, + "netkit maximum queue limit reached"); + else + NL_SET_ERR_MSG_FMT(extack, + "netkit cannot create more queues err=%d", err); + return err; + } + + return rxq_count_old; +} + +static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = { + .ndo_queue_create = netkit_queue_create, +}; + +static struct net_device *netkit_alloc(struct nlattr *tb[], + const char *ifname, + unsigned char name_assign_type, + unsigned int num_tx_queues, + unsigned int num_rx_queues) +{ + const struct rtnl_link_ops *ops = &netkit_link_ops; + struct net_device *dev; + + if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX || + num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX) + return ERR_PTR(-EOPNOTSUPP); + + dev = alloc_netdev_mqs(ops->priv_size, ifname, + name_assign_type, ops->setup, + num_tx_queues, num_rx_queues); + if (dev) { + dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL; + dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL; + } + return dev; +} + +static void netkit_queue_unlease(struct net_device *dev) +{ + struct netdev_rx_queue *rxq, *rxq_lease; + struct net_device *dev_lease; + int i; + + if (dev->real_num_rx_queues == 1) + return; + + netdev_lock(dev); + for (i = 1; i < dev->real_num_rx_queues; i++) { + rxq = __netif_get_rx_queue(dev, i); + rxq_lease = rxq->lease; + dev_lease = rxq_lease->dev; + + netdev_lock(dev_lease); + netdev_rx_queue_unlease(rxq, rxq_lease); + netdev_unlock(dev_lease); + } + netdev_unlock(dev); +} + static void netkit_setup(struct net_device *dev) { static const netdev_features_t netkit_features_hw_vlan = @@ -275,8 +465,9 @@ static void netkit_setup(struct net_device *dev) dev->priv_flags |= IFF_DISABLE_NETPOLL; dev->lltx = true; - dev->ethtool_ops = &netkit_ethtool_ops; - dev->netdev_ops = &netkit_netdev_ops; + dev->netdev_ops = &netkit_netdev_ops; + dev->ethtool_ops = &netkit_ethtool_ops; + dev->queue_mgmt_ops = &netkit_queue_mgmt_ops; dev->features |= netkit_features; dev->hw_features = netkit_features; @@ -325,8 +516,6 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } -static struct rtnl_link_ops netkit_link_ops; - static int netkit_new_link(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) @@ -335,15 +524,17 @@ static int netkit_new_link(struct net_device *dev, enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; + enum netkit_pairing pair = NETKIT_DEVICE_PAIR; enum netkit_action policy_prim = NETKIT_PASS; enum netkit_action policy_peer = NETKIT_PASS; + bool seen_peer = false, seen_scrub = false; struct nlattr **data = params->data; enum netkit_mode mode = NETKIT_L3; unsigned char ifname_assign_type; struct nlattr **tb = params->tb; u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; - struct net_device *peer; + struct net_device *peer = NULL; char ifname[IFNAMSIZ]; struct netkit *nk; int err; @@ -380,6 +571,13 @@ static int netkit_new_link(struct net_device *dev, headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); if (data[IFLA_NETKIT_TAILROOM]) tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); + if (data[IFLA_NETKIT_PAIRING]) + pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]); + + seen_scrub = data[IFLA_NETKIT_SCRUB]; + seen_peer = data[IFLA_NETKIT_PEER_INFO] || + data[IFLA_NETKIT_PEER_SCRUB] || + data[IFLA_NETKIT_PEER_POLICY]; } if (ifmp && tbp[IFLA_IFNAME]) { @@ -392,45 +590,47 @@ static int netkit_new_link(struct net_device *dev, if (mode != NETKIT_L2 && (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) return -EOPNOTSUPP; + if (pair == NETKIT_DEVICE_SINGLE && + (tb != tbp || seen_peer || seen_scrub || + policy_prim != NETKIT_PASS)) + return -EOPNOTSUPP; - peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, - &netkit_link_ops, tbp, extack); - if (IS_ERR(peer)) - return PTR_ERR(peer); + if (pair == NETKIT_DEVICE_PAIR) { + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, + &netkit_link_ops, tbp, extack); + if (IS_ERR(peer)) + return PTR_ERR(peer); - netif_inherit_tso_max(peer, dev); - if (headroom) { - peer->needed_headroom = headroom; - dev->needed_headroom = headroom; + netif_inherit_tso_max(peer, dev); + if (headroom) + peer->needed_headroom = headroom; + if (tailroom) + peer->needed_tailroom = tailroom; + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) + eth_hw_addr_random(peer); + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; + + nk = netkit_priv(peer); + nk->primary = false; + nk->policy = policy_peer; + nk->scrub = scrub_peer; + nk->mode = mode; + nk->pair = pair; + nk->headroom = headroom; + bpf_mprog_bundle_init(&nk->bundle); + + err = register_netdevice(peer); + if (err < 0) + goto err_register_peer; + netif_carrier_off(peer); + if (mode == NETKIT_L2) + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); + + err = rtnl_configure_link(peer, NULL, 0, NULL); + if (err < 0) + goto err_configure_peer; } - if (tailroom) { - peer->needed_tailroom = tailroom; - dev->needed_tailroom = tailroom; - } - - if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) - eth_hw_addr_random(peer); - if (ifmp && dev->ifindex) - peer->ifindex = ifmp->ifi_index; - - nk = netkit_priv(peer); - nk->primary = false; - nk->policy = policy_peer; - nk->scrub = scrub_peer; - nk->mode = mode; - nk->headroom = headroom; - bpf_mprog_bundle_init(&nk->bundle); - - err = register_netdevice(peer); - if (err < 0) - goto err_register_peer; - netif_carrier_off(peer); - if (mode == NETKIT_L2) - dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); - - err = rtnl_configure_link(peer, NULL, 0, NULL); - if (err < 0) - goto err_configure_peer; if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -438,15 +638,23 @@ static int netkit_new_link(struct net_device *dev, nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else strscpy(dev->name, "nk%d", IFNAMSIZ); + if (headroom) + dev->needed_headroom = headroom; + if (tailroom) + dev->needed_tailroom = tailroom; nk = netkit_priv(dev); nk->primary = true; nk->policy = policy_prim; nk->scrub = scrub_prim; nk->mode = mode; + nk->pair = pair; nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); + if (pair == NETKIT_DEVICE_SINGLE) + xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK); + err = register_netdevice(dev); if (err < 0) goto err_configure_peer; @@ -455,10 +663,12 @@ static int netkit_new_link(struct net_device *dev, dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); rcu_assign_pointer(netkit_priv(dev)->peer, peer); - rcu_assign_pointer(netkit_priv(peer)->peer, dev); + if (peer) + rcu_assign_pointer(netkit_priv(peer)->peer, dev); return 0; err_configure_peer: - unregister_netdevice(peer); + if (peer) + unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); @@ -518,6 +728,8 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi nk = netkit_priv(dev); if (!nk->primary) return ERR_PTR(-EACCES); + if (nk->pair == NETKIT_DEVICE_SINGLE) + return ERR_PTR(-EOPNOTSUPP); if (which == BPF_NETKIT_PEER) { dev = rcu_dereference_rtnl(nk->peer); if (!dev) @@ -844,6 +1056,7 @@ static void netkit_release_all(struct net_device *dev) static void netkit_uninit(struct net_device *dev) { netkit_release_all(dev); + netkit_queue_unlease(dev); } static void netkit_del_link(struct net_device *dev, struct list_head *head) @@ -856,7 +1069,15 @@ static void netkit_del_link(struct net_device *dev, struct list_head *head) if (peer) { nk = netkit_priv(peer); RCU_INIT_POINTER(nk->peer, NULL); - unregister_netdevice_queue(peer, head); + /* Guard against the peer already being in an unregister + * list (e.g. same-namespace teardown where the peer is + * in the caller's dev_kill_list). list_move_tail() on an + * already-queued device would otherwise corrupt that + * list's iteration. This situation can occur via netkit + * notifier, hence guard against this scenario. + */ + if (!unregister_netdevice_queued(peer)) + unregister_netdevice_queue(peer, head); } } @@ -879,6 +1100,7 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], { IFLA_NETKIT_PEER_INFO, "peer info" }, { IFLA_NETKIT_HEADROOM, "headroom" }, { IFLA_NETKIT_TAILROOM, "tailroom" }, + { IFLA_NETKIT_PAIRING, "pairing" }, }; if (!nk->primary) { @@ -898,9 +1120,11 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_NETKIT_POLICY]) { + err = -EOPNOTSUPP; attr = data[IFLA_NETKIT_POLICY]; policy = nla_get_u32(attr); - err = netkit_check_policy(policy, attr, extack); + if (nk->pair == NETKIT_DEVICE_PAIR) + err = netkit_check_policy(policy, attr, extack); if (err) return err; WRITE_ONCE(nk->policy, policy); @@ -921,6 +1145,50 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], return 0; } +static void netkit_check_lease_unregister(struct net_device *dev) +{ + LIST_HEAD(list_kill); + u32 q_idx; + + if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING || + !dev->dev.parent) + return; + + netdev_lock_ops(dev); + for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) { + struct net_device *tmp = dev; + struct netdev_rx_queue *rxq; + u32 tmp_q_idx = q_idx; + + rxq = __netif_get_rx_queue_lease(&tmp, &tmp_q_idx, + NETIF_PHYS_TO_VIRT); + if (rxq && tmp != dev && + tmp->netdev_ops == &netkit_netdev_ops) { + /* A single phys device can have multiple queues leased + * to one netkit device. We can only queue that netkit + * device once to the list_kill. Queues of that phys + * device can be leased with different individual netkit + * devices, hence we batch via list_kill. + */ + if (unregister_netdevice_queued(tmp)) + continue; + netkit_del_link(tmp, &list_kill); + } + } + netdev_unlock_ops(dev); + unregister_netdevice_many(&list_kill); +} + +static int netkit_notifier(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (event == NETDEV_UNREGISTER) + netkit_check_lease_unregister(dev); + return NOTIFY_DONE; +} + static size_t netkit_get_size(const struct net_device *dev) { return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ @@ -931,6 +1199,7 @@ static size_t netkit_get_size(const struct net_device *dev) nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */ 0; } @@ -951,6 +1220,8 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair)) + return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); @@ -972,13 +1243,15 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), + [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, .reject_message = "Primary attribute is read-only" }, }; static struct rtnl_link_ops netkit_link_ops = { - .kind = DRV_NAME, + .kind = NETKIT_DRV_NAME, .priv_size = sizeof(struct netkit), + .alloc = netkit_alloc, .setup = netkit_setup, .newlink = netkit_new_link, .dellink = netkit_del_link, @@ -992,26 +1265,39 @@ static struct rtnl_link_ops netkit_link_ops = { .maxtype = IFLA_NETKIT_MAX, }; -static __init int netkit_init(void) +static struct notifier_block netkit_netdev_notifier = { + .notifier_call = netkit_notifier, +}; + +static __init int netkit_mod_init(void) { + int ret; + BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || (int)NETKIT_PASS != (int)TCX_PASS || (int)NETKIT_DROP != (int)TCX_DROP || (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); - return rtnl_link_register(&netkit_link_ops); + ret = rtnl_link_register(&netkit_link_ops); + if (ret) + return ret; + ret = register_netdevice_notifier(&netkit_netdev_notifier); + if (ret) + rtnl_link_unregister(&netkit_link_ops); + return ret; } -static __exit void netkit_exit(void) +static __exit void netkit_mod_exit(void) { + unregister_netdevice_notifier(&netkit_netdev_notifier); rtnl_link_unregister(&netkit_link_ops); } -module_init(netkit_init); -module_exit(netkit_exit); +module_init(netkit_mod_init); +module_exit(netkit_mod_exit); MODULE_DESCRIPTION("BPF-programmable network device"); MODULE_AUTHOR("Daniel Borkmann "); MODULE_AUTHOR("Nikolay Aleksandrov "); MODULE_LICENSE("GPL"); -MODULE_ALIAS_RTNL_LINK(DRV_NAME); +MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e15367373f7c..47417b2d48a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2561,7 +2561,14 @@ struct net_device { * Also protects some fields in: * struct napi_struct, struct netdev_queue, struct netdev_rx_queue * - * Ordering: take after rtnl_lock. + * Ordering: + * + * - take after rtnl_lock + * + * - for the case of netdev queue leasing, the netdev-scope lock is + * taken for both the virtual and the physical device; to prevent + * deadlocks, the virtual device's lock must always be acquired + * before the physical device's (see netdev_nl_queue_create_doit) */ struct mutex lock; @@ -3413,6 +3420,8 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); +bool unregister_netdevice_queued(const struct net_device *dev); + static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 95ed28212f4e..70c9fe9e83cc 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -150,6 +150,11 @@ enum { * When NIC-wide config is changed the callback will * be invoked for all queues. * + * @ndo_queue_create: Create a new RX queue on a virtual device that will + * be paired with a physical device's queue via leasing. + * Return the new queue id on success, negative error + * on failure. + * * @supported_params: Bitmask of supported parameters, see QCFG_*. * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while @@ -178,6 +183,8 @@ struct netdev_queue_mgmt_ops { struct netlink_ext_ack *extack); struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, int idx); + int (*ndo_queue_create)(struct net_device *dev, + struct netlink_ext_ack *extack); unsigned int supported_params; }; @@ -185,7 +192,7 @@ struct netdev_queue_mgmt_ops { void netdev_queue_config(struct net_device *dev, int rxq, struct netdev_queue_config *qcfg); -bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); +bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); /** * DOC: Lockless queue stopping / waking helpers. @@ -373,6 +380,14 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq) get_desc, start_thrs); \ }) -struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); - -#endif +struct device *netdev_queue_get_dma_dev(struct net_device *dev, + unsigned int idx, + enum netdev_queue_type type); +bool netdev_can_create_queue(const struct net_device *dev, + struct netlink_ext_ack *extack); +bool netdev_can_lease_queue(const struct net_device *dev, + struct netlink_ext_ack *extack); +bool netdev_queue_busy(struct net_device *dev, unsigned int idx, + enum netdev_queue_type type, + struct netlink_ext_ack *extack); +#endif /* _LINUX_NET_QUEUES_H */ diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 08f81329fc11..7e98c679ea84 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -31,6 +31,14 @@ struct netdev_rx_queue { struct napi_struct *napi; struct netdev_queue_config qcfg; struct pp_memory_provider_params mp_params; + + /* If a queue is leased, then the lease pointer is always + * valid. From the physical device it points to the virtual + * queue, and from the virtual device it points to the + * physical queue. + */ + struct netdev_rx_queue *lease; + netdevice_tracker lease_tracker; } ____cacheline_aligned_in_smp; /* @@ -59,6 +67,23 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue) return index; } -int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); +enum netif_lease_dir { + NETIF_VIRT_TO_PHYS, + NETIF_PHYS_TO_VIRT, +}; -#endif +struct netdev_rx_queue * +__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, + enum netif_lease_dir dir); + +struct netdev_rx_queue * +netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq); +void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, + struct net_device *dev); + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); +void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src); +void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src); +#endif /* _LINUX_NETDEV_RX_QUEUE_H */ diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index ada4f968960a..255ce4cfd975 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -23,14 +23,10 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); -int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, - struct pp_memory_provider_params *p); -int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, +int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack); -void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, - struct pp_memory_provider_params *old_p); -void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void netif_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *old_p); /** diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 83a96c56b8ca..280bb1780512 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1296,6 +1296,11 @@ enum netkit_mode { NETKIT_L3, }; +enum netkit_pairing { + NETKIT_DEVICE_PAIR, + NETKIT_DEVICE_SINGLE, +}; + /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to * the BPF program if attached. This also means the latter can * consume the two fields if they were populated earlier. @@ -1320,6 +1325,7 @@ enum { IFLA_NETKIT_PEER_SCRUB, IFLA_NETKIT_HEADROOM, IFLA_NETKIT_TAILROOM, + IFLA_NETKIT_PAIRING, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e0b579a1df4f..7df1056a35fd 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -160,6 +160,7 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, + NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -202,6 +203,15 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_LEASE_IFINDEX = 1, + NETDEV_A_LEASE_QUEUE, + NETDEV_A_LEASE_NETNS_ID, + + __NETDEV_A_LEASE_MAX, + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) +}; + enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -228,6 +238,7 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, + NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 62d693287457..f4a7809ba0c2 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -552,8 +552,11 @@ static void io_close_queue(struct io_zcrx_ifq *ifq) } if (netdev) { - if (ifq->if_rxq != -1) - net_mp_close_rxq(netdev, ifq->if_rxq, &p); + if (ifq->if_rxq != -1) { + netdev_lock(netdev); + netif_mp_close_rxq(netdev, ifq->if_rxq, &p); + netdev_unlock(netdev); + } netdev_put(netdev, &netdev_tracker); } ifq->if_rxq = -1; @@ -826,7 +829,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, } netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); - ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); + ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq, + NETDEV_QUEUE_TYPE_RX); if (!ifq->dev) { ret = -EOPNOTSUPP; goto netdev_put_unlock; @@ -841,7 +845,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, mp_param.rx_page_size = 1U << ifq->niov_shift; mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; - ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); + ret = netif_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); if (ret) goto netdev_put_unlock; netdev_unlock(ifq->netdev); diff --git a/net/core/dev.c b/net/core/dev.c index 5a31f9d2128c..e7bc95cbd1fa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1121,6 +1121,14 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) return __netdev_put_lock_ops_compat(dev, net); } +struct net_device * +netdev_put_lock(struct net_device *dev, struct net *net, + netdevice_tracker *tracker) +{ + netdev_tracker_free(dev, tracker); + return __netdev_put_lock(dev, net); +} + struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) @@ -12342,10 +12350,8 @@ static void dev_memory_provider_uninstall(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) { struct netdev_rx_queue *rxq = &dev->_rx[i]; - struct pp_memory_provider_params *p = &rxq->mp_params; - if (p->mp_ops && p->mp_ops->uninstall) - p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + __netif_mp_uninstall_rxq(rxq, &rxq->mp_params); } } @@ -12378,6 +12384,12 @@ static void netif_close_many_and_unlock_cond(struct list_head *close_head) #endif } +bool unregister_netdevice_queued(const struct net_device *dev) +{ + ASSERT_RTNL(); + return !list_empty(&dev->unreg_list); +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { diff --git a/net/core/dev.h b/net/core/dev.h index 781619e76b3e..95edb2d4eff8 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -12,6 +12,7 @@ struct net; struct netlink_ext_ack; struct netdev_queue_config; struct cpumask; +struct pp_memory_provider_params; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ @@ -31,6 +32,8 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); +struct net_device *netdev_put_lock(struct net_device *dev, struct net *net, + netdevice_tracker *tracker); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); @@ -96,6 +99,15 @@ int netdev_queue_config_validate(struct net_device *dev, int rxq_idx, struct netdev_queue_config *qcfg, struct netlink_ext_ack *extack); +bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx); +bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx); + +void __netif_mp_uninstall_rxq(struct netdev_rx_queue *rxq, + const struct pp_memory_provider_params *p); + +void netif_rxq_cleanup_unlease(struct netdev_rx_queue *phys_rxq, + struct netdev_rx_queue *virt_rxq); + /* netdev management, shared between various uAPI entry points */ struct netdev_name_node { struct hlist_node hlist; diff --git a/net/core/devmem.c b/net/core/devmem.c index 69d79aee07ef..cde4c89bc146 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -145,7 +145,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) rxq_idx = get_netdev_rx_queue_index(rxq); - __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); + netif_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } percpu_ref_kill(&binding->ref); @@ -163,7 +163,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, u32 xa_idx; int err; - err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); + err = netif_mp_open_rxq(dev, rxq_idx, &mp_params, extack); if (err) return err; @@ -176,7 +176,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return 0; err_close_rxq: - __net_mp_close_rxq(dev, rxq_idx, &mp_params); + netif_mp_close_rxq(dev, rxq_idx, &mp_params); return err; } diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index ba673e81716f..81aecb5d3bc5 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -28,6 +28,12 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range }; /* Common nested types */ +const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = { + [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), + [NETDEV_A_LEASE_NETNS_ID] = NLA_POLICY_MIN(NLA_S32, 0), +}; + const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), @@ -107,6 +113,13 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, }; +/* NETDEV_CMD_QUEUE_CREATE - do */ +static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -205,6 +218,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_QUEUE_CREATE, + .doit = netdev_nl_queue_create_doit, + .policy = netdev_queue_create_nl_policy, + .maxattr = NETDEV_A_QUEUE_LEASE, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index cffc08517a41..d71b435d72c1 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -14,6 +14,7 @@ #include /* Common nested types */ +extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1]; extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; @@ -36,6 +37,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 470fabbeacd9..056460d01940 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -386,12 +386,63 @@ static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) return 0; } +static int +netdev_nl_queue_fill_lease(struct sk_buff *rsp, struct net_device *netdev, + u32 q_idx, u32 q_type) +{ + struct net_device *orig_netdev = netdev; + struct nlattr *nest_lease, *nest_queue; + struct netdev_rx_queue *rxq; + struct net *net, *peer_net; + + rxq = __netif_get_rx_queue_lease(&netdev, &q_idx, + NETIF_PHYS_TO_VIRT); + if (!rxq || orig_netdev == netdev) + return 0; + + nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE); + if (!nest_lease) + goto nla_put_failure; + + nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE); + if (!nest_queue) + goto nla_put_failure; + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx)) + goto nla_put_failure; + if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type)) + goto nla_put_failure; + nla_nest_end(rsp, nest_queue); + + if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX, + READ_ONCE(netdev->ifindex))) + goto nla_put_failure; + + rcu_read_lock(); + peer_net = dev_net_rcu(netdev); + net = dev_net_rcu(orig_netdev); + if (!net_eq(net, peer_net)) { + s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC); + + if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id)) + goto nla_put_failure_unlock; + } + rcu_read_unlock(); + nla_nest_end(rsp, nest_lease); + return 0; + +nla_put_failure_unlock: + rcu_read_unlock(); +nla_put_failure: + return -ENOMEM; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; - struct netdev_rx_queue *rxq; + struct net_device *orig_netdev = netdev; + struct netdev_rx_queue *rxq, *rxq_lease; struct netdev_queue *txq; void *hdr; @@ -409,17 +460,22 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, rxq = __netif_get_rx_queue(netdev, q_idx); if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; + if (netdev_nl_queue_fill_lease(rsp, netdev, q_idx, q_type)) + goto nla_put_failure; + rxq_lease = netif_get_rx_queue_lease_locked(&netdev, &q_idx); + if (rxq_lease) + rxq = rxq_lease; params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) - goto nla_put_failure; + goto nla_put_failure_lease; #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) - goto nla_put_failure; + goto nla_put_failure_lease; #endif - + netif_put_rx_queue_lease_locked(orig_netdev, netdev); break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); @@ -437,6 +493,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, return 0; +nla_put_failure_lease: + netif_put_rx_queue_lease_locked(orig_netdev, netdev); nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; @@ -918,7 +976,8 @@ netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap, for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { struct device *rxq_dma_dev; - rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); + rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx, + NETDEV_QUEUE_TYPE_RX); if (dma_dev && rxq_dma_dev != dma_dev) { NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", rxq_idx, prev_rxq_idx); @@ -1095,7 +1154,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock_netdev; } - dma_dev = netdev_queue_get_dma_dev(netdev, 0); + dma_dev = netdev_queue_get_dma_dev(netdev, 0, NETDEV_QUEUE_TYPE_TX); binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { @@ -1120,6 +1179,173 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) return err; } +int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) +{ + const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; + const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1; + int err, ifindex, ifindex_lease, queue_id, queue_id_lease; + struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; + struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)]; + struct netdev_rx_queue *rxq, *rxq_lease; + struct net_device *dev, *dev_lease; + netdevice_tracker dev_tracker; + s32 netns_lease = -1; + struct nlattr *nest; + struct sk_buff *rsp; + struct net *net; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE)) + return -EINVAL; + if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) != + NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]); + return -EINVAL; + } + + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + nest = info->attrs[NETDEV_A_QUEUE_LEASE]; + err = nla_parse_nested(ltb, lmaxtype, nest, + netdev_lease_nl_policy, info->extack); + if (err < 0) + return err; + if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) || + NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE)) + return -EINVAL; + if (ltb[NETDEV_A_LEASE_NETNS_ID]) { + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + netns_lease = nla_get_s32(ltb[NETDEV_A_LEASE_NETNS_ID]); + } + + ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]); + + nest = ltb[NETDEV_A_LEASE_QUEUE]; + err = nla_parse_nested(qtb, qmaxtype, nest, + netdev_queue_id_nl_policy, info->extack); + if (err < 0) + return err; + if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) || + NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE)) + return -EINVAL; + if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]); + return -EINVAL; + } + + queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + /* Locking order is always from the virtual to the physical device + * since this is also the same order when applications open the + * memory provider later on. + */ + dev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!dev) { + err = -ENODEV; + goto err_genlmsg_free; + } + if (!netdev_can_create_queue(dev, info->extack)) { + err = -EINVAL; + goto err_unlock_dev; + } + + net = genl_info_net(info); + if (netns_lease >= 0) { + net = get_net_ns_by_id(net, netns_lease); + if (!net) { + err = -ENONET; + goto err_unlock_dev; + } + } + + dev_lease = netdev_get_by_index(net, ifindex_lease, &dev_tracker, + GFP_KERNEL); + if (!dev_lease) { + err = -ENODEV; + goto err_put_netns; + } + if (!netdev_can_lease_queue(dev_lease, info->extack)) { + netdev_put(dev_lease, &dev_tracker); + err = -EINVAL; + goto err_put_netns; + } + + dev_lease = netdev_put_lock(dev_lease, net, &dev_tracker); + if (!dev_lease) { + err = -ENODEV; + goto err_put_netns; + } + if (queue_id_lease >= dev_lease->real_num_rx_queues) { + err = -ERANGE; + NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]); + goto err_unlock_dev_lease; + } + if (netdev_queue_busy(dev_lease, queue_id_lease, NETDEV_QUEUE_TYPE_RX, + info->extack)) { + err = -EBUSY; + goto err_unlock_dev_lease; + } + + rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease); + rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1); + + /* Leasing queues from different physical devices is currently + * not supported. Capabilities such as XDP features and DMA + * device may differ between physical devices, and computing + * a correct intersection for the virtual device is not yet + * implemented. + */ + if (rxq->lease && rxq->lease->dev != dev_lease) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Leasing queues from different devices not supported"); + goto err_unlock_dev_lease; + } + + queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev, info->extack); + if (queue_id < 0) { + err = queue_id; + goto err_unlock_dev_lease; + } + rxq = __netif_get_rx_queue(dev, queue_id); + + netdev_rx_queue_lease(rxq, rxq_lease); + + nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id); + genlmsg_end(rsp, hdr); + + netdev_unlock(dev_lease); + netdev_unlock(dev); + if (netns_lease >= 0) + put_net(net); + + return genlmsg_reply(rsp, info); + +err_unlock_dev_lease: + netdev_unlock(dev_lease); +err_put_netns: + if (netns_lease >= 0) + put_net(net); +err_unlock_dev: + netdev_unlock(dev); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c index 251f27a8307f..265161e12a9c 100644 --- a/net/core/netdev_queues.c +++ b/net/core/netdev_queues.c @@ -1,18 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include +#include -/** - * netdev_queue_get_dma_dev() - get dma device for zero-copy operations - * @dev: net_device - * @idx: queue index - * - * Get dma device for zero-copy operations to be used for this queue. - * When such device is not available or valid, the function will return NULL. - * - * Return: Device or NULL on error - */ -struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) +#include "dev.h" + +static struct device * +__netdev_queue_get_dma_dev(struct net_device *dev, unsigned int idx) { const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; struct device *dma_dev; @@ -25,3 +20,93 @@ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; } +/** + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations + * @dev: net_device + * @idx: queue index + * @type: queue type (RX or TX) + * + * Get dma device for zero-copy operations to be used for this queue. If + * the queue is an RX queue leased from a physical queue, we retrieve the + * physical queue's dma device. When the dma device is not available or + * valid, the function will return NULL. + * + * Return: Device or NULL on error + */ +struct device *netdev_queue_get_dma_dev(struct net_device *dev, + unsigned int idx, + enum netdev_queue_type type) +{ + struct net_device *orig_dev = dev; + struct device *dma_dev; + + /* Only RX side supports queue leasing today. */ + if (type != NETDEV_QUEUE_TYPE_RX || !netif_rxq_is_leased(dev, idx)) + return __netdev_queue_get_dma_dev(dev, idx); + + if (!netif_get_rx_queue_lease_locked(&dev, &idx)) + return NULL; + + dma_dev = __netdev_queue_get_dma_dev(dev, idx); + netif_put_rx_queue_lease_locked(orig_dev, dev); + return dma_dev; +} + +bool netdev_can_create_queue(const struct net_device *dev, + struct netlink_ext_ack *extack) +{ + if (dev->dev.parent) { + NL_SET_ERR_MSG(extack, "Device is not a virtual device"); + return false; + } + if (!dev->queue_mgmt_ops || + !dev->queue_mgmt_ops->ndo_queue_create) { + NL_SET_ERR_MSG(extack, "Device does not support queue creation"); + return false; + } + if (dev->real_num_rx_queues < 1 || + dev->real_num_tx_queues < 1) { + NL_SET_ERR_MSG(extack, "Device must have at least one real queue"); + return false; + } + return true; +} + +bool netdev_can_lease_queue(const struct net_device *dev, + struct netlink_ext_ack *extack) +{ + if (!dev->dev.parent) { + NL_SET_ERR_MSG(extack, "Lease device is a virtual device"); + return false; + } + if (!netif_device_present(dev)) { + NL_SET_ERR_MSG(extack, "Lease device has been removed from the system"); + return false; + } + if (!dev->queue_mgmt_ops) { + NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations"); + return false; + } + return true; +} + +bool netdev_queue_busy(struct net_device *dev, unsigned int idx, + enum netdev_queue_type type, + struct netlink_ext_ack *extack) +{ + if (xsk_get_pool_from_qid(dev, idx)) { + NL_SET_ERR_MSG(extack, "Device queue in use by AF_XDP"); + return true; + } + if (type == NETDEV_QUEUE_TYPE_TX) + return false; + if (netif_rxq_is_leased(dev, idx)) { + NL_SET_ERR_MSG(extack, "Device queue in use due to queue leasing"); + return true; + } + if (netif_rxq_has_mp(dev, idx)) { + NL_SET_ERR_MSG(extack, "Device queue in use by memory provider"); + return true; + } + return false; +} diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 05fd2875d725..469319451ba2 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -10,15 +10,109 @@ #include "dev.h" #include "page_pool_priv.h" -/* See also page_pool_is_unreadable() */ -bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src) { - struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + netdev_assert_locked(rxq_src->dev); + netdev_assert_locked(rxq_dst->dev); - return !!rxq->mp_params.mp_ops; + netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL); + + WRITE_ONCE(rxq_src->lease, rxq_dst); + WRITE_ONCE(rxq_dst->lease, rxq_src); +} + +void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src) +{ + netdev_assert_locked(rxq_dst->dev); + netdev_assert_locked(rxq_src->dev); + + netif_rxq_cleanup_unlease(rxq_src, rxq_dst); + + WRITE_ONCE(rxq_src->lease, NULL); + WRITE_ONCE(rxq_dst->lease, NULL); + + netdev_put(rxq_src->dev, &rxq_src->lease_tracker); +} + +bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) +{ + if (rxq_idx < dev->real_num_rx_queues) + return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease); + return false; +} + +/* Virtual devices eligible for leasing have no dev->dev.parent, while + * physical devices always have one. Use this to enforce the correct + * lease traversal direction. + */ +static bool netif_lease_dir_ok(const struct net_device *dev, + enum netif_lease_dir dir) +{ + if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent) + return true; + if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent) + return true; + return false; +} + +struct netdev_rx_queue * +__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, + enum netif_lease_dir dir) +{ + struct net_device *orig_dev = *dev; + struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx); + + if (rxq->lease) { + if (!netif_lease_dir_ok(orig_dev, dir)) + return NULL; + rxq = rxq->lease; + *rxq_idx = get_netdev_rx_queue_index(rxq); + *dev = rxq->dev; + } + return rxq; +} + +struct netdev_rx_queue * +netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx) +{ + struct net_device *orig_dev = *dev; + struct netdev_rx_queue *rxq; + + /* Locking order is always from the virtual to the physical device + * see netdev_nl_queue_create_doit(). + */ + netdev_ops_assert_locked(orig_dev); + rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS); + if (rxq && orig_dev != *dev) + netdev_lock(*dev); + return rxq; +} + +void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, + struct net_device *dev) +{ + if (orig_dev != dev) + netdev_unlock(dev); +} + +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) +{ + if (rxq_idx < dev->real_num_rx_queues) + return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops; + return false; } EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); +bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx) +{ + if (rxq_idx < dev->real_num_rx_queues) + return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv; + return false; +} + static int netdev_rx_queue_reconfig(struct net_device *dev, unsigned int rxq_idx, struct netdev_queue_config *qcfg_old, @@ -108,9 +202,9 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) } EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); -int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, - const struct pp_memory_provider_params *p, - struct netlink_ext_ack *extack) +static int __netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack) { const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; struct netdev_queue_config qcfg[2]; @@ -120,12 +214,6 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, if (!qops) return -EOPNOTSUPP; - if (rxq_idx >= dev->real_num_rx_queues) { - NL_SET_ERR_MSG(extack, "rx queue index out of range"); - return -ERANGE; - } - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); - if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); return -EINVAL; @@ -172,27 +260,48 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, return ret; } -int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, - struct pp_memory_provider_params *p) +int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack) { + struct net_device *orig_dev = dev; int ret; - netdev_lock(dev); - ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL); - netdev_unlock(dev); + if (!netdev_need_ops_lock(dev)) + return -EOPNOTSUPP; + + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + + if (!netif_rxq_is_leased(dev, rxq_idx)) + return __netif_mp_open_rxq(dev, rxq_idx, p, extack); + + if (!netif_get_rx_queue_lease_locked(&dev, &rxq_idx)) { + NL_SET_ERR_MSG(extack, "rx queue leased to a virtual netdev"); + return -EBUSY; + } + if (!dev->dev.parent) { + NL_SET_ERR_MSG(extack, "rx queue belongs to a virtual netdev"); + ret = -EOPNOTSUPP; + goto out; + } + + ret = __netif_mp_open_rxq(dev, rxq_idx, p, extack); +out: + netif_put_rx_queue_lease_locked(orig_dev, dev); return ret; } -void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, - const struct pp_memory_provider_params *old_p) +static void __netif_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, + const struct pp_memory_provider_params *old_p) { struct netdev_queue_config qcfg[2]; struct netdev_rx_queue *rxq; int err; - if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) - return; - rxq = __netif_get_rx_queue(dev, ifq_idx); /* Callers holding a netdev ref may get here after we already @@ -214,10 +323,47 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, WARN_ON(err && err != -ENETDOWN); } -void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, - struct pp_memory_provider_params *old_p) +void netif_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, + const struct pp_memory_provider_params *old_p) { - netdev_lock(dev); - __net_mp_close_rxq(dev, ifq_idx, old_p); - netdev_unlock(dev); + struct net_device *orig_dev = dev; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + if (!netif_rxq_is_leased(dev, ifq_idx)) + return __netif_mp_close_rxq(dev, ifq_idx, old_p); + + if (WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &ifq_idx))) + return; + + __netif_mp_close_rxq(dev, ifq_idx, old_p); + netif_put_rx_queue_lease_locked(orig_dev, dev); +} + +void __netif_mp_uninstall_rxq(struct netdev_rx_queue *rxq, + const struct pp_memory_provider_params *p) +{ + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(p->mp_priv, rxq); +} + +/* Clean up memory provider state when a queue lease is torn down. If + * a memory provider was installed on the physical queue via the lease, + * close it now. The memory provider is a property of the queue itself, + * and it was _guaranteed_ to be installed on the physical queue via + * the lease redirection. The extra __netif_mp_close_rxq is needed + * since the physical queue can outlive the virtual queue in the lease + * case, so it needs to be reconfigured to clear the memory provider. + */ +void netif_rxq_cleanup_unlease(struct netdev_rx_queue *phys_rxq, + struct netdev_rx_queue *virt_rxq) +{ + struct pp_memory_provider_params *p = &phys_rxq->mp_params; + unsigned int ifq_idx = get_netdev_rx_queue_index(phys_rxq); + + if (!p->mp_ops) + return; + + __netif_mp_uninstall_rxq(virt_rxq, p); + __netif_mp_close_rxq(phys_rxq->dev, ifq_idx, p); } diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 45232cf1c144..64ef8cff2005 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include "common.h" #include "netlink.h" @@ -109,7 +109,7 @@ ethnl_set_channels_validate(struct ethnl_req_info *req_info, static int ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) { - unsigned int from_channel, old_total, i; + unsigned int old_combined, old_rx, old_tx, i; bool mod = false, mod_combined = false; struct net_device *dev = req_info->dev; struct ethtool_channels channels = {}; @@ -118,8 +118,9 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) int ret; dev->ethtool_ops->get_channels(dev, &channels); - old_total = channels.combined_count + - max(channels.rx_count, channels.tx_count); + old_combined = channels.combined_count; + old_rx = channels.rx_count; + old_tx = channels.tx_count; ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT], &mod); @@ -169,14 +170,19 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) if (ret) return ret; - /* Disabling channels, query zero-copy AF_XDP sockets */ - from_channel = channels.combined_count + - min(channels.rx_count, channels.tx_count); - for (i = from_channel; i < old_total; i++) - if (xsk_get_pool_from_qid(dev, i)) { - GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); + /* ensure channels are not busy at the moment */ + for (i = channels.combined_count + channels.rx_count; + i < old_combined + old_rx; i++) { + if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_RX, + info->extack)) return -EINVAL; - } + } + for (i = channels.combined_count + channels.tx_count; + i < old_combined + old_tx; i++) { + if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_TX, + info->extack)) + return -EINVAL; + } ret = dev->ethtool_ops->set_channels(dev, &channels); return ret < 0 ? ret : 1; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 3c713a91ad0d..bd97f9b9bf18 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -27,12 +27,12 @@ #include #include #include +#include #include #include -#include #include #include -#include +#include #include "common.h" @@ -2250,7 +2250,6 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; - u16 from_channel, to_channel; unsigned int i; int ret; @@ -2284,13 +2283,17 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (ret) return ret; - /* Disabling channels, query zero-copy AF_XDP sockets */ - from_channel = channels.combined_count + - min(channels.rx_count, channels.tx_count); - to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); - for (i = from_channel; i < to_channel; i++) - if (xsk_get_pool_from_qid(dev, i)) + /* Disabling channels, query busy queues (AF_XDP, queue leasing) */ + for (i = channels.combined_count + channels.rx_count; + i < curr.combined_count + curr.rx_count; i++) { + if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_RX, NULL)) return -EINVAL; + } + for (i = channels.combined_count + channels.tx_count; + i < curr.combined_count + curr.tx_count; i++) { + if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_TX, NULL)) + return -EINVAL; + } ret = dev->ethtool_ops->set_channels(dev, &channels); if (!ret) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c8ef9e427c9c..60be6561f486 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,6 +23,8 @@ #include #include #include + +#include #include #include #include @@ -117,10 +119,18 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - if (queue_id < dev->num_rx_queues) - dev->_rx[queue_id].pool = NULL; - if (queue_id < dev->num_tx_queues) - dev->_tx[queue_id].pool = NULL; + struct net_device *orig_dev = dev; + unsigned int id = queue_id; + + if (id < dev->real_num_rx_queues) + WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id)); + + if (id < dev->num_rx_queues) + dev->_rx[id].pool = NULL; + if (id < dev->num_tx_queues) + dev->_tx[id].pool = NULL; + + netif_put_rx_queue_lease_locked(orig_dev, dev); } /* The buffer pool is stored both in the _rx struct and the _tx struct as we do @@ -130,17 +140,30 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { - if (queue_id >= max_t(unsigned int, - dev->real_num_rx_queues, - dev->real_num_tx_queues)) + struct net_device *orig_dev = dev; + unsigned int id = queue_id; + int ret = 0; + + if (id >= max(dev->real_num_rx_queues, + dev->real_num_tx_queues)) return -EINVAL; - if (queue_id < dev->real_num_rx_queues) - dev->_rx[queue_id].pool = pool; - if (queue_id < dev->real_num_tx_queues) - dev->_tx[queue_id].pool = pool; + if (id < dev->real_num_rx_queues) { + if (!netif_get_rx_queue_lease_locked(&dev, &id)) + return -EBUSY; + if (xsk_get_pool_from_qid(dev, id)) { + ret = -EBUSY; + goto out; + } + } - return 0; + if (id < dev->real_num_rx_queues) + dev->_rx[id].pool = pool; + if (id < dev->real_num_tx_queues) + dev->_tx[id].pool = pool; +out: + netif_put_rx_queue_lease_locked(orig_dev, dev); + return ret; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, @@ -330,12 +353,36 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } +static bool xsk_dev_queue_valid(const struct xdp_sock *xs, + const struct xdp_rxq_info *info) +{ + struct net_device *dev = xs->dev; + u32 queue_index = xs->queue_id; + struct netdev_rx_queue *rxq; + + if (info->dev == dev && + info->queue_index == queue_index) + return true; + + if (queue_index < dev->real_num_rx_queues) { + rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); + if (!rxq) + return false; + + dev = rxq->dev; + queue_index = get_netdev_rx_queue_index(rxq); + + return info->dev == dev && + info->queue_index == queue_index; + } + return false; +} + static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + if (!xsk_dev_queue_valid(xs, xdp->rxq)) return -EINVAL; if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e0b579a1df4f..7df1056a35fd 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -160,6 +160,7 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, + NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -202,6 +203,15 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_LEASE_IFINDEX = 1, + NETDEV_A_LEASE_QUEUE, + NETDEV_A_LEASE_NETNS_ID, + + __NETDEV_A_LEASE_MAX, + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) +}; + enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -228,6 +238,7 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, + NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index deeca3f8d080..28d245e11bc4 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -35,6 +35,7 @@ TEST_PROGS = \ loopback.sh \ nic_timestamp.py \ nk_netns.py \ + nk_qlease.py \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index df4da5078c48..84a4dab6c649 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -20,7 +20,7 @@ try: # Import one by one to avoid pylint false positives from net.lib.py import NetNS, NetNSEnter, NetdevSimDev from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \ - NlError, RtnlFamily, DevlinkFamily, PSPFamily + NlError, RtnlFamily, DevlinkFamily, PSPFamily, Netlink from net.lib.py import CmdExitFailure from net.lib.py import bkg, cmd, bpftool, bpftrace, defer, ethtool, \ fd_read_timeout, ip, rand_port, rand_ports, wait_port_listen, \ @@ -36,7 +36,7 @@ try: __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", "EthtoolFamily", "NetdevFamily", "NetshaperFamily", - "NlError", "RtnlFamily", "DevlinkFamily", "PSPFamily", + "NlError", "RtnlFamily", "DevlinkFamily", "PSPFamily", "Netlink", "CmdExitFailure", "bkg", "cmd", "bpftool", "bpftrace", "defer", "ethtool", "fd_read_timeout", "ip", "rand_port", "rand_ports", diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py new file mode 100755 index 000000000000..2bc5ffe96c7d --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_qlease.py @@ -0,0 +1,1407 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import errno +import re +import time +import threading +from os import path +from lib.py import ( + ksft_run, + ksft_exit, + ksft_eq, + ksft_ne, + ksft_in, + ksft_not_in, + ksft_raises, +) +from lib.py import ( + NetDrvContEnv, + NetNS, + NetNSEnter, + EthtoolFamily, + NetdevFamily, + RtnlFamily, + NetdevSimDev, +) +from lib.py import ( + NlError, + Netlink, + bkg, + cmd, + defer, + ethtool, + ip, + rand_port, + wait_port_listen, +) +from lib.py import KsftSkipEx, CmdExitFailure + + +def set_flow_rule(cfg): + output = ethtool( + f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}" + ).stdout + values = re.search(r"ID (\d+)", output).group(1) + return int(values) + + +def create_netkit(rxqueues): + all_links = ip("-d link show", json=True) + old_idxs = { + link["ifindex"] + for link in all_links + if link.get("linkinfo", {}).get("info_kind") == "netkit" + } + + rtnl = RtnlFamily() + rtnl.newlink( + { + "linkinfo": { + "kind": "netkit", + "data": { + "mode": "l2", + "policy": "forward", + "peer-policy": "forward", + }, + }, + "num-rx-queues": rxqueues, + }, + flags=[Netlink.NLM_F_CREATE, Netlink.NLM_F_EXCL], + ) + + all_links = ip("-d link show", json=True) + nk_links = [ + link + for link in all_links + if link.get("linkinfo", {}).get("info_kind") == "netkit" + and link["ifindex"] not in old_idxs + ] + nk_links.sort(key=lambda x: x["ifindex"]) + return ( + nk_links[1]["ifname"], + nk_links[1]["ifindex"], + nk_links[0]["ifname"], + nk_links[0]["ifindex"], + ) + + +def create_netkit_single(rxqueues): + rtnl = RtnlFamily() + rtnl.newlink( + { + "linkinfo": { + "kind": "netkit", + "data": { + "mode": "l2", + "pairing": "single", + }, + }, + "num-rx-queues": rxqueues, + }, + flags=[Netlink.NLM_F_CREATE, Netlink.NLM_F_EXCL], + ) + + all_links = ip("-d link show", json=True) + nk_links = [ + link + for link in all_links + if link.get("linkinfo", {}).get("info_kind") == "netkit" + and "UP" not in link.get("flags", []) + ] + return nk_links[0]["ifname"], nk_links[0]["ifindex"] + + +def test_remove_phys(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + src_queue = 1 + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + nk_queue_id = result["id"] + + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["ifindex"], nk_guest_idx) + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) + + nsimdev.remove() + time.sleep(0.1) + ret = cmd(f"ip link show dev {nk_host}", fail=False) + ksft_ne(ret.ret, 0) + + +def test_double_lease(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=3) + defer(cmd, f"ip link del dev {nk_host}") + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + src_queue = 1 + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(result["id"], 1) + + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EBUSY) + + +def test_virtual_lessor(netns) -> None: + nk_host_a, _, nk_guest_a, nk_guest_a_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host_a}") + ip(f"link set dev {nk_host_a} up") + ip(f"link set dev {nk_guest_a} up") + + nk_host_b, _, nk_guest_b, nk_guest_b_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host_b}") + + ip(f"link set dev {nk_guest_b} netns {netns.name}") + ip(f"link set dev {nk_host_b} up") + ip(f"link set dev {nk_guest_b} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_b_idx, + "type": "rx", + "lease": { + "ifindex": nk_guest_a_idx, + "queue": {"id": 0, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + +def test_phys_lessee(_netns) -> None: + nsimdev_a = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev_a.remove) + nsim_a = nsimdev_a.nsims[0] + ip(f"link set dev {nsim_a.ifname} up") + + nsimdev_b = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev_b.remove) + nsim_b = nsimdev_b.nsims[0] + ip(f"link set dev {nsim_b.ifname} up") + + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nsim_a.ifindex, + "type": "rx", + "lease": { + "ifindex": nsim_b.ifindex, + "queue": {"id": 0, "type": "rx"}, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + +def test_different_lessors(netns) -> None: + nsimdev_a = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev_a.remove) + nsim_a = nsimdev_a.nsims[0] + ip(f"link set dev {nsim_a.ifname} up") + + nsimdev_b = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev_b.remove) + nsim_b = nsimdev_b.nsims[0] + ip(f"link set dev {nsim_b.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=3) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim_a.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim_b.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EOPNOTSUPP) + + +def test_queue_out_of_range(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 2, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.ERANGE) + + +def test_resize_leased(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + + ethnl = EthtoolFamily() + with ksft_raises(NlError) as e: + ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1}) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + +def test_self_lease(_netns) -> None: + nk_host, _, _, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nk_guest_idx, + "queue": {"id": 0, "type": "rx"}, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + +def test_veth_queue_create(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + ip("link add veth0 type veth peer name veth1") + defer(cmd, "ip link del dev veth0", fail=False) + + all_links = ip("-d link show", json=True) + veth_peer = [ + link + for link in all_links + if link.get("ifname") == "veth1" + ] + veth_peer_idx = veth_peer[0]["ifindex"] + + ip(f"link set dev veth1 netns {netns.name}") + ip("link set dev veth0 up") + ip("link set dev veth1 up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": veth_peer_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + +def test_create_tx_type(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "tx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + +def test_create_primary(_netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, nk_host_idx, _, _ = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_host} up") + + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_host_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EOPNOTSUPP) + + +def test_create_limit(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=1) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + +def test_link_flap_phys(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}") + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + src_queue = 1 + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + nk_queue_id = result["id"] + + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) + + # Link flap the physical device + ip(f"link set dev {nsim.ifname} down") + ip(f"link set dev {nsim.ifname} up") + + # Verify lease survives the flap + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) + + +def test_queue_get_virtual(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}") + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + src_queue = 1 + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + nk_queue_id = result["id"] + + # queue-get on virtual device's leased queue should not show lease + # info (lease info is only shown from the physical device's side) + queue_info = netdevnl.queue_get( + {"ifindex": nk_guest_idx, "id": nk_queue_id, "type": "rx"} + ) + ksft_eq(queue_info["id"], nk_queue_id) + ksft_eq(queue_info["ifindex"], nk_guest_idx) + ksft_not_in("lease", queue_info) + + # Default queue (not leased) also has no lease info + queue_info = netdevnl.queue_get( + {"ifindex": nk_guest_idx, "id": 0, "type": "rx"} + ) + ksft_not_in("lease", queue_info) + + +def test_remove_virt_first(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + src_queue = 1 + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(result["id"], 1) + + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["queue"]["id"], result["id"]) + + # Delete netkit (virtual device removed first, physical stays) + cmd(f"ip link del dev {nk_host}") + + # Verify lease is cleaned up on physical device + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_not_in("lease", queue_info) + + +def test_multiple_leases(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=3) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=4) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + r1 = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + r2 = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 2, "type": "rx"}, + "netns-id": 0, + }, + } + ) + + ksft_eq(r1["id"], 1) + ksft_eq(r2["id"], 2) + + # Verify both leases visible on physical device + netdevnl = NetdevFamily() + q1 = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": 1, "type": "rx"} + ) + q2 = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": 2, "type": "rx"} + ) + ksft_in("lease", q1) + ksft_in("lease", q2) + ksft_eq(q1["lease"]["ifindex"], nk_guest_idx) + ksft_eq(q2["lease"]["ifindex"], nk_guest_idx) + ksft_eq(q1["lease"]["queue"]["id"], r1["id"]) + ksft_eq(q2["lease"]["queue"]["id"], r2["id"]) + + +def test_lease_queue_tx_type(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "tx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + +def test_invalid_netns(netns) -> None: + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": 1, + "queue": {"id": 0, "type": "rx"}, + "netns-id": 999, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.ENONET) + + +def test_invalid_phys_ifindex(netns) -> None: + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + with ksft_raises(NlError) as e: + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": 99999, + "queue": {"id": 0, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(e.exception.nl_msg.error, -errno.ENODEV) + + +def test_multi_netkit_remove_phys(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=3) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + # Create two netkit pairs, each leasing a different physical queue + nk_host_a, _, nk_guest_a, nk_guest_a_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host_a}", fail=False) + + nk_host_b, _, nk_guest_b, nk_guest_b_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host_b}", fail=False) + + ip(f"link set dev {nk_guest_a} netns {netns.name}") + ip(f"link set dev {nk_host_a} up") + ip(f"link set dev {nk_guest_a} up", ns=netns) + + ip(f"link set dev {nk_guest_b} netns {netns.name}") + ip(f"link set dev {nk_host_b} up") + ip(f"link set dev {nk_guest_b} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + netdevnl.queue_create( + { + "ifindex": nk_guest_a_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + netdevnl.queue_create( + { + "ifindex": nk_guest_b_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 2, "type": "rx"}, + "netns-id": 0, + }, + } + ) + + # Removing the physical device should take down both netkit pairs + nsimdev.remove() + time.sleep(0.1) + ret = cmd(f"ip link show dev {nk_host_a}", fail=False) + ksft_ne(ret.ret, 0) + ret = cmd(f"ip link show dev {nk_host_b}", fail=False) + ksft_ne(ret.ret, 0) + + +def test_single_remove_phys(_netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_name, nk_idx = create_netkit_single(rxqueues=2) + defer(cmd, f"ip link del dev {nk_name}", fail=False) + + ip(f"link set dev {nk_name} up") + + netdevnl = NetdevFamily() + netdevnl.queue_create( + { + "ifindex": nk_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + }, + } + ) + + # Removing the physical device should take down the single netkit device + nsimdev.remove() + time.sleep(0.1) + ret = cmd(f"ip link show dev {nk_name}", fail=False) + ksft_ne(ret.ret, 0) + + +def test_link_flap_virt(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}") + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + src_queue = 1 + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + nk_queue_id = result["id"] + + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) + + # Link flap the virtual (netkit) device + ip(f"link set dev {nk_guest} down", ns=netns) + ip(f"link set dev {nk_guest} up", ns=netns) + + # Verify lease survives the virtual device flap + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) + + +def test_phys_queue_no_lease(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}") + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + + # Physical queue 0 (not leased) should have no lease info + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": 0, "type": "rx"} + ) + ksft_not_in("lease", queue_info) + + # Physical queue 1 (leased) should have lease info + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": 1, "type": "rx"} + ) + ksft_in("lease", queue_info) + + +def test_same_ns_lease(_netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_name, nk_idx = create_netkit_single(rxqueues=2) + defer(cmd, f"ip link del dev {nk_name}", fail=False) + + ip(f"link set dev {nk_name} up") + + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + }, + } + ) + ksft_eq(result["id"], 1) + + # Same namespace: lease info should NOT have netns-id + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": 1, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["ifindex"], nk_idx) + ksft_eq(queue_info["lease"]["queue"]["id"], result["id"]) + ksft_not_in("netns-id", queue_info["lease"]) + + +def test_resize_after_unlease(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 1, "type": "rx"}, + "netns-id": 0, + }, + } + ) + + # Resize should fail while lease is active + ethnl = EthtoolFamily() + with ksft_raises(NlError) as e: + ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1}) + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + + # Delete netkit, clearing the lease + cmd(f"ip link del dev {nk_host}") + + # Resize should now succeed + ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1}) + + +def test_lease_queue_zero(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": 0, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(result["id"], 1) + + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": 0, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["queue"]["id"], result["id"]) + + +def test_release_and_reuse(netns) -> None: + nsimdev = NetdevSimDev(port_count=1, queue_count=2) + defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + ip(f"link set dev {nsim.ifname} up") + + src_queue = 1 + + # First lease + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_in("lease", queue_info) + + # Delete netkit, freeing the lease + cmd(f"ip link del dev {nk_host}") + + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_not_in("lease", queue_info) + + # Re-create netkit and lease the same physical queue again + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) + defer(cmd, f"ip link del dev {nk_host}", fail=False) + + ip(f"link set dev {nk_guest} netns {netns.name}") + ip(f"link set dev {nk_host} up") + ip(f"link set dev {nk_guest} up", ns=netns) + + with NetNSEnter(str(netns)): + netdevnl = NetdevFamily() + result = netdevnl.queue_create( + { + "ifindex": nk_guest_idx, + "type": "rx", + "lease": { + "ifindex": nsim.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + ksft_eq(result["id"], 1) + + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} + ) + ksft_in("lease", queue_info) + ksft_eq(queue_info["lease"]["queue"]["id"], result["id"]) + + +def test_iou_zcrx(cfg) -> None: + cfg.require_ipver("6") + ethnl = EthtoolFamily() + + rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) + rx_rings = rings["rx"] + hds_thresh = rings.get("hds-thresh", 0) + + ethnl.rings_set( + { + "header": {"dev-index": cfg.ifindex}, + "tcp-data-split": "enabled", + "hds-thresh": 0, + "rx": 64, + } + ) + defer( + ethnl.rings_set, + { + "header": {"dev-index": cfg.ifindex}, + "tcp-data-split": "unknown", + "hds-thresh": hds_thresh, + "rx": rx_rings, + }, + ) + + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + defer(ethtool, f"-X {cfg.ifname} default") + + flow_rule_id = set_flow_rule(cfg) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) + cmd(tx_cmd, host=cfg.remote) + + +def test_attrs(cfg) -> None: + cfg.require_ipver("6") + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + ) + + ksft_eq(queue_info["id"], cfg.src_queue) + ksft_eq(queue_info["type"], "rx") + ksft_eq(queue_info["ifindex"], cfg.ifindex) + + ksft_in("lease", queue_info) + lease = queue_info["lease"] + ksft_eq(lease["ifindex"], cfg.nk_guest_ifindex) + ksft_eq(lease["queue"]["id"], cfg.nk_queue) + ksft_eq(lease["queue"]["type"], "rx") + ksft_in("netns-id", lease) + + +def test_attach_xdp_with_mp(cfg) -> None: + cfg.require_ipver("6") + ethnl = EthtoolFamily() + + rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) + rx_rings = rings["rx"] + hds_thresh = rings.get("hds-thresh", 0) + + ethnl.rings_set( + { + "header": {"dev-index": cfg.ifindex}, + "tcp-data-split": "enabled", + "hds-thresh": 0, + "rx": 64, + } + ) + defer( + ethnl.rings_set, + { + "header": {"dev-index": cfg.ifindex}, + "tcp-data-split": "unknown", + "hds-thresh": hds_thresh, + "rx": rx_rings, + }, + ) + + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + defer(ethtool, f"-X {cfg.ifname} default") + + netdevnl = NetdevFamily() + + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" + with bkg(rx_cmd): + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) + + time.sleep(0.1) + queue_info = netdevnl.queue_get( + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + ) + ksft_in("io-uring", queue_info) + + prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" + with ksft_raises(CmdExitFailure): + ip(f"link set dev {cfg.ifname} xdp obj {prog} sec xdp.frags") + + time.sleep(0.1) + queue_info = netdevnl.queue_get( + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + ) + ksft_not_in("io-uring", queue_info) + + +def test_destroy(cfg) -> None: + cfg.require_ipver("6") + ethnl = EthtoolFamily() + + rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) + rx_rings = rings["rx"] + hds_thresh = rings.get("hds-thresh", 0) + + ethnl.rings_set( + { + "header": {"dev-index": cfg.ifindex}, + "tcp-data-split": "enabled", + "hds-thresh": 0, + "rx": 64, + } + ) + defer( + ethnl.rings_set, + { + "header": {"dev-index": cfg.ifindex}, + "tcp-data-split": "unknown", + "hds-thresh": hds_thresh, + "rx": rx_rings, + }, + ) + + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + defer(ethtool, f"-X {cfg.ifname} default") + + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" + rx_proc = cmd(rx_cmd, background=True) + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) + + netdevnl = NetdevFamily() + queue_info = netdevnl.queue_get( + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + ) + ksft_in("io-uring", queue_info) + + # ip link del will wait for all refs to drop first, but iou-zcrx is holding + # onto a ref. Terminate iou-zcrx async via a thread after a delay. + kill_timer = threading.Timer(1, rx_proc.proc.terminate) + kill_timer.start() + + ip(f"link del dev {cfg._nk_host_ifname}") + kill_timer.join() + cfg._nk_host_ifname = None + cfg._nk_guest_ifname = None + + queue_info = netdevnl.queue_get( + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + ) + ksft_not_in("io-uring", queue_info) + + cmd(f"tc filter del dev {cfg.ifname} ingress pref {cfg._bpf_prog_pref}") + cfg._tc_attached = False + + flow_rule_id = set_flow_rule(cfg) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.src_queue}" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840" + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(cfg.port, proto="tcp") + cmd(tx_cmd, host=cfg.remote) + # Short delay since iou cleanup is async and takes a bit of time. + time.sleep(0.1) + queue_info = netdevnl.queue_get( + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + ) + ksft_not_in("io-uring", queue_info) + + +def main() -> None: + netns = NetNS() + cmd("ip netns attach init 1") + ip("netns set init 0", ns=netns) + ip("link set lo up", ns=netns) + + ksft_run( + [ + test_remove_phys, + test_double_lease, + test_virtual_lessor, + test_phys_lessee, + test_different_lessors, + test_queue_out_of_range, + test_resize_leased, + test_self_lease, + test_create_tx_type, + test_create_primary, + test_create_limit, + test_link_flap_phys, + test_queue_get_virtual, + test_remove_virt_first, + test_multiple_leases, + test_lease_queue_tx_type, + test_invalid_netns, + test_invalid_phys_ifindex, + test_multi_netkit_remove_phys, + test_single_remove_phys, + test_link_flap_virt, + test_phys_queue_no_lease, + test_same_ns_lease, + test_resize_after_unlease, + test_lease_queue_zero, + test_release_and_reuse, + test_veth_queue_create, + ], + args=(netns,), + ) + + cmd("ip netns del init", fail=False) + del netns + + with NetDrvContEnv(__file__, rxqueues=2) as cfg: + cfg.bin_local = path.abspath( + path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx" + ) + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + cfg.port = rand_port() + + ethnl = EthtoolFamily() + channels = ethnl.channels_get({"header": {"dev-index": cfg.ifindex}}) + channels = channels["combined-count"] + if channels < 2: + raise KsftSkipEx("Test requires NETIF with at least 2 combined channels") + + cfg.src_queue = channels - 1 + + with NetNSEnter(str(cfg.netns)): + netdevnl = NetdevFamily() + bind_result = netdevnl.queue_create( + { + "ifindex": cfg.nk_guest_ifindex, + "type": "rx", + "lease": { + "ifindex": cfg.ifindex, + "queue": {"id": cfg.src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + cfg.nk_queue = bind_result["id"] + + # test_destroy must be last because it destroys the netkit devices + ksft_run( + [test_iou_zcrx, test_attrs, test_attach_xdp_with_mp, test_destroy], + args=(cfg,), + ) + ksft_exit() + + +if __name__ == "__main__": + main()