From df6cb25f07794b39e7993938ee3ca6749a88f300 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 30 Oct 2024 09:00:02 +0800 Subject: [PATCH 01/16] selftests: netfilter: Add missing gitignore file Compiled binary files should be added to .gitignore 'git status' complains: Untracked files: (use "git add ..." to include in what will be committed) net/netfilter/conntrack_reverse_clash Signed-off-by: Li Zhijian Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/netfilter/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore index 0a64d6d0e29a..64c4f8d9aa6c 100644 --- a/tools/testing/selftests/net/netfilter/.gitignore +++ b/tools/testing/selftests/net/netfilter/.gitignore @@ -2,5 +2,6 @@ audit_logread connect_close conntrack_dump_flush +conntrack_reverse_clash sctp_collision nf_queue From 041bd1e4f2d82859690cd8b41c35f0f9404c3770 Mon Sep 17 00:00:00 2001 From: guanjing Date: Fri, 8 Nov 2024 16:13:58 +0800 Subject: [PATCH 02/16] selftests: netfilter: Fix missing return values in conntrack_dump_flush Fix the bug of some functions were missing return values. Fixes: eff3c558bb7e ("netfilter: ctnetlink: support filtering by zone") Signed-off-by: Guan Jing Signed-off-by: Pablo Neira Ayuso --- .../testing/selftests/net/netfilter/conntrack_dump_flush.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c index 254ff03297f0..5f827e10717d 100644 --- a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c +++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c @@ -43,6 +43,8 @@ static int build_cta_tuple_v4(struct nlmsghdr *nlh, int type, mnl_attr_nest_end(nlh, nest_proto); mnl_attr_nest_end(nlh, nest); + + return 0; } static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type, @@ -71,6 +73,8 @@ static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type, mnl_attr_nest_end(nlh, nest_proto); mnl_attr_nest_end(nlh, nest); + + return 0; } static int build_cta_proto(struct nlmsghdr *nlh) @@ -90,6 +94,8 @@ static int build_cta_proto(struct nlmsghdr *nlh) mnl_attr_nest_end(nlh, nest_proto); mnl_attr_nest_end(nlh, nest); + + return 0; } static int conntrack_data_insert(struct mnl_socket *sock, struct nlmsghdr *nlh, From 35f56c554eb1b56b77b3cf197a6b00922d49033d Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Wed, 13 Nov 2024 22:02:09 +0900 Subject: [PATCH 03/16] netfilter: ipset: add missing range check in bitmap_ip_uadt When tb[IPSET_ATTR_IP_TO] is not present but tb[IPSET_ATTR_CIDR] exists, the values of ip and ip_to are slightly swapped. Therefore, the range check for ip should be done later, but this part is missing and it seems that the vulnerability occurs. So we should add missing range checks and remove unnecessary range checks. Cc: Reported-by: syzbot+58c872f7790a4d2ac951@syzkaller.appspotmail.com Fixes: 72205fc68bd1 ("netfilter: ipset: bitmap:ip set type support") Signed-off-by: Jeongjun Park Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_bitmap_ip.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e4fa00abde6a..5988b9bb9029 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -163,11 +163,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) { + if (ip > ip_to) swap(ip, ip_to); - if (ip < map->first_ip) - return -IPSET_ERR_BITMAP_RANGE; - } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -178,7 +175,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ip_to = ip; } - if (ip_to > map->last_ip) + if (ip < map->first_ip || ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; for (; !before(ip_to, ip); ip += map->hosts) { From 378e8feea9a70d37a5dc1678b7ec27df21099fa5 Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Tue, 12 Nov 2024 15:06:08 +0100 Subject: [PATCH 04/16] net: phy: dp83869: fix status reporting for 1000base-x autonegotiation The DP83869 PHY transceiver supports converting from RGMII to 1000base-x. In this operation mode, autonegotiation can be performed, as described in IEEE802.3. The DP83869 has a set of fiber-specific registers located at offset 0xc00. When the transceiver is configured in RGMII-to-1000base-x mode, these registers are mapped onto offset 0, which should make reading the autonegotiation status transparent. However, the fiber registers at offset 0xc04 and 0xc05 follow the bit layout specified in Clause 37, and genphy_read_status() assumes a Clause 22 layout. Thus, genphy_read_status() doesn't properly read the capabilities advertised by the link partner, resulting in incorrect link parameters. Similarly, genphy_config_aneg() doesn't properly write advertised capabilities. Fix the 1000base-x autonegotiation procedure by replacing genphy_read_status() and genphy_config_aneg() with their Clause 37 equivalents. Fixes: a29de52ba2a1 ("net: dp83869: Add ability to advertise Fiber connection") Cc: stable@vger.kernel.org Signed-off-by: Romain Gantois Link: https://patch.msgid.link/20241112-dp83869-1000base-x-v3-1-36005f4ab0d9@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83869.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index 5f056d7db83e..b6b38caf9c0e 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -153,19 +153,32 @@ struct dp83869_private { int mode; }; +static int dp83869_config_aneg(struct phy_device *phydev) +{ + struct dp83869_private *dp83869 = phydev->priv; + + if (dp83869->mode != DP83869_RGMII_1000_BASE) + return genphy_config_aneg(phydev); + + return genphy_c37_config_aneg(phydev); +} + static int dp83869_read_status(struct phy_device *phydev) { struct dp83869_private *dp83869 = phydev->priv; + bool changed; int ret; + if (dp83869->mode == DP83869_RGMII_1000_BASE) + return genphy_c37_read_status(phydev, &changed); + ret = genphy_read_status(phydev); if (ret) return ret; - if (linkmode_test_bit(ETHTOOL_LINK_MODE_FIBRE_BIT, phydev->supported)) { + if (dp83869->mode == DP83869_RGMII_100_BASE) { if (phydev->link) { - if (dp83869->mode == DP83869_RGMII_100_BASE) - phydev->speed = SPEED_100; + phydev->speed = SPEED_100; } else { phydev->speed = SPEED_UNKNOWN; phydev->duplex = DUPLEX_UNKNOWN; @@ -898,6 +911,7 @@ static int dp83869_phy_reset(struct phy_device *phydev) .soft_reset = dp83869_phy_reset, \ .config_intr = dp83869_config_intr, \ .handle_interrupt = dp83869_handle_interrupt, \ + .config_aneg = dp83869_config_aneg, \ .read_status = dp83869_read_status, \ .get_tunable = dp83869_get_tunable, \ .set_tunable = dp83869_set_tunable, \ From ea301aec8bb718b02b68761d2229fc12c9fefa29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Gro=C3=9Fe?= Date: Wed, 13 Nov 2024 13:07:04 -0800 Subject: [PATCH 05/16] i40e: Fix handling changed priv flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After assembling the new private flags on a PF, the operation to determine the changed flags uses the wrong bitmaps. Instead of xor-ing orig_flags with new_flags, it uses the still unchanged pf->flags, thus changed_flags is always 0. Fix it by using the correct bitmaps. The issue was discovered while debugging why disabling source pruning stopped working with release 6.7. Although the new flags will be copied to pf->flags later on in that function, disabling source pruning requires a reset of the PF, which was skipped due to this bug. Disabling source pruning: $ sudo ethtool --set-priv-flags eno1 disable-source-pruning on $ sudo ethtool --show-priv-flags eno1 Private flags for eno1: MFP : off total-port-shutdown : off LinkPolling : off flow-director-atr : on veb-stats : off hw-atr-eviction : off link-down-on-close : off legacy-rx : off disable-source-pruning: on disable-fw-lldp : off rs-fec : off base-r-fec : off vf-vlan-pruning : off Regarding reproducing: I observed the issue with a rather complicated lab setup, where * two VLAN interfaces are created on eno1 * each with a different MAC address assigned * each moved into a separate namespace * both VLANs are bridged externally, so they form a single layer 2 network The external bridge is done via a channel emulator adding packet loss and delay and the application in the namespaces tries to send/receive traffic and measure the performance. Sender and receiver are separated by namespaces, yet the network card "sees its own traffic" send back to it. To make that work, source pruning has to be disabled. Cc: stable@vger.kernel.org Fixes: 70756d0a4727 ("i40e: Use DECLARE_BITMAP for flags and hw_features fields in i40e_pf") Signed-off-by: Peter Große Reviewed-by: Paul Menzel Reviewed-by: Przemek Kitszel Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20241113210705.1296408-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index f2506511bbff..bce5b76f1e7a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -5299,7 +5299,7 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) } flags_complete: - bitmap_xor(changed_flags, pf->flags, orig_flags, I40E_PF_FLAGS_NBITS); + bitmap_xor(changed_flags, new_flags, orig_flags, I40E_PF_FLAGS_NBITS); if (test_bit(I40E_FLAG_FW_LLDP_DIS, changed_flags)) reset_needed = I40E_PF_RESET_AND_REBUILD_FLAG; From c53bf100f68619acf6cedcf4cf5249a1ca2db0b4 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Thu, 14 Nov 2024 17:51:56 +0000 Subject: [PATCH 06/16] netdev-genl: Hold rcu_read_lock in napi_get Hold rcu_read_lock in netdev_nl_napi_get_doit, which calls napi_by_id and is required to be called under rcu_read_lock. Cc: stable@vger.kernel.org Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Signed-off-by: Joe Damato Link: https://patch.msgid.link/20241114175157.16604-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1cb954f2d39e..d2baa1af9df0 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -215,6 +215,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; rtnl_lock(); + rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { @@ -224,6 +225,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) err = -ENOENT; } + rcu_read_unlock(); rtnl_unlock(); if (err) From 0c0d0f42ffa6ac94cd79893b7ed419c15e1b45de Mon Sep 17 00:00:00 2001 From: Felix Maurer Date: Thu, 14 Nov 2024 12:30:05 +0100 Subject: [PATCH 07/16] xsk: Free skb when TX metadata options are invalid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a new skb is allocated for transmitting an xsk descriptor, i.e., for every non-multibuf descriptor or the first frag of a multibuf descriptor, but the descriptor is later found to have invalid options set for the TX metadata, the new skb is never freed. This can leak skbs until the send buffer is full which makes sending more packets impossible. Fix this by freeing the skb in the error path if we are currently dealing with the first frag, i.e., an skb allocated in this iteration of xsk_build_skb. Fixes: 48eb03dd2630 ("xsk: Add TX timestamp and TX checksum offload support") Reported-by: Michal Schmidt Signed-off-by: Felix Maurer Reviewed-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/edb9b00fb19e680dff5a3350cd7581c5927975a8.1731581697.git.fmaurer@redhat.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1140b2a120ca..b57d5d2904eb 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -675,6 +675,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, len = desc->len; if (!skb) { + first_frag = true; + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); @@ -685,12 +687,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb_put(skb, len); err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { - kfree_skb(skb); + if (unlikely(err)) goto free_err; - } - - first_frag = true; } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct page *page; @@ -758,6 +756,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, return skb; free_err: + if (first_frag && skb) + kfree_skb(skb); + if (err == -EOVERFLOW) { /* Drop the packet */ xsk_set_destructor_arg(xs->skb); From 41ffcd95015f18178ddc53fed919c2842d52fe38 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 14 Nov 2024 10:33:27 +0000 Subject: [PATCH 08/16] net: phy: fix phylib's dual eee_enabled phylib has two eee_enabled members. Some parts of the code are using phydev->eee_enabled, other parts are using phydev->eee_cfg.eee_enabled. This leads to incorrect behaviour as their state goes out of sync. ethtool --show-eee shows incorrect information, and --set-eee sometimes doesn't take effect. Fix this by only having one eee_enabled member - that in eee_cfg. Fixes: 49168d1980e2 ("net: phy: Add phy_support_eee() indicating MAC support EEE") Signed-off-by: Russell King (Oracle) Reviewed-by: Heiner Kallweit Link: https://patch.msgid.link/E1tBXAF-00341F-EQ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy-c45.c | 4 +--- drivers/net/phy/phy_device.c | 4 ++-- include/linux/phy.h | 2 -- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 5695935fdce9..ac987e5e82dc 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -942,7 +942,7 @@ EXPORT_SYMBOL_GPL(genphy_c45_read_eee_abilities); */ int genphy_c45_an_config_eee_aneg(struct phy_device *phydev) { - if (!phydev->eee_enabled) { + if (!phydev->eee_cfg.eee_enabled) { __ETHTOOL_DECLARE_LINK_MODE_MASK(adv) = {}; return genphy_c45_write_eee_adv(phydev, adv); @@ -1575,8 +1575,6 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev, linkmode_copy(phydev->advertising_eee, adv); } - phydev->eee_enabled = data->eee_enabled; - ret = genphy_c45_an_config_eee_aneg(phydev); if (ret > 0) { ret = phy_restart_aneg(phydev); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 499797646580..5dfa2aa53c90 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3595,12 +3595,12 @@ static int phy_probe(struct device *dev) /* There is no "enabled" flag. If PHY is advertising, assume it is * kind of enabled. */ - phydev->eee_enabled = !linkmode_empty(phydev->advertising_eee); + phydev->eee_cfg.eee_enabled = !linkmode_empty(phydev->advertising_eee); /* Some PHYs may advertise, by default, not support EEE modes. So, * we need to clean them. */ - if (phydev->eee_enabled) + if (phydev->eee_cfg.eee_enabled) linkmode_and(phydev->advertising_eee, phydev->supported_eee, phydev->advertising_eee); diff --git a/include/linux/phy.h b/include/linux/phy.h index a98bc91a0cde..44890cdf40a2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -601,7 +601,6 @@ struct macsec_ops; * @adv_old: Saved advertised while power saving for WoL * @supported_eee: supported PHY EEE linkmodes * @advertising_eee: Currently advertised EEE linkmodes - * @eee_enabled: Flag indicating whether the EEE feature is enabled * @enable_tx_lpi: When True, MAC should transmit LPI to PHY * @eee_cfg: User configuration of EEE * @lp_advertising: Current link partner advertised linkmodes @@ -721,7 +720,6 @@ struct phy_device { /* used for eee validation and configuration*/ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); - bool eee_enabled; /* Host supported PHY interface types. Should be ignored if empty. */ DECLARE_PHY_INTERFACE_MASK(host_interfaces); From 8ffade77b6337a8767fae9820d57d7a6413dd1a1 Mon Sep 17 00:00:00 2001 From: Ziwei Xiao Date: Wed, 13 Nov 2024 09:59:30 -0800 Subject: [PATCH 09/16] gve: Flow steering trigger reset only for timeout error When configuring flow steering rules, the driver is currently going through a reset for all errors from the device. Instead, the driver should only reset when there's a timeout error from the device. Fixes: 57718b60df9b ("gve: Add flow steering adminq commands") Cc: stable@vger.kernel.org Signed-off-by: Ziwei Xiao Signed-off-by: Jeroen de Borst Reviewed-by: Harshitha Ramamurthy Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241113175930.2585680-1-jeroendb@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_adminq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index e44e8b139633..060e0e674938 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -1248,10 +1248,10 @@ gve_adminq_configure_flow_rule(struct gve_priv *priv, sizeof(struct gve_adminq_configure_flow_rule), flow_rule_cmd); - if (err) { + if (err == -ETIME) { dev_err(&priv->pdev->dev, "Timeout to configure the flow rule, trigger reset"); gve_reset(priv, true); - } else { + } else if (!err) { priv->flow_rules_cache.rules_cache_synced = false; } From 62e9c00ea868ceb71156c517747dc69316c25bf1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Nov 2024 17:48:09 -0800 Subject: [PATCH 10/16] eth: fbnic: don't disable the PCI device twice We use pcim_enable_device(), there is no need to call pci_disable_device(). Fixes: 546dd90be979 ("eth: fbnic: Add scaffolding for Meta's NIC driver") Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241115014809.754860-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index a4809fe0fc24..268489b15616 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -319,7 +319,6 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) free_irqs: fbnic_free_irqs(fbd); free_fbd: - pci_disable_device(pdev); fbnic_devlink_free(fbd); return err; @@ -349,7 +348,6 @@ static void fbnic_remove(struct pci_dev *pdev) fbnic_fw_disable_mbx(fbd); fbnic_free_irqs(fbd); - pci_disable_device(pdev); fbnic_devlink_free(fbd); } From 2160428bcb20f2f70a72ee84aba91a1264dc4ff3 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 15 Nov 2024 15:35:08 +0800 Subject: [PATCH 11/16] net: txgbe: fix null pointer to pcs For 1000BASE-X or SGMII interface mode, the PCS also need to be selected. Only return null pointer when there is a copper NIC with external PHY. Fixes: 02b2a6f91b90 ("net: txgbe: support copper NIC with external PHY") Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/20241115073508.1130046-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c index 67b61afdde96..2e666f65aa82 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c @@ -162,7 +162,7 @@ static struct phylink_pcs *txgbe_phylink_mac_select(struct phylink_config *confi struct wx *wx = phylink_to_wx(config); struct txgbe *txgbe = wx->priv; - if (interface == PHY_INTERFACE_MODE_10GBASER) + if (wx->media_type != sp_media_copper) return &txgbe->xpcs->pcs; return NULL; From 4262bacb748fdab129dfbe1e93af75119a9c2775 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 Nov 2024 11:56:09 -0800 Subject: [PATCH 12/16] MAINTAINERS: exclude can core, drivers and DT bindings from netdev ML CAN networking and drivers are maintained by Marc, Oliver and Vincent. Marc sends us already pull requests with reviewed and validated code. Exclude the CAN patch postings from the netdev@ mailing list to lower the patch volume there. Link: https://lore.kernel.org/20241113193709.395c18b0@kernel.org Acked-by: Vincent Mailhol Acked-by: Marc Kleine-Budde Link: https://patch.msgid.link/20241115195609.981049-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index b878ddc99f94..8089a69f69e7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16068,7 +16068,9 @@ F: include/uapi/linux/if_* F: include/uapi/linux/netdev* F: tools/testing/selftests/drivers/net/ X: Documentation/devicetree/bindings/net/bluetooth/ +X: Documentation/devicetree/bindings/net/can/ X: Documentation/devicetree/bindings/net/wireless/ +X: drivers/net/can/ X: drivers/net/wireless/ NETWORKING DRIVERS (WIRELESS) @@ -16157,6 +16159,7 @@ X: include/net/mac80211.h X: include/net/wext.h X: net/9p/ X: net/bluetooth/ +X: net/can/ X: net/mac80211/ X: net/rfkill/ X: net/wireless/ From c69c5e10adb903ae2438d4f9c16eccf43d1fcbc1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 18 Nov 2024 03:15:17 -0800 Subject: [PATCH 13/16] netpoll: Use rcu_access_pointer() in __netpoll_setup The ndev->npinfo pointer in __netpoll_setup() is RCU-protected but is being accessed directly for a NULL check. While no RCU read lock is held in this context, we should still use proper RCU primitives for consistency and correctness. Replace the direct NULL check with rcu_access_pointer(), which is the appropriate primitive when only checking for NULL without dereferencing the pointer. This function provides the necessary ordering guarantees without requiring RCU read-side protection. Reviewed-by: Michal Kubiak Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20241118-netpoll_rcu-v1-1-a1888dcb4a02@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index aa49b92e9194..45fb60bc4803 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -626,7 +626,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto out; } - if (!ndev->npinfo) { + if (!rcu_access_pointer(ndev->npinfo)) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; From a57d5a72f8dec7db8a79d0016fb0a3bdecc82b56 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 18 Nov 2024 03:15:18 -0800 Subject: [PATCH 14/16] netpoll: Use rcu_access_pointer() in netpoll_poll_lock The ndev->npinfo pointer in netpoll_poll_lock() is RCU-protected but is being accessed directly for a NULL check. While no RCU read lock is held in this context, we should still use proper RCU primitives for consistency and correctness. Replace the direct NULL check with rcu_access_pointer(), which is the appropriate primitive when only checking for NULL without dereferencing the pointer. This function provides the necessary ordering guarantees without requiring RCU read-side protection. Fixes: bea3348eef27 ("[NET]: Make NAPI polling independent of struct net_device objects.") Signed-off-by: Breno Leitao Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/20241118-netpoll_rcu-v1-2-a1888dcb4a02@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index cd4e28db0cbd..959a4daacea1 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -72,7 +72,7 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi) { struct net_device *dev = napi->dev; - if (dev && dev->npinfo) { + if (dev && rcu_access_pointer(dev->npinfo)) { int owner = smp_processor_id(); while (cmpxchg(&napi->poll_owner, -1, owner) != -1) From 8ca2a1eeadf09862190b2810697702d803ceef2d Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Mon, 18 Nov 2024 11:09:09 +0800 Subject: [PATCH 15/16] bpf: fix recursive lock when verdict program return SK_PASS When the stream_verdict program returns SK_PASS, it places the received skb into its own receive queue, but a recursive lock eventually occurs, leading to an operating system deadlock. This issue has been present since v6.9. ''' sk_psock_strp_data_ready write_lock_bh(&sk->sk_callback_lock) strp_data_ready strp_read_sock read_sock -> tcp_read_sock strp_recv cb.rcv_msg -> sk_psock_strp_read # now stream_verdict return SK_PASS without peer sock assign __SK_PASS = sk_psock_map_verd(SK_PASS, NULL) sk_psock_verdict_apply sk_psock_skb_ingress_self sk_psock_skb_ingress_enqueue sk_psock_data_ready read_lock_bh(&sk->sk_callback_lock) <= dead lock ''' This topic has been discussed before, but it has not been fixed. Previous discussion: https://lore.kernel.org/all/6684a5864ec86_403d20898@john.notmuch Fixes: 6648e613226e ("bpf, skmsg: Fix NULL pointer dereference in sk_psock_skb_ingress_enqueue") Reported-by: Vincent Whitchurch Signed-off-by: Jiayuan Chen Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20241118030910.36230-2-mrpre@163.com Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index b1dcbd3be89e..e90fbab703b2 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1117,9 +1117,9 @@ static void sk_psock_strp_data_ready(struct sock *sk) if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { - write_lock_bh(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); From 0c4d5cb9a1c3583b61df199a51eccebe759e3c18 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Mon, 18 Nov 2024 11:09:10 +0800 Subject: [PATCH 16/16] selftests/bpf: Add some tests with sockmap SK_PASS Add a new tests in sockmap_basic.c to test SK_PASS for sockmap Signed-off-by: Jiayuan Chen Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20241118030910.36230-3-mrpre@163.com Signed-off-by: Jakub Kicinski --- .../selftests/bpf/prog_tests/sockmap_basic.c | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 82bfb266741c..a2041f8e32eb 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -501,6 +501,58 @@ static void test_sockmap_skb_verdict_shutdown(void) test_sockmap_pass_prog__destroy(skel); } +static void test_sockmap_stream_pass(void) +{ + int zero = 0, sent, recvd; + int verdict, parser; + int err, map; + int c = -1, p = -1; + struct test_sockmap_pass_prog *pass = NULL; + char snd[256] = "0123456789"; + char rcv[256] = "0"; + + pass = test_sockmap_pass_prog__open_and_load(); + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); + parser = bpf_program__fd(pass->progs.prog_skb_parser); + map = bpf_map__fd(pass->maps.sock_map_rx); + + err = bpf_prog_attach(parser, map, BPF_SK_SKB_STREAM_PARSER, 0); + if (!ASSERT_OK(err, "bpf_prog_attach stream parser")) + goto out; + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach stream verdict")) + goto out; + + err = create_pair(AF_INET, SOCK_STREAM, &c, &p); + if (err) + goto out; + + /* sk_data_ready of 'p' will be replaced by strparser handler */ + err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(p)")) + goto out_close; + + /* + * as 'prog_skb_parser' return the original skb len and + * 'prog_skb_verdict' return SK_PASS, the kernel will just + * pass it through to original socket 'p' + */ + sent = xsend(c, snd, sizeof(snd), 0); + ASSERT_EQ(sent, sizeof(snd), "xsend(c)"); + + recvd = recv_timeout(p, rcv, sizeof(rcv), SOCK_NONBLOCK, + IO_TIMEOUT_SEC); + ASSERT_EQ(recvd, sizeof(rcv), "recv_timeout(p)"); + +out_close: + close(c); + close(p); + +out: + test_sockmap_pass_prog__destroy(pass); +} + static void test_sockmap_skb_verdict_fionread(bool pass_prog) { int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; @@ -923,6 +975,8 @@ void test_sockmap_basic(void) test_sockmap_progs_query(BPF_SK_SKB_VERDICT); if (test__start_subtest("sockmap skb_verdict shutdown")) test_sockmap_skb_verdict_shutdown(); + if (test__start_subtest("sockmap stream parser and verdict pass")) + test_sockmap_stream_pass(); if (test__start_subtest("sockmap skb_verdict fionread")) test_sockmap_skb_verdict_fionread(true); if (test__start_subtest("sockmap skb_verdict fionread on drop"))