From cfd51fcf117157df8fb1ff0552d2fac909bdfc3d Mon Sep 17 00:00:00 2001
From: Alok Tiwari
Date: Mon, 29 Sep 2025 02:10:56 -0700
Subject: [PATCH 01/65] RDMA/cxgb4: fix typo in write_pbl() debug message

Correct the debug log format string from "pdb_addr" -> "pbl_addr" to
match the actual variable name.

Signed-off-by: Alok Tiwari
Link: https://patch.msgid.link/20250929091102.50384-1-alok.a.tiwari@oracle.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/cxgb4/mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index dcdfe250bdbe..adeed7447e7b 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -348,7 +348,7 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
 {
 	int err;

-	pr_debug("*pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+	pr_debug("*pbl_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
 		 pbl_addr, rdev->lldi.vr->pbl.start, pbl_size);

From 879424832d241001e288cc741ed8a9253387a34a Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Wed, 8 Oct 2025 18:59:13 +0200
Subject: [PATCH 02/65] RDMA/core: let rdma_connect_locked() call
 lockdep_assert_held(&id_priv->handler_mutex)

rdma_accept() also has this, so this is now more consistent and may
prevent bugs in the future.

Cc: Jason Gunthorpe
Cc: Leon Romanovsky
Cc: linux-rdma@vger.kernel.org
Signed-off-by: Stefan Metzmacher
Link: https://patch.msgid.link/20251008165913.444276-1-metze@samba.org
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/core/cma.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 5b2d3ae3f9fc..95e89f5c147c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4475,6 +4475,8 @@ int rdma_connect_locked(struct rdma_cm_id *id,
 		container_of(id, struct rdma_id_private, id);
 	int ret;

+	lockdep_assert_held(&id_priv->handler_mutex);
+
 	if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
 		return -EINVAL;

From 1511efaca032ed209c04b2af67915a613673865c Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 14 Oct 2025 13:03:43 +0100
Subject: [PATCH 03/65] RDMA/rxe: Remove redundant assignment to variable
 page_offset

The variable page_offset is assigned a value at the start of each loop
iteration and is redundantly zeroed at the end of the iteration; no code
reads the zeroed value. The assignment is redundant and can be removed.
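As an illustrative sketch only (assumed loop shape, not the rxe code
verbatim), the offset is recomputed from iova at the top of every pass,
so the trailing store of zero is dead:

	while (length > 0) {
		/* recomputed from iova on each iteration */
		unsigned int page_offset = iova & (PAGE_SIZE - 1);
		unsigned int bytes = min_t(unsigned int, length,
					   PAGE_SIZE - page_offset);

		/* ... flush 'bytes' at page + page_offset ... */
		length -= bytes;
		iova += bytes;
		/* page_offset = 0;  <-- the dead store being removed */
	}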
Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20251014120343.2528608-1-coking@nvidia.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_mr.c | 1 - drivers/infiniband/sw/rxe/rxe_odp.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index bcb97b3ea58a..b1df05238848 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -452,7 +452,6 @@ static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int leng length -= bytes; iova += bytes; - page_offset = 0; } return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index f58e3ec6252f..ae71812bea82 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -358,7 +358,6 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, length -= bytes; iova += bytes; - page_offset = 0; } mutex_unlock(&umem_odp->umem_mutex); From be180c847a6db6646d7bb4740a1d73f6f67d1030 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 19 Oct 2025 20:43:20 -0700 Subject: [PATCH 04/65] RDMA/uverbs: fix some kernel-doc warnings Fix 49 kernel-doc warnings in ib_verbs.h: - Add struct short description for rdma_stat_desc, rdma_hw_stats. - Fix kernel-doc format for struct members (use ':' instead of '-') for several structs. - Don't use "/**" kernel-doc notation for struct members in ib_device_ops (most members are not documented and most of the kernel-doc was not formatted correctly). - Spell function parameters correctly in ib_dma_map_sgtable_attrs(), ib_device_try_get(), rdma_roce_rescan_device(). - Add kernel-doc for the function parameter in rdma_flow_label_to_udp_sport(). Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251020034320.3011094-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 99 +++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6139223e92e4..0a85af610b6b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -586,10 +586,10 @@ enum ib_stat_flag { }; /** - * struct rdma_stat_desc - * @name - The name of the counter - * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL - * @priv - Driver private information; Core code should not use + * struct rdma_stat_desc - description of one rdma stat/counter + * @name: The name of the counter + * @flags: Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL + * @priv: Driver private information; Core code should not use */ struct rdma_stat_desc { const char *name; @@ -598,24 +598,24 @@ struct rdma_stat_desc { }; /** - * struct rdma_hw_stats - * @lock - Mutex to protect parallel write access to lifespan and values + * struct rdma_hw_stats - collection of hardware stats and their management + * @lock: Mutex to protect parallel write access to lifespan and values * of counters, which are 64bits and not guaranteed to be written * atomicaly on 32bits systems. - * @timestamp - Used by the core code to track when the last update was - * @lifespan - Used by the core code to determine how old the counters + * @timestamp: Used by the core code to track when the last update was + * @lifespan: Used by the core code to determine how old the counters * should be before being updated again. 
Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default be specifying * their own value during their allocation routine. - * @descs - Array of pointers to static descriptors used for the counters + * @descs: Array of pointers to static descriptors used for the counters * in directory. - * @is_disabled - A bitmap to indicate each counter is currently disabled + * @is_disabled: A bitmap to indicate each counter is currently disabled * or not. - * @num_counters - How many hardware counters there are. If name is + * @num_counters: How many hardware counters there are. If name is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) * in their code to prevent this. - * @value - Array of u64 counters that are accessed by the sysfs code and + * @value: Array of u64 counters that are accessed by the sysfs code and * filled in by the drivers get_stats routine */ struct rdma_hw_stats { @@ -2405,7 +2405,7 @@ struct ib_device_ops { int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); - /** + /* * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device @@ -2415,7 +2415,7 @@ struct ib_device_ops { struct ib_port_immutable *immutable); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u32 port_num); - /** + /* * When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold @@ -2425,7 +2425,7 @@ struct ib_device_ops { */ struct net_device *(*get_netdev)(struct ib_device *device, u32 port_num); - /** + /* * rdma netdev operation * * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params @@ -2439,14 +2439,14 @@ struct ib_device_ops { int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params); - /** + /* * query_gid should be return GID value for @device, when @port_num * link layer is either IB or iWarp. It is no-op if @port_num port * is RoCE link layer. */ int (*query_gid)(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); - /** + /* * When calling add_gid, the HW vendor's driver should add the gid * of device of port at gid index available at @attr. Meta-info of * that gid (for example, the network device related to this gid) is @@ -2460,7 +2460,7 @@ struct ib_device_ops { * roce_gid_table is used. */ int (*add_gid)(const struct ib_gid_attr *attr, void **context); - /** + /* * When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index gid_index of port port_num * available in @attr. @@ -2475,7 +2475,7 @@ struct ib_device_ops { struct ib_udata *udata); void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); - /** + /* * This will be called once refcount of an entry in mmap_xa reaches * zero. The type of the memory that was mapped may differ between * entries and is opaque to the rdma_user_mmap interface. 
@@ -2516,12 +2516,12 @@ struct ib_device_ops { int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); - /** + /* * pre_destroy_cq - Prevent a cq from generating any new work * completions, but not free any kernel resources */ int (*pre_destroy_cq)(struct ib_cq *cq); - /** + /* * post_destroy_cq - Free all kernel resources */ void (*post_destroy_cq)(struct ib_cq *cq); @@ -2615,7 +2615,7 @@ struct ib_device_ops { struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset); - /** + /* * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and * fill in the driver initialized data. The struct is kfree()'ed by * the sysfs core when the device is removed. A lifespan of -1 in the @@ -2624,7 +2624,7 @@ struct ib_device_ops { struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device); struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device, u32 port_num); - /** + /* * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated @@ -2639,14 +2639,14 @@ struct ib_device_ops { int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u32 port, int index); - /** + /* * modify_hw_stat - Modify the counter configuration * @enable: true/false when enable/disable a counter * Return codes - 0 on success or error code otherwise. */ int (*modify_hw_stat)(struct ib_device *device, u32 port, unsigned int counter_index, bool enable); - /** + /* * Allows rdma drivers to add their own restrack attributes. */ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); @@ -2682,39 +2682,39 @@ struct ib_device_ops { u8 pdata_len); int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog); int (*iw_destroy_listen)(struct iw_cm_id *cm_id); - /** + /* * counter_bind_qp - Bind a QP to a counter. * @counter - The counter to be bound. If counter->id is zero then * the driver needs to allocate a new counter and set counter->id */ int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp, u32 port); - /** + /* * counter_unbind_qp - Unbind the qp from the dynamically-allocated * counter and bind it onto the default one */ int (*counter_unbind_qp)(struct ib_qp *qp, u32 port); - /** + /* * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); - /** + /* * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in * the driver initialized data. */ struct rdma_hw_stats *(*counter_alloc_stats)( struct rdma_counter *counter); - /** + /* * counter_update_stats - Query the stats value of this counter */ int (*counter_update_stats)(struct rdma_counter *counter); - /** + /* * counter_init - Initialize the driver specific rdma counter struct. */ void (*counter_init)(struct rdma_counter *counter); - /** + /* * Allows rdma drivers to add their own restrack attributes * dumped via 'rdma stat' iproute2 command. 
*/ @@ -2730,25 +2730,25 @@ struct ib_device_ops { */ int (*get_numa_node)(struct ib_device *dev); - /** + /* * add_sub_dev - Add a sub IB device */ struct ib_device *(*add_sub_dev)(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); - /** + /* * del_sub_dev - Delete a sub IB device */ void (*del_sub_dev)(struct ib_device *sub_dev); - /** + /* * ufile_cleanup - Attempt to cleanup ubojects HW resources inside * the ufile. */ void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); - /** + /* * report_port_event - Drivers need to implement this if they have * some private stuff to handle when link status changes. */ @@ -3157,8 +3157,8 @@ static inline u32 rdma_start_port(const struct ib_device *device) /** * rdma_for_each_port - Iterate over all valid port numbers of the IB device - * @device - The struct ib_device * to iterate over - * @iter - The unsigned int to store the port number + * @device: The struct ib_device * to iterate over + * @iter: The unsigned int to store the port number */ #define rdma_for_each_port(device, iter) \ for (iter = rdma_start_port(device + \ @@ -3524,7 +3524,7 @@ static inline bool rdma_core_cap_opa_port(struct ib_device *device, /** * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value. * @device: Device - * @port_num: Port number + * @port: Port number * @mtu: enum value of MTU * * Return the MTU size supported by the port as an integer value. Will return @@ -3542,7 +3542,7 @@ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port, /** * rdma_mtu_from_attr - Return the mtu of the port from the port attribute. * @device: Device - * @port_num: Port number + * @port: Port number * @attr: port attribute * * Return the MTU size supported by the port as an integer value. @@ -3919,7 +3919,7 @@ static inline int ib_destroy_qp(struct ib_qp *qp) /** * ib_open_qp - Obtain a reference to an existing sharable QP. - * @xrcd - XRC domain + * @xrcd: XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. @@ -4273,9 +4273,9 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, /** * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses * @dev: The device for which the DMA addresses are to be created - * @sg: The sg_table object describing the buffer + * @sgt: The sg_table object describing the buffer * @direction: The direction of the DMA - * @attrs: Optional DMA attributes for the map operation + * @dma_attrs: Optional DMA attributes for the map operation */ static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, @@ -4419,8 +4419,8 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. - * @mr - struct ib_mr pointer to be updated. - * @newkey - new key to be used. + * @mr: struct ib_mr pointer to be updated. + * @newkey: new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { @@ -4431,7 +4431,7 @@ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. - * @rkey - the rkey to increment. + * @rkey: the rkey to increment. 
*/ static inline u32 ib_inc_rkey(u32 rkey) { @@ -4525,7 +4525,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, /** * ib_device_try_get: Hold a registration lock - * device: The device to lock + * @dev: The device to lock * * A device under an active registration lock cannot become unregistered. It * is only possible to obtain a registration lock on a device that is fully @@ -4832,7 +4832,7 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * - * @device: the rdma device + * @ibdev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ibdev); void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port); @@ -4885,7 +4885,7 @@ static inline struct ib_device *rdma_device_to_ibdev(struct device *device) /** * ibdev_to_node - return the NUMA node for a given ib_device - * @dev: device to get the NUMA node for. + * @ibdev: device to get the NUMA node for. */ static inline int ibdev_to_node(struct ib_device *ibdev) { @@ -4923,6 +4923,7 @@ static inline struct net *rdma_dev_net(struct ib_device *device) /** * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based * on the flow_label + * @fl: flow_label value * * This function will convert the 20 bit flow_label input to a valid RoCE v2 * UDP src port 14 bit value. All RoCE V2 drivers should use this same From 58aca1f3de059c19262550fab6dba150f9d5f989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Tue, 21 Oct 2025 15:27:33 +0200 Subject: [PATCH 05/65] RDMA/cm: Base cm_id destruction timeout on CMA values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a GSI MAD packet is sent on the QP, it will potentially be retried CMA_MAX_CM_RETRIES times with a timeout value of: 4.096usec * 2 ^ CMA_CM_RESPONSE_TIMEOUT The above equates to ~64 seconds using the default CMA values. The cm_id_priv's refcount will be incremented for this period. Therefore, the timeout value waiting for a cm_id destruction must be based on the effective timeout of MAD packets. To provide additional leeway, we add 25% to this timeout and use that instead of the constant 10 seconds timeout, which may result in false negatives. 
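With the default CMA values (CMA_CM_RESPONSE_TIMEOUT = 20 and
CMA_MAX_CM_RETRIES = 15), the arithmetic works out roughly as:

	per-MAD timeout:  4.096 us * 2^20           ~=  4.3 s
	total retry time: 15 retries * 4.3 s        ~= 64.4 s
	wait used here:   (15 * 4295 ms * 5) / 4    ~= 80.5 s  (+25% leeway)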
Signed-off-by: Håkon Bugge Link: https://patch.msgid.link/20251021132738.4179604-1-haakon.bugge@oracle.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 01bede8ba105..2a36a9345959 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -34,7 +34,6 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); -#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */ #define CM_DIRECT_RETRY_CTX ((void *) 1UL) #define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */ @@ -1057,6 +1056,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) { struct cm_id_private *cm_id_priv; enum ib_cm_state old_state; + unsigned long timeout; struct cm_work *work; int ret; @@ -1167,10 +1167,9 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id)); cm_deref_id(cm_id_priv); + timeout = msecs_to_jiffies((cm_id_priv->max_cm_retries * cm_id_priv->timeout_ms * 5) / 4); do { - ret = wait_for_completion_timeout(&cm_id_priv->comp, - msecs_to_jiffies( - CM_DESTROY_ID_WAIT_TIMEOUT)); + ret = wait_for_completion_timeout(&cm_id_priv->comp, timeout); if (!ret) /* timeout happened */ cm_destroy_id_wait_timeout(cm_id, old_state); } while (!ret); From 503a5e4690ae14c18570141bc0dcf7501a8419b0 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 27 Oct 2025 14:52:03 -0700 Subject: [PATCH 06/65] RDMA/rxe: Fix null deref on srq->rq.queue after resize failure A NULL pointer dereference can occur in rxe_srq_chk_attr() when ibv_modify_srq() is invoked twice in succession under certain error conditions. The first call may fail in rxe_queue_resize(), which leads rxe_srq_from_attr() to set srq->rq.queue = NULL. The second call then triggers a crash (null deref) when accessing srq->rq.queue->buf->index_mask. Call Trace: rxe_modify_srq+0x170/0x480 [rdma_rxe] ? __pfx_rxe_modify_srq+0x10/0x10 [rdma_rxe] ? uverbs_try_lock_object+0x4f/0xa0 [ib_uverbs] ? rdma_lookup_get_uobject+0x1f0/0x380 [ib_uverbs] ib_uverbs_modify_srq+0x204/0x290 [ib_uverbs] ? __pfx_ib_uverbs_modify_srq+0x10/0x10 [ib_uverbs] ? tryinc_node_nr_active+0xe6/0x150 ? uverbs_fill_udata+0xed/0x4f0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0x2c0/0x470 [ib_uverbs] ? __pfx_ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0x10/0x10 [ib_uverbs] ? uverbs_fill_udata+0xed/0x4f0 [ib_uverbs] ib_uverbs_run_method+0x55a/0x6e0 [ib_uverbs] ? __pfx_ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0x10/0x10 [ib_uverbs] ib_uverbs_cmd_verbs+0x54d/0x800 [ib_uverbs] ? __pfx_ib_uverbs_cmd_verbs+0x10/0x10 [ib_uverbs] ? __pfx___raw_spin_lock_irqsave+0x10/0x10 ? __pfx_do_vfs_ioctl+0x10/0x10 ? ioctl_has_perm.constprop.0.isra.0+0x2c7/0x4c0 ? __pfx_ioctl_has_perm.constprop.0.isra.0+0x10/0x10 ib_uverbs_ioctl+0x13e/0x220 [ib_uverbs] ? __pfx_ib_uverbs_ioctl+0x10/0x10 [ib_uverbs] __x64_sys_ioctl+0x138/0x1c0 do_syscall_64+0x82/0x250 ? fdget_pos+0x58/0x4c0 ? ksys_write+0xf3/0x1c0 ? __pfx_ksys_write+0x10/0x10 ? do_syscall_64+0xc8/0x250 ? __pfx_vm_mmap_pgoff+0x10/0x10 ? fget+0x173/0x230 ? fput+0x2a/0x80 ? ksys_mmap_pgoff+0x224/0x4c0 ? do_syscall_64+0xc8/0x250 ? do_user_addr_fault+0x37b/0xfe0 ? clear_bhb_loop+0x50/0xa0 ? clear_bhb_loop+0x50/0xa0 ? 
clear_bhb_loop+0x50/0xa0
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

Fixes: 8700e3e7c485 ("Soft RoCE driver")
Tested-by: Liu Yi
Signed-off-by: Zhu Yanjun
Link: https://patch.msgid.link/20251027215203.1321-1-yanjun.zhu@linux.dev
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/sw/rxe/rxe_srq.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
index 3661cb627d28..2a234f26ac10 100644
--- a/drivers/infiniband/sw/rxe/rxe_srq.c
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -171,7 +171,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
 			       udata, mi, &srq->rq.producer_lock,
 			       &srq->rq.consumer_lock);
 		if (err)
-			goto err_free;
+			return err;

 		srq->rq.max_wr = attr->max_wr;
 	}
@@ -180,11 +180,6 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
 		srq->limit = attr->srq_limit;

 	return 0;
-
-err_free:
-	rxe_queue_cleanup(q);
-	srq->rq.queue = NULL;
-	return err;
 }

 void rxe_srq_cleanup(struct rxe_pool_elem *elem)

From 69e8e429bca2b21626097229d38e71487e895cf6 Mon Sep 17 00:00:00 2001
From: Jacob Moroni
Date: Thu, 30 Oct 2025 21:17:22 -0500
Subject: [PATCH 07/65] RDMA/irdma: Enforce local fence for LOCAL_INV WRs

Enforce local fence for LOCAL_INV WRs to avoid spurious
FASTREG_VALID_MKEY async events during heavy invalidation/registration
activity.

Signed-off-by: Jacob Moroni
Signed-off-by: Tatyana Nikolova
Link: https://patch.msgid.link/20251031021726.1003-3-tatyana.e.nikolova@intel.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/irdma/verbs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 76ce6137f2ba..1f37437f72ae 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -4077,7 +4077,7 @@ static int irdma_post_send(struct ib_qp *ibqp,
 		break;
 	case IB_WR_LOCAL_INV:
 		info.op_type = IRDMA_OP_TYPE_INV_STAG;
-		info.local_fence = info.read_fence;
+		info.local_fence = true;
 		info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
 		err = irdma_uk_stag_local_invalidate(ukqp, &info, true);
 		break;

From 153243086eef132dc84b7b4e23a755fb1c41889d Mon Sep 17 00:00:00 2001
From: Jay Bhat
Date: Thu, 30 Oct 2025 21:17:21 -0500
Subject: [PATCH 08/65] RDMA/irdma: Initialize cqp_cmds_info to prevent
 resource leaks

Failure to initialize the info.create field to false in certain cases
resulted in an incorrect status code going to rdma-core when dereg_mr
failed during reset. To fix this, memset the entire cqp_request->info in
irdma_alloc_and_get_cqp_request(), so that this is not spread all over
the code.
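A hedged usage sketch of the resulting contract (error handling elided;
the request type and helper are the ones touched by this patch):

	struct irdma_cqp_request *cqp_request;

	cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true);
	if (!cqp_request)
		return -ENOMEM;

	/* cqp_request->info is now zeroed by the allocator, so fields
	 * such as info.create can no longer leak stale values from a
	 * recycled request; callers set only what they need */
	cqp_request->info.cqp_cmd = IRDMA_OP_QP_DESTROY;
	cqp_request->info.post_sq = 1;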
Signed-off-by: Bhat, Jay Reviewed-by: Jacob Moroni Signed-off-by: Krzysztof Czurylo Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251031021726.1003-2-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/hw.c | 3 --- drivers/infiniband/hw/irdma/utils.c | 6 +----- drivers/infiniband/hw/irdma/verbs.c | 4 ---- 3 files changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 7bad0e38786a..d1fc5726b979 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -2365,7 +2365,6 @@ static int irdma_cqp_manage_apbvt_cmd(struct irdma_device *iwdev, cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_apbvt_entry.info; - memset(info, 0, sizeof(*info)); info->add = add_port; info->port = accel_local_port; cqp_info->cqp_cmd = IRDMA_OP_MANAGE_APBVT_ENTRY; @@ -2474,7 +2473,6 @@ void irdma_manage_arp_cache(struct irdma_pci_f *rf, if (action == IRDMA_ARP_ADD) { cqp_info->cqp_cmd = IRDMA_OP_ADD_ARP_CACHE_ENTRY; info = &cqp_info->in.u.add_arp_cache_entry.info; - memset(info, 0, sizeof(*info)); info->arp_index = (u16)arp_index; info->permanent = true; ether_addr_copy(info->mac_addr, mac_addr); @@ -2533,7 +2531,6 @@ int irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo, cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_qhash_table_entry.info; - memset(info, 0, sizeof(*info)); info->vsi = &iwdev->vsi; info->manage = mtype; info->entry_type = etype; diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 8b94d87b0192..3b19e2b997b0 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -452,6 +452,7 @@ struct irdma_cqp_request *irdma_alloc_and_get_cqp_request(struct irdma_cqp *cqp, cqp_request->waiting = wait; refcount_set(&cqp_request->refcnt, 1); memset(&cqp_request->compl_info, 0, sizeof(cqp_request->compl_info)); + memset(&cqp_request->info, 0, sizeof(cqp_request->info)); return cqp_request; } @@ -1068,7 +1069,6 @@ int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) cqp_info = &cqp_request->info; qp_info = &cqp_request->info.in.u.qp_create.info; - memset(qp_info, 0, sizeof(*qp_info)); qp_info->cq_num_valid = true; qp_info->next_iwarp_state = IRDMA_QP_STATE_RTS; cqp_info->cqp_cmd = IRDMA_OP_QP_CREATE; @@ -1343,7 +1343,6 @@ int irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = IRDMA_OP_QP_DESTROY; cqp_info->post_sq = 1; cqp_info->in.u.qp_destroy.qp = qp; @@ -1749,7 +1748,6 @@ int irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = IRDMA_OP_STATS_GATHER; cqp_info->post_sq = 1; cqp_info->in.u.stats_gather.info = pestat->gather_info; @@ -1789,7 +1787,6 @@ int irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = cmd; cqp_info->post_sq = 1; cqp_info->in.u.stats_manage.info = *stats_info; @@ -1890,7 +1887,6 @@ int irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = cmd; cqp_info->post_sq = 1; cqp_info->in.u.ws_node.info = *node_info; diff --git 
a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 1f37437f72ae..9804deba513d 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -771,7 +771,6 @@ static int irdma_cqp_create_qp_cmd(struct irdma_qp *iwqp) cqp_info = &cqp_request->info; qp_info = &cqp_request->info.in.u.qp_create.info; - memset(qp_info, 0, sizeof(*qp_info)); qp_info->mac_valid = true; qp_info->cq_num_valid = true; qp_info->next_iwarp_state = IRDMA_QP_STATE_IDLE; @@ -3102,7 +3101,6 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, cqp_info = &cqp_request->info; info = &cqp_info->in.u.alloc_stag.info; - memset(info, 0, sizeof(*info)); info->page_size = PAGE_SIZE; info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; info->pd_id = iwpd->sc_pd.pd_id; @@ -3252,7 +3250,6 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, cqp_info = &cqp_request->info; stag_info = &cqp_info->in.u.mr_reg_non_shared.info; - memset(stag_info, 0, sizeof(*stag_info)); stag_info->va = iwpbl->user_base; stag_info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; stag_info->stag_key = (u8)iwmr->stag; @@ -3646,7 +3643,6 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr) cqp_info = &cqp_request->info; info = &cqp_info->in.u.dealloc_stag.info; - memset(info, 0, sizeof(*info)); info->pd_id = iwpd->sc_pd.pd_id; info->stag_idx = ib_mr->rkey >> IRDMA_CQPSQ_STAG_IDX_S; info->mr = true; From cd84d8001e5446f6d1a4eca15a502188207523b4 Mon Sep 17 00:00:00 2001 From: Jay Bhat Date: Thu, 30 Oct 2025 21:17:24 -0500 Subject: [PATCH 09/65] RDMA/irdma: Silently consume unsignaled completions In case we get an unsignaled error completion, we silently consume the CQE by pretending the QP does not exist. Without this, bookkeeping for signaled completions does not work correctly. 
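For context, a hedged sketch of the convention being reused (simplified;
the real poll loop lives in the verbs layer): -EFAULT from the poll
routine means "CQE from a QP that no longer exists" and is consumed
without being reported to the caller:

	while (npolled < num_entries) {
		ret = irdma_uk_cq_poll_cmpl(ukcq, &cur_cqe);
		if (ret == -EFAULT) {
			/* destroyed QP -- or, with this patch, an
			 * unsignaled WR: eat the CQE, keep polling */
			continue;
		}
		if (ret)
			break;
		/* ... translate cur_cqe into an ib_wc entry ... */
		npolled++;
	}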
Signed-off-by: Jay Bhat Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251031021726.1003-5-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/uk.c | 5 +++++ drivers/infiniband/hw/irdma/user.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index ce1ae10c30fc..b3c6cbde797c 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -194,6 +194,7 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, qp->sq_wrtrk_array[*wqe_idx].wrid = info->wr_id; qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size; qp->sq_wrtrk_array[*wqe_idx].quanta = quanta; + qp->sq_wrtrk_array[*wqe_idx].signaled = info->signaled; return wqe; } @@ -1355,6 +1356,10 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid; if (!info->comp_status) info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len; + if (!qp->sq_wrtrk_array[wqe_idx].signaled) { + ret_code = -EFAULT; + goto exit; + } info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3); IRDMA_RING_SET_TAIL(qp->sq_ring, wqe_idx + qp->sq_wrtrk_array[wqe_idx].quanta); diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index ab57f689827a..ee02c67ad313 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -482,7 +482,8 @@ struct irdma_sq_uk_wr_trk_info { u64 wrid; u32 wr_len; u16 quanta; - u8 reserved[2]; + u8 signaled; + u8 reserved[1]; }; struct irdma_qp_quanta { From 0a192745551cd2e999532075f21c0cfa174bc065 Mon Sep 17 00:00:00 2001 From: Jay Bhat Date: Thu, 30 Oct 2025 21:17:23 -0500 Subject: [PATCH 10/65] RDMA/irdma: CQ size and shadow update changes for GEN3 CQ shadow area should not be updated at the end of a page (once every 64th CQ entry), except when CQ has no more CQEs. SW must also increase the requested CQ size by 1 and make sure the CQ is not exactly one page in size. This is to address a quirk in the hardware. 
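A worked example of the sizing rule, assuming the usual 4 KiB
IRDMA_HW_PAGE_SIZE and 64-byte CQEs (avoid_mem_cflct): a request for 62
entries would otherwise produce a CQ of exactly one page:

	entries = 62;
	entries += 2;					/* 64, already even */
	if (entries * cqe_size == IRDMA_HW_PAGE_SIZE)	/* 64 * 64 == 4096 */
		entries += 2;				/* 66: never exactly one page */

Correspondingly, the shadow area is now written only when the new head
is not a multiple of 64 (head & 0x3F != 0) or the CQ has been fully
drained.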
Signed-off-by: Jay Bhat Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251031021726.1003-4-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/main.h | 1 - drivers/infiniband/hw/irdma/uk.c | 26 +++++++++++++-- drivers/infiniband/hw/irdma/user.h | 1 + drivers/infiniband/hw/irdma/utils.c | 52 ++++++++++------------------- drivers/infiniband/hw/irdma/verbs.c | 16 +++++++-- 5 files changed, 56 insertions(+), 40 deletions(-) diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 886b30da188a..f22b1ee20fcc 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -564,7 +564,6 @@ int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd, void (*callback_fcn)(struct irdma_cqp_request *cqp_request), void *cb_param); void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request); -bool irdma_cq_empty(struct irdma_cq *iwcq); int irdma_inetaddr_event(struct notifier_block *notifier, unsigned long event, void *ptr); int irdma_inet6addr_event(struct notifier_block *notifier, unsigned long event, diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index b3c6cbde797c..a006e7365f4d 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -1137,6 +1137,27 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, writel(cq->cq_id, cq->cqe_alloc_db); } +/** + * irdma_uk_cq_empty - Check if CQ is empty + * @cq: hw cq + */ +bool irdma_uk_cq_empty(struct irdma_cq_uk *cq) +{ + __le64 *cqe; + u8 polarity; + u64 qword3; + + if (cq->avoid_mem_cflct) + cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(cq); + else + cqe = IRDMA_GET_CURRENT_CQ_ELEM(cq); + + get_64bit_val(cqe, 24, &qword3); + polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); + + return polarity != cq->polarity; +} + /** * irdma_uk_cq_poll_cmpl - get cq completion info * @cq: hw cq @@ -1425,8 +1446,9 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, IRDMA_RING_MOVE_TAIL(cq->cq_ring); if (!cq->avoid_mem_cflct && ext_valid) IRDMA_RING_MOVE_TAIL(cq->cq_ring); - set_64bit_val(cq->shadow_area, 0, - IRDMA_RING_CURRENT_HEAD(cq->cq_ring)); + if (IRDMA_RING_CURRENT_HEAD(cq->cq_ring) & 0x3F || irdma_uk_cq_empty(cq)) + set_64bit_val(cq->shadow_area, 0, + IRDMA_RING_CURRENT_HEAD(cq->cq_ring)); } else { qword3 &= ~IRDMA_CQ_WQEIDX; qword3 |= FIELD_PREP(IRDMA_CQ_WQEIDX, pring->tail); diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index ee02c67ad313..6c29fa04e821 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -429,6 +429,7 @@ struct irdma_wqe_uk_ops { struct irdma_bind_window *op_info); }; +bool irdma_uk_cq_empty(struct irdma_cq_uk *cq); int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info); void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 3b19e2b997b0..cc2a12f735d3 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2353,24 +2353,6 @@ void irdma_ib_qp_event(struct irdma_qp *iwqp, enum irdma_qp_event_type event) iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context); } -bool irdma_cq_empty(struct irdma_cq *iwcq) -{ - struct irdma_cq_uk *ukcq; - u64 qword3; - __le64 *cqe; - u8 polarity; - - ukcq = &iwcq->sc_cq.cq_uk; - if (ukcq->avoid_mem_cflct) - cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(ukcq); - else - 
cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq); - get_64bit_val(cqe, 24, &qword3); - polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); - - return polarity != ukcq->polarity; -} - void irdma_remove_cmpls_list(struct irdma_cq *iwcq) { struct irdma_cmpl_gen *cmpl_node; @@ -2432,6 +2414,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) struct irdma_qp_uk *qp = &iwqp->sc_qp.qp_uk; struct irdma_ring *sq_ring = &qp->sq_ring; struct irdma_ring *rq_ring = &qp->rq_ring; + struct irdma_cq *iwscq = iwqp->iwscq; + struct irdma_cq *iwrcq = iwqp->iwrcq; struct irdma_cmpl_gen *cmpl; __le64 *sw_wqe; u64 wqe_qword; @@ -2439,8 +2423,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) bool compl_generated = false; unsigned long flags1; - spin_lock_irqsave(&iwqp->iwscq->lock, flags1); - if (irdma_cq_empty(iwqp->iwscq)) { + spin_lock_irqsave(&iwscq->lock, flags1); + if (irdma_uk_cq_empty(&iwscq->sc_cq.cq_uk)) { unsigned long flags2; spin_lock_irqsave(&iwqp->lock, flags2); @@ -2448,7 +2432,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC); if (!cmpl) { spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); return; } @@ -2467,24 +2451,24 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) kfree(cmpl); continue; } - ibdev_dbg(iwqp->iwscq->ibcq.device, + ibdev_dbg(iwscq->ibcq.device, "DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n", __func__, cmpl->cpi.wr_id, qp->qp_id); - list_add_tail(&cmpl->list, &iwqp->iwscq->cmpl_generated); + list_add_tail(&cmpl->list, &iwscq->cmpl_generated); compl_generated = true; } spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); if (compl_generated) - irdma_comp_handler(iwqp->iwscq); + irdma_comp_handler(iwscq); } else { - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } - spin_lock_irqsave(&iwqp->iwrcq->lock, flags1); - if (irdma_cq_empty(iwqp->iwrcq)) { + spin_lock_irqsave(&iwrcq->lock, flags1); + if (irdma_uk_cq_empty(&iwrcq->sc_cq.cq_uk)) { unsigned long flags2; spin_lock_irqsave(&iwqp->lock, flags2); @@ -2492,7 +2476,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC); if (!cmpl) { spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + spin_unlock_irqrestore(&iwrcq->lock, flags1); return; } @@ -2504,20 +2488,20 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl->cpi.q_type = IRDMA_CQE_QTYPE_RQ; /* remove the RQ WR by moving RQ tail */ IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1); - ibdev_dbg(iwqp->iwrcq->ibcq.device, + ibdev_dbg(iwrcq->ibcq.device, "DEV: %s: adding wr_id = 0x%llx RQ Completion to list qp_id=%d, wqe_idx=%d\n", __func__, cmpl->cpi.wr_id, qp->qp_id, wqe_idx); - list_add_tail(&cmpl->list, &iwqp->iwrcq->cmpl_generated); + list_add_tail(&cmpl->list, &iwrcq->cmpl_generated); compl_generated = true; } spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + spin_unlock_irqrestore(&iwrcq->lock, flags1); if (compl_generated) - irdma_comp_handler(iwqp->iwrcq); + irdma_comp_handler(iwrcq); } else { - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + 
spin_unlock_irqrestore(&iwrcq->lock, flags1); mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 9804deba513d..7eff07a329e7 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2028,6 +2028,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, struct irdma_pci_f *rf; struct irdma_cq_buf *cq_buf = NULL; unsigned long flags; + u8 cqe_size; int ret; iwdev = to_iwdev(ibcq->device); @@ -2044,7 +2045,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, return -EINVAL; if (!iwcq->user_mode) { - entries++; + entries += 2; if (!iwcq->sc_cq.cq_uk.avoid_mem_cflct && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) @@ -2052,6 +2053,10 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, if (entries & 1) entries += 1; /* cq size must be an even number */ + + cqe_size = iwcq->sc_cq.cq_uk.avoid_mem_cflct ? 64 : 32; + if (entries * cqe_size == IRDMA_HW_PAGE_SIZE) + entries += 2; } info.cq_size = max(entries, 4); @@ -2482,6 +2487,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, int err_code; int entries = attr->cqe; bool cqe_64byte_ena; + u8 cqe_size; err_code = cq_validate_flags(attr->flags, dev->hw_attrs.uk_attrs.hw_rev); if (err_code) @@ -2507,6 +2513,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, ukinfo->cq_id = cq_num; cqe_64byte_ena = dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? true : false; + cqe_size = cqe_64byte_ena ? 64 : 32; ukinfo->avoid_mem_cflct = cqe_64byte_ena; iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size; if (attr->comp_vector < rf->ceqs_count) @@ -2579,13 +2586,16 @@ static int irdma_create_cq(struct ib_cq *ibcq, goto cq_free_rsrc; } - entries++; + entries += 2; if (!cqe_64byte_ena && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) entries *= 2; if (entries & 1) entries += 1; /* cq size must be an even number */ + if (entries * cqe_size == IRDMA_HW_PAGE_SIZE) + entries += 2; + ukinfo->cq_size = entries; if (cqe_64byte_ena) @@ -4500,7 +4510,7 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq, } if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && - (!irdma_cq_empty(iwcq) || !list_empty(&iwcq->cmpl_generated))) + (!irdma_uk_cq_empty(ukcq) || !list_empty(&iwcq->cmpl_generated))) ret = 1; spin_unlock_irqrestore(&iwcq->lock, flags); From da58d4223b1690dd47652aa4b5227f39ab139b76 Mon Sep 17 00:00:00 2001 From: Jay Bhat Date: Thu, 30 Oct 2025 21:17:26 -0500 Subject: [PATCH 11/65] RDMA/irdma: Take a lock before moving SRQ tail in poll_cq Need to take an SRQ lock in poll_cq before moving SRQ tail. 
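A hedged sketch of the hazard: QPs sharing one SRQ can complete into
different CQs, so two pollers may otherwise update the shared ring tail
concurrently:

	/* CPU0: poll cq_a                  CPU1: poll cq_b
	 *   IRDMA_RING_MOVE_TAIL(srq)        IRDMA_RING_MOVE_TAIL(srq)
	 *
	 * both QPs feed from the same SRQ; without srq->lock the two
	 * read-modify-write updates of the tail can collide */
	spin_lock_irqsave(srq->lock, flags);
	IRDMA_RING_MOVE_TAIL(srq->srq_ring);
	spin_unlock_irqrestore(srq->lock, flags);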
Signed-off-by: Jay Bhat
Reviewed-by: Jacob Moroni
Signed-off-by: Tatyana Nikolova
Link: https://patch.msgid.link/20251031021726.1003-7-tatyana.e.nikolova@intel.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/irdma/uk.c    | 5 +++++
 drivers/infiniband/hw/irdma/user.h  | 1 +
 drivers/infiniband/hw/irdma/verbs.c | 1 +
 3 files changed, 7 insertions(+)

diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c
index a006e7365f4d..8a463b3d4c83 100644
--- a/drivers/infiniband/hw/irdma/uk.c
+++ b/drivers/infiniband/hw/irdma/uk.c
@@ -1309,6 +1309,8 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
 	info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3);

 	if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) {
+		unsigned long flags;
+
 		srq = qp->srq_uk;

 		get_64bit_val(cqe, 8, &info->wr_id);
@@ -1321,8 +1323,11 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
 		} else {
 			info->stag_invalid_set = false;
 		}
+		spin_lock_irqsave(srq->lock, flags);
 		IRDMA_RING_MOVE_TAIL(srq->srq_ring);
+		spin_unlock_irqrestore(srq->lock, flags);
 		pring = &srq->srq_ring;
+
 	} else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) {
 		u32 array_idx;

diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h
index 6c29fa04e821..3a27cce3c3db 100644
--- a/drivers/infiniband/hw/irdma/user.h
+++ b/drivers/infiniband/hw/irdma/user.h
@@ -466,6 +466,7 @@ struct irdma_srq_uk {
 	u8 wqe_size;
 	u8 wqe_size_multiplier;
 	u8 deferred_flag;
+	spinlock_t *lock;
 };

 struct irdma_srq_uk_init_info {
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 7eff07a329e7..66ecaa0f5be6 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -2388,6 +2388,7 @@ static int irdma_create_srq(struct ib_srq *ibsrq,

 	info.vsi = &iwdev->vsi;
 	info.pd = &iwpd->sc_pd;
+	iwsrq->sc_srq.srq_uk.lock = &iwsrq->lock;

 	err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info);
 	if (err_code)
 		goto free_dmem;

From f673fb3449fcd8afdd7f67277217a93b2fcba435 Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Sat, 1 Nov 2025 17:31:11 +0100
Subject: [PATCH 12/65] RDMA/core: RDMA/mlx5: replace use of system_unbound_wq
 with system_dfl_wq

Currently, if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again uses WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

system_unbound_wq should be the default workqueue so as not to enforce
locality constraints for random work whenever it's not required.

Adding system_dfl_wq to encourage its use when unbound work should be
used. The old system_unbound_wq will be kept for a few release cycles.
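As a hedged before/after sketch of the call-site pattern this enables
(system_dfl_wq is the new explicitly-unbound default queue added by this
series; the work item is the one from the ucma.c hunk below):

	/* before: system_unbound_wq, to be phased out */
	queue_work(system_unbound_wq, &ctx->close_work);

	/* after: same unbound semantics under the new default name */
	queue_work(system_dfl_wq, &ctx->close_work);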
Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Link: https://patch.msgid.link/20251101163121.78400-2-marco.crivellari@suse.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/core/ucma.c   | 2 +-
 drivers/infiniband/hw/mlx5/odp.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index f86ece701db6..ec3be65a2b88 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -366,7 +366,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
 	if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
 		xa_lock(&ctx_table);
 		if (xa_load(&ctx_table, ctx->id) == ctx)
-			queue_work(system_unbound_wq, &ctx->close_work);
+			queue_work(system_dfl_wq, &ctx->close_work);
 		xa_unlock(&ctx_table);
 	}
 	return 0;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 0e8ae85af5a6..6441abdf1f3b 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -265,7 +265,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)

 	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
 	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
-	queue_work(system_unbound_wq, &mr->odp_destroy.work);
+	queue_work(system_dfl_wq, &mr->odp_destroy.work);
 }

 static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
@@ -2093,6 +2093,6 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 		destroy_prefetch_work(work);
 		return rc;
 	}
-	queue_work(system_unbound_wq, &work->work);
+	queue_work(system_dfl_wq, &work->work);
 	return 0;
 }

From e60c5583b661da65b09bfd6ae91126607397490e Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Sat, 1 Nov 2025 17:31:12 +0100
Subject: [PATCH 13/65] RDMA/core: WQ_PERCPU added to alloc_workqueue users
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again uses WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most
workloads benefit from unbound queues, allowing the scheduler to place
worker threads where they’re needed and reducing noise when CPUs are
isolated.

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-CPU when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be
removed and unbound will become the implicit default.
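As a hedged sketch of the opt-in (the two calls below are equivalent
today; the explicit flag simply keeps per-CPU behavior once the default
flips to unbound):

	struct workqueue_struct *wq;

	/* before: flags == 0 silently meant per-CPU */
	wq = alloc_workqueue("ib_cm", 0, 1);

	/* after: per-CPU is requested explicitly */
	wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1);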
Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Link: https://patch.msgid.link/20251101163121.78400-3-marco.crivellari@suse.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/core/cm.c     | 2 +-
 drivers/infiniband/core/device.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 2a36a9345959..024df6ee239d 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -4517,7 +4517,7 @@ static int __init ib_cm_init(void)
 	get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
 	INIT_LIST_HEAD(&cm.timewait_list);

-	cm.wq = alloc_workqueue("ib_cm", 0, 1);
+	cm.wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1);
 	if (!cm.wq) {
 		ret = -ENOMEM;
 		goto error2;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b4f3c835844a..13e8a1714bbd 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -3021,7 +3021,7 @@ static int __init ib_core_init(void)
 {
 	int ret = -ENOMEM;

-	ib_wq = alloc_workqueue("infiniband", 0, 0);
+	ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0);
 	if (!ib_wq)
 		return -ENOMEM;

@@ -3031,7 +3031,7 @@ static int __init ib_core_init(void)
 		goto err;

 	ib_comp_wq = alloc_workqueue("ib-comp-wq",
-			WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+			WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0);
 	if (!ib_comp_wq)
 		goto err_unbound;

From 5f93287fa9d0db9bad0251e526dead4aed448288 Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Sat, 1 Nov 2025 17:31:13 +0100
Subject: [PATCH 14/65] hfi1: WQ_PERCPU added to alloc_workqueue users
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again uses WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most
workloads benefit from unbound queues, allowing the scheduler to place
worker threads where they’re needed and reducing noise when CPUs are
isolated.

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-CPU when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be
removed and unbound will become the implicit default.
CC: Dennis Dalessandro
Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Link: https://patch.msgid.link/20251101163121.78400-4-marco.crivellari@suse.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/hfi1/init.c | 4 ++--
 drivers/infiniband/hw/hfi1/opfn.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index b35f92e7d865..e4aef102dac0 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -745,8 +745,8 @@ static int create_workqueues(struct hfi1_devdata *dd)
 			ppd->hfi1_wq = alloc_workqueue(
 			    "hfi%d_%d",
-			    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
-			    WQ_MEM_RECLAIM,
+			    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
+			    WQ_PERCPU,
 			    HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
 			    dd->unit, pidx);
 		if (!ppd->hfi1_wq)
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
index 370a5a8eaa71..6e0e3458d202 100644
--- a/drivers/infiniband/hw/hfi1/opfn.c
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -305,8 +305,8 @@ void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
 int opfn_init(void)
 {
 	opfn_wq = alloc_workqueue("hfi_opfn",
-				  WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
-				  WQ_MEM_RECLAIM,
+				  WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
+				  WQ_PERCPU,
 				  HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
 	if (!opfn_wq)
 		return -ENOMEM;

From 5267feda50680c73e33b118dbebfb961d6a864bd Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Sat, 1 Nov 2025 17:31:14 +0100
Subject: [PATCH 15/65] RDMA/mlx4: WQ_PERCPU added to alloc_workqueue users
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again uses WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most
workloads benefit from unbound queues, allowing the scheduler to place
worker threads where they’re needed and reducing noise when CPUs are
isolated.

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be
removed and unbound will become the implicit default.
CC: Yishai Hadas
Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Link: https://patch.msgid.link/20251101163121.78400-5-marco.crivellari@suse.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/mlx4/cm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index 12b481d138cf..03aacd526860 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -591,7 +591,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)

 int mlx4_ib_cm_init(void)
 {
-	cm_wq = alloc_workqueue("mlx4_ib_cm", 0, 0);
+	cm_wq = alloc_workqueue("mlx4_ib_cm", WQ_PERCPU, 0);
 	if (!cm_wq)
 		return -ENOMEM;

From 7196156b0ce3dc4cdbda5c09897708b2e1081de1 Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Sat, 1 Nov 2025 17:31:15 +0100
Subject: [PATCH 16/65] IB/rdmavt: WQ_PERCPU added to alloc_workqueue users
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again uses WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most
workloads benefit from unbound queues, allowing the scheduler to place
worker threads where they’re needed and reducing noise when CPUs are
isolated.

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be
removed and unbound will become the implicit default.
CC: Dennis Dalessandro Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://patch.msgid.link/20251101163121.78400-6-marco.crivellari@suse.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rdmavt/cq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 0ca2743f1075..e7835ca70e2b 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -518,7 +518,8 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) */ int rvt_driver_cq_init(void) { - comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE, + comp_vector_wq = alloc_workqueue("%s", + WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_PERCPU, 0, "rdmavt_cq"); if (!comp_vector_wq) return -ENOMEM; From 512c83265796d613f21255c766839eaed1c1cc79 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Nov 2025 20:51:27 -0800 Subject: [PATCH 17/65] IB/rdmavt: rdmavt_qp.h: clean up kernel-doc comments Correct the kernel-doc comments format to avoid around 35 kernel-doc warnings: - use struct keyword to introduce struct kernel-doc comments - use correct variable name for some struct members - use correct function name in comments for some functions - fix spelling in a few comments - use a ':' instead of '-' to separate struct members from their descriptions - add a function name heading for rvt_div_mtu() This leaves one struct member that is not described: rdmavt_qp.h:206: warning: Function parameter or struct member 'wq' not described in 'rvt_krwq' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251105045127.106822-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/rdmavt_qp.h | 70 +++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d67892944193..71140ea0aeb2 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -144,7 +144,7 @@ #define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) /** - * rvt_ud_wr - IB UD work plus AH cache + * struct rvt_ud_wr - IB UD work plus AH cache * @wr: valid IB work request * @attr: pointer to an allocated AH attribute * @@ -184,10 +184,10 @@ struct rvt_swqe { * struct rvt_krwq - kernel struct receive work request * @p_lock: lock to protect producer of the kernel buffer * @head: index of next entry to fill - * @c_lock:lock to protect consumer of the kernel buffer + * @c_lock: lock to protect consumer of the kernel buffer * @tail: index of next entry to pull - * @count: count is aproximate of total receive enteries posted - * @rvt_rwqe: struct of receive work request queue entry + * @count: count is approximate of total receive entries posted + * @curr_wq: struct of receive work request queue entry * * This structure is used to contain the head pointer, * tail pointer and receive work queue entries for kernel @@ -309,10 +309,10 @@ struct rvt_ack_entry { #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) /** - * rvt_operation_params - op table entry - * @length - the length to copy into the swqe entry - * @qpt_support - a bit mask indicating QP type support - * @flags - RVT_OPERATION flags (see above) + * struct rvt_operation_params - op table entry + * @length: the length to copy into the swqe entry + * @qpt_support: a bit mask indicating QP type support + * @flags: RVT_OPERATION flags (see above) * * This supports table driven post send so that * the driver can have differing an 
potentially @@ -552,7 +552,7 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) /** * rvt_is_user_qp - return if this is user mode QP - * @qp - the target QP + * @qp: the target QP */ static inline bool rvt_is_user_qp(struct rvt_qp *qp) { @@ -561,7 +561,7 @@ static inline bool rvt_is_user_qp(struct rvt_qp *qp) /** * rvt_get_qp - get a QP reference - * @qp - the QP to hold + * @qp: the QP to hold */ static inline void rvt_get_qp(struct rvt_qp *qp) { @@ -570,7 +570,7 @@ static inline void rvt_get_qp(struct rvt_qp *qp) /** * rvt_put_qp - release a QP reference - * @qp - the QP to release + * @qp: the QP to release */ static inline void rvt_put_qp(struct rvt_qp *qp) { @@ -580,7 +580,7 @@ static inline void rvt_put_qp(struct rvt_qp *qp) /** * rvt_put_swqe - drop mr refs held by swqe - * @wqe - the send wqe + * @wqe: the send wqe * * This drops any mr references held by the swqe */ @@ -597,8 +597,8 @@ static inline void rvt_put_swqe(struct rvt_swqe *wqe) /** * rvt_qp_wqe_reserve - reserve operation - * @qp - the rvt qp - * @wqe - the send wqe + * @qp: the rvt qp + * @wqe: the send wqe * * This routine used in post send to record * a wqe relative reserved operation use. @@ -612,8 +612,8 @@ static inline void rvt_qp_wqe_reserve( /** * rvt_qp_wqe_unreserve - clean reserved operation - * @qp - the rvt qp - * @flags - send wqe flags + * @qp: the rvt qp + * @flags: send wqe flags * * This decrements the reserve use count. * @@ -653,8 +653,8 @@ u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len); /** * rvt_div_round_up_mtu - round up divide - * @qp - the qp pair - * @len - the length + * @qp: the qp pair + * @len: the length * * Perform a shift based mtu round up divide */ @@ -664,8 +664,9 @@ static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len) } /** - * @qp - the qp pair - * @len - the length + * rvt_div_mtu - shift-based divide + * @qp: the qp pair + * @len: the length * * Perform a shift based mtu divide */ @@ -676,7 +677,7 @@ static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len) /** * rvt_timeout_to_jiffies - Convert a ULP timeout input into jiffies - * @timeout - timeout input(0 - 31). + * @timeout: timeout input(0 - 31). * * Return a timeout value in jiffies. 
*/ @@ -690,7 +691,8 @@ static inline unsigned long rvt_timeout_to_jiffies(u8 timeout) /** * rvt_lookup_qpn - return the QP with the given QPN - * @ibp: the ibport + * @rdi: rvt device info structure + * @rvp: the ibport * @qpn: the QP number to look up * * The caller must hold the rcu_read_lock(), and keep the lock until @@ -716,9 +718,9 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, } /** - * rvt_mod_retry_timer - mod a retry timer - * @qp - the QP - * @shift - timeout shift to wait for multiple packets + * rvt_mod_retry_timer_ext - mod a retry timer + * @qp: the QP + * @shift: timeout shift to wait for multiple packets * Modify a potentially already running retry timer */ static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift) @@ -753,7 +755,7 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) } /** - * rvt_qp_sqwe_incr - increment ring index + * rvt_qp_swqe_incr - increment ring index * @qp: the qp * @val: the starting value * * @@ -811,10 +813,10 @@ static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc, /** * rvt_qp_complete_swqe - insert send completion - * @qp - the qp - * @wqe - the send wqe - * @opcode - wc operation (driver dependent) - * @status - completion status + * @qp: the qp + * @wqe: the send wqe + * @opcode: wc operation (driver dependent) + * @status: completion status * * Update the s_last information, and then insert a send * completion into the completion @@ -891,7 +893,7 @@ void rvt_ruc_loopback(struct rvt_qp *qp); /** * struct rvt_qp_iter - the iterator for QPs - * @qp - the current QP + * @qp: the current QP * * This structure defines the current iterator * state for sequenced access to all QPs relative @@ -913,7 +915,7 @@ struct rvt_qp_iter { /** * ib_cq_tail - Return tail index of cq buffer - * @send_cq - The cq for send + * @send_cq: The cq for send * * This is called in qp_iter_print to get tail * of cq buffer. @@ -929,7 +931,7 @@ static inline u32 ib_cq_tail(struct ib_cq *send_cq) /** * ib_cq_head - Return head index of cq buffer - * @send_cq - The cq for send + * @send_cq: The cq for send * * This is called in qp_iter_print to get head * of cq buffer. @@ -945,7 +947,7 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq) /** * rvt_free_rq - free memory allocated for rvt_rq struct - * @rvt_rq: request queue data structure + * @rq: request queue data structure * * This function should only be called if the rvt_mmap_info() * has not succeeded. From cf274907901115d7cec71bc89fbfac8842ee57dd Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 3 Nov 2025 10:04:25 +0530 Subject: [PATCH 18/65] RDMA/bnxt_re: Add a debugfs entry for CQE coalescing tuning This patch adds debugfs interfaces that allow the user to enable/disable RoCE CQ coalescing and fine-tune certain CQ coalescing parameters, which is helpful during debugging.
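Assuming the usual bnxt_re per-device debugfs root (the parent directory is created elsewhere in the driver), the new entries end up laid out as:

	cq_coal_cfg/
		buf_maxtime
		normal_maxbuf
		during_maxbuf
		en_ring_idle_mode
		enable

Each file holds a single unsigned integer, so a parameter can be read back with cat and tuned with a plain write, e.g. "echo 0 > cq_coal_cfg/enable" to turn CQ coalescing off for the device.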
Signed-off-by: Damodharam Ammepalli Reviewed-by: Hongguang Gao Signed-off-by: Kalesh AP Link: https://patch.msgid.link/20251103043425.234846-1-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 + drivers/infiniband/hw/bnxt_re/debugfs.c | 128 +++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/debugfs.h | 19 ++++ drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3 +- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 1 + 6 files changed, 153 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 3485e495ac6a..3a7ce4729fcf 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -224,6 +224,8 @@ struct bnxt_re_dev { struct workqueue_struct *dcb_wq; struct dentry *cc_config; struct bnxt_re_dbg_cc_config_params *cc_config_params; + struct dentry *cq_coal_cfg; + struct bnxt_re_dbg_cq_coal_params *cq_coal_cfg_params; #define BNXT_VPD_FLD_LEN 32 char board_partno[BNXT_VPD_FLD_LEN]; /* RoCE mirror */ diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index be5e9b5ca2f0..d03f5bb0d890 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -23,6 +23,14 @@ static struct dentry *bnxt_re_debugfs_root; +static const char * const bnxt_re_cq_coal_str[] = { + "buf_maxtime", + "normal_maxbuf", + "during_maxbuf", + "en_ring_idle_mode", + "enable", +}; + static const char * const bnxt_re_cc_gen0_name[] = { "enable_cc", "run_avg_weight_g", @@ -349,6 +357,123 @@ static void bnxt_re_debugfs_add_info(struct bnxt_re_dev *rdev) debugfs_create_file("info", 0400, rdev->dbg_root, rdev, &info_fops); } +static ssize_t cq_coal_cfg_write(struct file *file, + const char __user *buf, + size_t count, loff_t *pos) +{ + struct seq_file *s = file->private_data; + struct bnxt_re_cq_coal_param *param = s->private; + struct bnxt_re_dev *rdev = param->rdev; + int offset = param->offset; + char lbuf[16] = { }; + u32 val; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &val)) + return -EINVAL; + + switch (offset) { + case BNXT_RE_COAL_CQ_BUF_MAXTIME: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME) + return -EINVAL; + rdev->cq_coalescing.buf_maxtime = val; + break; + case BNXT_RE_COAL_CQ_NORMAL_MAXBUF: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF) + return -EINVAL; + rdev->cq_coalescing.normal_maxbuf = val; + break; + case BNXT_RE_COAL_CQ_DURING_MAXBUF: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF) + return -EINVAL; + rdev->cq_coalescing.during_maxbuf = val; + break; + case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE: + if (val > BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE) + return -EINVAL; + rdev->cq_coalescing.en_ring_idle_mode = val; + break; + case BNXT_RE_COAL_CQ_ENABLE: + if (val > 1) + return -EINVAL; + rdev->cq_coalescing.enable = val; + break; + default: + return -EINVAL; + } + return count; +} + +static int cq_coal_cfg_show(struct seq_file *s, void *unused) +{ + struct bnxt_re_cq_coal_param *param = s->private; + struct bnxt_re_dev *rdev = param->rdev; + int offset = param->offset; + u32 val = 0; + + switch (offset) { + case BNXT_RE_COAL_CQ_BUF_MAXTIME: + val = rdev->cq_coalescing.buf_maxtime; + break; + case BNXT_RE_COAL_CQ_NORMAL_MAXBUF: + val = 
rdev->cq_coalescing.normal_maxbuf; + break; + case BNXT_RE_COAL_CQ_DURING_MAXBUF: + val = rdev->cq_coalescing.during_maxbuf; + break; + case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE: + val = rdev->cq_coalescing.en_ring_idle_mode; + break; + case BNXT_RE_COAL_CQ_ENABLE: + val = rdev->cq_coalescing.enable; + break; + default: + return -EINVAL; + } + + seq_printf(s, "%u\n", val); + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(cq_coal_cfg); + +static void bnxt_re_cleanup_cq_coal_debugfs(struct bnxt_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->cq_coal_cfg); + kfree(rdev->cq_coal_cfg_params); +} + +static void bnxt_re_init_cq_coal_debugfs(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_dbg_cq_coal_params *dbg_cq_coal_params; + int i; + + if (_is_cq_coalescing_supported(rdev->dev_attr->dev_cap_flags2)) + return; + + dbg_cq_coal_params = kzalloc(sizeof(*dbg_cq_coal_params), GFP_KERNEL); + if (!dbg_cq_coal_params) + return; + + rdev->cq_coal_cfg = debugfs_create_dir("cq_coal_cfg", rdev->dbg_root); + rdev->cq_coal_cfg_params = dbg_cq_coal_params; + + for (i = 0; i < BNXT_RE_COAL_CQ_MAX; i++) { + dbg_cq_coal_params->params[i].offset = i; + dbg_cq_coal_params->params[i].rdev = rdev; + debugfs_create_file(bnxt_re_cq_coal_str[i], + 0600, rdev->cq_coal_cfg, + &dbg_cq_coal_params->params[i], + &cq_coal_cfg_fops); + } +} + void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) { struct pci_dev *pdev = rdev->en_dev->pdev; @@ -374,10 +499,13 @@ void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) rdev->cc_config, tmp_params, &bnxt_re_cc_config_ops); } + + bnxt_re_init_cq_coal_debugfs(rdev); } void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev) { + bnxt_re_cleanup_cq_coal_debugfs(rdev); debugfs_remove_recursive(rdev->qp_debugfs); debugfs_remove_recursive(rdev->cc_config); kfree(rdev->cc_config_params); diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h index 8f101df4e838..98f4620ef245 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.h +++ b/drivers/infiniband/hw/bnxt_re/debugfs.h @@ -33,4 +33,23 @@ struct bnxt_re_cc_param { struct bnxt_re_dbg_cc_config_params { struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0]; }; + +struct bnxt_re_cq_coal_param { + struct bnxt_re_dev *rdev; + u32 offset; +}; + +enum bnxt_re_cq_coal_types { + BNXT_RE_COAL_CQ_BUF_MAXTIME, + BNXT_RE_COAL_CQ_NORMAL_MAXBUF, + BNXT_RE_COAL_CQ_DURING_MAXBUF, + BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE, + BNXT_RE_COAL_CQ_ENABLE, + BNXT_RE_COAL_CQ_MAX + +}; + +struct bnxt_re_dbg_cq_coal_params { + struct bnxt_re_cq_coal_param params[BNXT_RE_COAL_CQ_MAX]; +}; #endif diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b13810572c2e..73003ad25ee8 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1453,6 +1453,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev, atomic_set(&rdev->stats.res.pd_count, 0); rdev->cosq[0] = 0xFFFF; rdev->cosq[1] = 0xFFFF; + rdev->cq_coalescing.enable = 1; rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME; if (bnxt_re_chip_gen_p7(en_dev->chip_num)) { rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index ce90d3d834d4..c88f049136fc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2226,7 +2226,8 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq 
*cq) req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->max_wqe); - if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) { + if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2) && + cq->coalescing->enable) { req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID); coalescing |= ((cq->coalescing->buf_maxtime << CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) & diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index b990d0c0ce1a..1b414a73b46d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -395,6 +395,7 @@ struct bnxt_qplib_cq_coal_param { u8 normal_maxbuf; u8 during_maxbuf; u8 en_ring_idle_mode; + u8 enable; }; #define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1 From 3506242da07156e6804c061554bd01d77c1b463b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:56 +0200 Subject: [PATCH 19/65] RDMA/mlx5: Change default device for LAG slaves in RDMA TRANSPORT namespaces In case of a LAG configuration, change the root namespace core device for all of the LAG slaves to be the core device of the master device for RDMA_TRANSPORT namespaces, in order to ensure all tables are created through the master device. Once the LAG is disabled, revert back to the native core device. Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-4-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_rep.c | 74 ++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index cc8859d3c2f5..bbecca405171 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -44,6 +44,63 @@ static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports) } } +static int mlx5_ib_set_owner_transport(struct mlx5_core_dev *cur_owner, + struct mlx5_core_dev *new_owner) +{ + int ret; + + if (!MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(cur_owner, ft_support) || + !MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(cur_owner, ft_support)) + return 0; + + if (!MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager) || + !MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager_other_eswitch)) + return 0; + + ret = mlx5_fs_set_root_dev(cur_owner, new_owner, + FS_FT_RDMA_TRANSPORT_TX); + if (ret) + return ret; + + ret = mlx5_fs_set_root_dev(cur_owner, new_owner, + FS_FT_RDMA_TRANSPORT_RX); + if (ret) { + mlx5_fs_set_root_dev(cur_owner, cur_owner, + FS_FT_RDMA_TRANSPORT_TX); + return ret; + } + + return 0; +} + +static void mlx5_ib_release_transport(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev; + int i, ret; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + ret = mlx5_ib_set_owner_transport(peer_dev, peer_dev); + WARN_ON_ONCE(ret); + } +} + +static int mlx5_ib_take_transport(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev; + int ret; + int i; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + ret = mlx5_ib_set_owner_transport(peer_dev, dev); + if (ret) { + mlx5_ib_release_transport(dev); + return ret; + } + } + + return 0; +} + static int mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { @@ -88,10 +145,18 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) else return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); + if (mlx5_lag_is_shared_fdb(dev)) { + ret =
mlx5_ib_take_transport(lag_master); + if (ret) + return ret; + } + ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, mlx5_core_net(lag_master)); - if (!ibdev) - return -ENOMEM; + if (!ibdev) { + ret = -ENOMEM; + goto release_transport; + } ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port), GFP_KERNEL); @@ -127,6 +192,10 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) kfree(ibdev->port); fail_port: ib_dealloc_device(&ibdev->ib_dev); +release_transport: + if (mlx5_lag_is_shared_fdb(lag_master)) + mlx5_ib_release_transport(lag_master); + return ret; } @@ -182,6 +251,7 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) esw = peer_mdev->priv.eswitch; mlx5_eswitch_unregister_vport_reps(esw, REP_IB); } + mlx5_ib_release_transport(mdev); } __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } From 5939decc64f6b9099c2c356d75047c66a6639e00 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:57 +0200 Subject: [PATCH 20/65] RDMA/mlx5: Add other_eswitch support for devx destruction When building a devx object destruction command for steering objects, take the other_eswitch argument into account to allow proper destruction of objects that were created with it. Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-5-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/devx.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 8b506417ad2f..d31d7f3005c6 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1225,6 +1225,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(create_flow_table_in, in, other_vport)); MLX5_SET(destroy_flow_table_in, din, vport_number, MLX5_GET(create_flow_table_in, in, vport_number)); + MLX5_SET(destroy_flow_table_in, din, other_eswitch, + MLX5_GET(create_flow_table_in, in, other_eswitch)); + MLX5_SET(destroy_flow_table_in, din, eswitch_owner_vhca_id, + MLX5_GET(create_flow_table_in, in, + eswitch_owner_vhca_id)); MLX5_SET(destroy_flow_table_in, din, table_type, MLX5_GET(create_flow_table_in, in, table_type)); MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id); @@ -1237,6 +1242,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(create_flow_group_in, in, other_vport)); MLX5_SET(destroy_flow_group_in, din, vport_number, MLX5_GET(create_flow_group_in, in, vport_number)); + MLX5_SET(destroy_flow_group_in, din, other_eswitch, + MLX5_GET(create_flow_group_in, in, other_eswitch)); + MLX5_SET(destroy_flow_group_in, din, eswitch_owner_vhca_id, + MLX5_GET(create_flow_group_in, in, + eswitch_owner_vhca_id)); MLX5_SET(destroy_flow_group_in, din, table_type, MLX5_GET(create_flow_group_in, in, table_type)); MLX5_SET(destroy_flow_group_in, din, table_id, @@ -1251,6 +1261,10 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(set_fte_in, in, other_vport)); MLX5_SET(delete_fte_in, din, vport_number, MLX5_GET(set_fte_in, in, vport_number)); + MLX5_SET(delete_fte_in, din, other_eswitch, + MLX5_GET(set_fte_in, in, other_eswitch)); + MLX5_SET(delete_fte_in, din, eswitch_owner_vhca_id, + MLX5_GET(set_fte_in, in, eswitch_owner_vhca_id)); MLX5_SET(delete_fte_in, din, table_type, MLX5_GET(set_fte_in, in, table_type)); MLX5_SET(delete_fte_in, din, table_id, From
f277662b734e96bf38ce7d422b091f53df3ff8cb Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:58 +0200 Subject: [PATCH 21/65] RDMA/mlx5: Refactor _get_prio() function Refactor the _get_prio() function to remove redundant arguments by reusing the existing flow table attributes struct instead of passing attributes separately. This improves code clarity and maintainability. In addition, this allows a downstream patch to add a new parameter without needing to change _get_prio()'s arguments. Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-6-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/fs.c | 49 +++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index b0f7663c24c1..c8a25370aa79 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -691,22 +691,13 @@ static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device) return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed); } -static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, - struct mlx5_flow_namespace *ns, +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, struct mlx5_ib_flow_prio *prio, - int priority, - int num_entries, int num_groups, - u32 flags, u16 vport) + struct mlx5_flow_table_attr *ft_attr) { - struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; - ft_attr.prio = priority; - ft_attr.max_fte = num_entries; - ft_attr.flags = flags; - ft_attr.vport = vport; - ft_attr.autogroup.max_num_groups = num_groups; - ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + ft = mlx5_create_auto_grouped_flow_table(ns, ft_attr); if (IS_ERR(ft)) return ERR_CAST(ft); @@ -720,6 +711,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, enum flow_table_type ft_type) { bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; enum mlx5_flow_namespace_type fn_type; struct mlx5_ib_flow_prio *prio; @@ -797,11 +789,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, max_table_size = min_t(int, num_entries, max_table_size); ft = prio->flow_table; - if (!ft) - return _get_prio(dev, ns, prio, priority, max_table_size, - num_groups, flags, 0); + if (ft) + return prio; - return prio; + ft_attr.prio = priority; + ft_attr.max_fte = max_table_size; + ft_attr.flags = flags; + ft_attr.autogroup.max_num_groups = num_groups; + return _get_prio(ns, prio, &ft_attr); } enum { @@ -950,6 +945,7 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, enum mlx5_ib_optional_counter_type type) { enum mlx5_ib_optional_counter_type per_qp_type; + struct mlx5_flow_table_attr ft_attr = {}; enum mlx5_flow_namespace_type fn_type; struct mlx5_flow_namespace *ns; struct mlx5_ib_flow_prio *prio; @@ -1003,7 +999,10 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, if (prio->flow_table) return 0; - prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0); + ft_attr.prio = priority; + ft_attr.max_fte = MLX5_FS_MAX_POOL_SIZE; + ft_attr.autogroup.max_num_groups = 1; + prio = _get_prio(ns, prio, &ft_attr); if (IS_ERR(prio)) return PTR_ERR(prio); @@ -1223,6 +1222,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_op_fc *opfc, enum
mlx5_ib_optional_counter_type type) { + struct mlx5_flow_table_attr ft_attr = {}; enum mlx5_flow_namespace_type fn_type; int priority, i, err, spec_num; struct mlx5_flow_act flow_act = {}; @@ -1304,8 +1304,10 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, if (err) goto free; - prio = _get_prio(dev, ns, prio, priority, - dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); + ft_attr.prio = priority; + ft_attr.max_fte = dev->num_ports * MAX_OPFC_RULES; + ft_attr.autogroup.max_num_groups = 1; + prio = _get_prio(ns, prio, &ft_attr); if (IS_ERR(prio)) { err = PTR_ERR(prio); goto put_prio; @@ -1903,6 +1905,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, bool mcast, u32 ib_port) { struct mlx5_core_dev *ft_mdev = dev->mdev; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; int max_table_size = 0; @@ -2026,8 +2029,12 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, if (prio->flow_table) return prio; - return _get_prio(dev, ns, prio, priority, max_table_size, - MLX5_FS_MAX_TYPES, flags, vport); + ft_attr.prio = priority; + ft_attr.max_fte = max_table_size; + ft_attr.flags = flags; + ft_attr.vport = vport; + ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES; + return _get_prio(ns, prio, &ft_attr); } static struct mlx5_ib_flow_handler * From 6e79e210058e8f95fb3824e33b781960851ae7d1 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:59 +0200 Subject: [PATCH 22/65] RDMA/mlx5: Add other eswitch support to userspace tables Allow the creation of RDMA TRANSPORT tables over VFs/SFs that belong to another eswitch manager. This is only possible for PFs that were connected via a create_lag PRM command. Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-7-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/fs.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index c8a25370aa79..d17823ce7f38 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -1874,7 +1874,7 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, u32 *flags, u16 *vport_idx, u16 *vport, struct mlx5_core_dev **ft_mdev, - u32 ib_port) + u32 ib_port, u16 *esw_owner_vhca_id) { struct mlx5_core_dev *esw_mdev; @@ -1888,8 +1888,13 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, return -EINVAL; esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw); - if (esw_mdev != dev->mdev) - return -EOPNOTSUPP; + if (esw_mdev != dev->mdev) { + if (!MLX5_CAP_ADV_RDMA(dev->mdev, + rdma_transport_manager_other_eswitch)) + return -EOPNOTSUPP; + *flags |= MLX5_FLOW_TABLE_OTHER_ESWITCH; + *esw_owner_vhca_id = MLX5_CAP_GEN(esw_mdev, vhca_id); + } *flags |= MLX5_FLOW_TABLE_OTHER_VPORT; *ft_mdev = esw_mdev; @@ -1908,6 +1913,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; + u16 esw_owner_vhca_id = 0; int max_table_size = 0; u16 vport_idx = 0; bool esw_encap; @@ -1969,7 +1975,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, return ERR_PTR(-EINVAL); ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags, &vport_idx, &vport, - &ft_mdev, ib_port); + &ft_mdev, ib_port, + &esw_owner_vhca_id); if
(ret) return ERR_PTR(ret); @@ -2033,6 +2040,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, ft_attr.max_fte = max_table_size; ft_attr.flags = flags; ft_attr.vport = vport; + ft_attr.esw_owner_vhca_id = esw_owner_vhca_id; ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES; return _get_prio(ns, prio, &ft_attr); } From 5dd68a59145e6a7dd96e07ffbf5bdc486e6cc8a2 Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Wed, 5 Nov 2025 16:28:41 +0000 Subject: [PATCH 23/65] RDMA/irdma: Remove unused CQ registry The CQ registry was never actually used (ceq->reg_cq was always NULL), so remove the dead code. Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20251105162841.31786-1-jmoroni@google.com Acked-by: Tatyana Nikolova Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/ctrl.c | 101 +---------------------------- drivers/infiniband/hw/irdma/puda.c | 19 +----- drivers/infiniband/hw/irdma/type.h | 5 -- 3 files changed, 3 insertions(+), 122 deletions(-) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 4ef1c29032f7..57bf6815e71e 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -2943,8 +2943,6 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, __le64 *wqe; struct irdma_sc_cqp *cqp; u64 hdr; - struct irdma_sc_ceq *ceq; - int ret_code = 0; cqp = cq->dev->cqp; if (cq->cq_uk.cq_id >= cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt) @@ -2953,19 +2951,9 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, if (cq->ceq_id >= cq->dev->hmc_fpm_misc.max_ceqs) return -EINVAL; - ceq = cq->dev->ceq[cq->ceq_id]; - if (ceq && ceq->reg_cq) - ret_code = irdma_sc_add_cq_ctx(ceq, cq); - - if (ret_code) - return ret_code; - wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); - if (!wqe) { - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, cq); + if (!wqe) return -ENOMEM; - } set_64bit_val(wqe, 0, cq->cq_uk.cq_size); set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); @@ -3018,17 +3006,12 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq) struct irdma_sc_cqp *cqp; __le64 *wqe; u64 hdr; - struct irdma_sc_ceq *ceq; cqp = cq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) return -ENOMEM; - ceq = cq->dev->ceq[cq->ceq_id]; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, cq); - set_64bit_val(wqe, 0, cq->cq_uk.cq_size); set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); set_64bit_val(wqe, 40, cq->shadow_area_pa); @@ -3601,71 +3584,6 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, return 0; } -/** - * irdma_sc_find_reg_cq - find cq ctx index - * @ceq: ceq sc structure - * @cq: cq sc structure - */ -static u32 irdma_sc_find_reg_cq(struct irdma_sc_ceq *ceq, - struct irdma_sc_cq *cq) -{ - u32 i; - - for (i = 0; i < ceq->reg_cq_size; i++) { - if (cq == ceq->reg_cq[i]) - return i; - } - - return IRDMA_INVALID_CQ_IDX; -} - -/** - * irdma_sc_add_cq_ctx - add cq ctx tracking for ceq - * @ceq: ceq sc structure - * @cq: cq sc structure - */ -int irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq) -{ - unsigned long flags; - - spin_lock_irqsave(&ceq->req_cq_lock, flags); - - if (ceq->reg_cq_size == ceq->elem_cnt) { - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - return -ENOMEM; - } - - ceq->reg_cq[ceq->reg_cq_size++] = cq; - - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - - return 0; -} - -/** - * irdma_sc_remove_cq_ctx - remove cq ctx tracking for ceq - * @ceq: ceq sc 
structure - * @cq: cq sc structure - */ -void irdma_sc_remove_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq) -{ - unsigned long flags; - u32 cq_ctx_idx; - - spin_lock_irqsave(&ceq->req_cq_lock, flags); - cq_ctx_idx = irdma_sc_find_reg_cq(ceq, cq); - if (cq_ctx_idx == IRDMA_INVALID_CQ_IDX) - goto exit; - - ceq->reg_cq_size--; - if (cq_ctx_idx != ceq->reg_cq_size) - ceq->reg_cq[cq_ctx_idx] = ceq->reg_cq[ceq->reg_cq_size]; - ceq->reg_cq[ceq->reg_cq_size] = NULL; - -exit: - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); -} - /** * irdma_sc_cqp_init - Initialize buffers for a control Queue Pair * @cqp: IWARP control queue pair pointer @@ -4387,9 +4305,6 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, ceq->ceq_elem_pa = info->ceqe_pa; ceq->virtual_map = info->virtual_map; ceq->itr_no_expire = info->itr_no_expire; - ceq->reg_cq = info->reg_cq; - ceq->reg_cq_size = 0; - spin_lock_init(&ceq->req_cq_lock); ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0); ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0); ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL); @@ -4472,9 +4387,6 @@ int irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq) { struct irdma_sc_cqp *cqp; - if (ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, ceq->dev->ccq); - cqp = ceq->dev->cqp; cqp->process_cqp_sds = irdma_update_sds_noccq; @@ -4493,11 +4405,6 @@ int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch) struct irdma_sc_dev *dev = ceq->dev; dev->ccq->vsi_idx = ceq->vsi_idx; - if (ceq->reg_cq) { - ret_code = irdma_sc_add_cq_ctx(ceq, ceq->dev->ccq); - if (ret_code) - return ret_code; - } ret_code = irdma_sc_ceq_create(ceq, scratch, true); if (!ret_code) @@ -4562,7 +4469,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) struct irdma_sc_cq *temp_cq; u8 polarity; u32 cq_idx; - unsigned long flags; do { cq_idx = 0; @@ -4583,11 +4489,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) } cq = temp_cq; - if (ceq->reg_cq) { - spin_lock_irqsave(&ceq->req_cq_lock, flags); - cq_idx = irdma_sc_find_reg_cq(ceq, cq); - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - } IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 694e5a9ed15d..3c29e2289322 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -726,7 +726,6 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) struct irdma_sc_cqp *cqp; u64 hdr; struct irdma_ccq_cqe_info compl_info; - int status = 0; cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0); @@ -756,16 +755,8 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) print_hex_dump_debug("PUDA: PUDA CREATE CQ", DUMP_PREFIX_OFFSET, 16, 8, wqe, IRDMA_CQP_WQE_SIZE * 8, false); irdma_sc_cqp_post_sq(dev->cqp); - status = irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ, - &compl_info); - if (!status) { - struct irdma_sc_ceq *ceq = dev->ceq[0]; - - if (ceq && ceq->reg_cq) - status = irdma_sc_add_cq_ctx(ceq, cq); - } - - return status; + return irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ, + &compl_info); } /** @@ -897,23 +888,17 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type, struct irdma_puda_buf *buf = NULL; struct irdma_puda_buf *nextbuf = NULL; struct irdma_virt_mem *vmem; - struct irdma_sc_ceq *ceq; - ceq = 
vsi->dev->ceq[0]; switch (type) { case IRDMA_PUDA_RSRC_TYPE_ILQ: rsrc = vsi->ilq; vmem = &vsi->ilq_mem; vsi->ilq = NULL; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, &rsrc->cq); break; case IRDMA_PUDA_RSRC_TYPE_IEQ: rsrc = vsi->ieq; vmem = &vsi->ieq_mem; vsi->ieq = NULL; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, &rsrc->cq); break; default: ibdev_dbg(to_ibdev(dev), "PUDA: error resource type = 0x%x\n", diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 4ae77cdde9dc..aa09bb0bca33 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -492,9 +492,6 @@ struct irdma_sc_ceq { u32 first_pm_pbl_idx; u8 polarity; u16 vsi_idx; - struct irdma_sc_cq **reg_cq; - u32 reg_cq_size; - spinlock_t req_cq_lock; /* protect access to reg_cq array */ bool virtual_map:1; bool tph_en:1; bool itr_no_expire:1; @@ -894,8 +891,6 @@ struct irdma_ceq_init_info { u8 tph_val; u16 vsi_idx; u32 first_pm_pbl_idx; - struct irdma_sc_cq **reg_cq; - u32 reg_cq_idx; }; struct irdma_aeq_init_info { From 65d21dee533755ae8fff7450328c93599c36b092 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 7 Nov 2025 14:33:06 +0100 Subject: [PATCH 24/65] IB/iser: add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") This change adds a new WQ_PERCPU flag to explicitly request alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default.
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://patch.msgid.link/20251107133306.187939-1-marco.crivellari@suse.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iscsi_iser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 2e3c0516ce8f..dc531fad73de 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -1029,7 +1029,7 @@ static int __init iser_init(void) mutex_init(&ig.connlist_mutex); INIT_LIST_HEAD(&ig.connlist); - release_wq = alloc_workqueue("release workqueue", 0, 0); + release_wq = alloc_workqueue("release workqueue", WQ_PERCPU, 0); if (!release_wq) { iser_err("failed to allocate release workqueue\n"); err = -ENOMEM; From 5c467151f6197dd7b8c36b33310b288a52bcca3d Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 7 Nov 2025 14:36:26 +0100 Subject: [PATCH 25/65] IB/isert: add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") This change adds a new WQ_PERCPU flag to explicitly request alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default.
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://patch.msgid.link/20251107133626.190952-1-marco.crivellari@suse.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/isert/ib_isert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 42977a5326ee..af811d060cc8 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2613,7 +2613,7 @@ static struct iscsit_transport iser_target_transport = { static int __init isert_init(void) { - isert_login_wq = alloc_workqueue("isert_login_wq", 0, 0); + isert_login_wq = alloc_workqueue("isert_login_wq", WQ_PERCPU, 0); if (!isert_login_wq) { isert_err("Unable to allocate isert_login_wq\n"); return -ENOMEM; From a338d6e849ab31f32c08b4fcac11c0c72afbb150 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Mon, 10 Nov 2025 08:51:58 +0800 Subject: [PATCH 26/65] RDMA/rtrs: server: Fix error handling in get_or_create_srv After device_initialize() is called, use put_device() to release the device according to kernel device management rules. While a direct kfree() works in this case, using put_device() is more correct. Found by code review. Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Signed-off-by: Ma Ke Link: https://patch.msgid.link/20251110005158.13394-1-make24@iscas.ac.cn Acked-by: Jack Wang Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index ef4abdea3c2d..9ecc6343455d 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1450,7 +1450,7 @@ static struct rtrs_srv_sess *get_or_create_srv(struct rtrs_srv_ctx *ctx, kfree(srv->chunks); err_free_srv: - kfree(srv); + put_device(&srv->dev); return ERR_PTR(-ENOMEM); } From 4e5cba5bb6f37ceaba6a2628a171cbede02f969c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 11 Nov 2025 22:29:08 -0800 Subject: [PATCH 27/65] RDMA/cm: Correct typedef and bad line warnings In include/rdma/ib_cm.h: Correct a typedef's kernel-doc notation by adding the 'typedef' keyword to it to avoid a warning. Add a leading " *" to a kernel-doc line to avoid a warning. Warning: ib_cm.h:289 function parameter 'ib_cm_handler' not described in 'int' Warning: ib_cm.h:289 expecting prototype for ib_cm_handler(). Prototype was for int() instead Warning: ib_cm.h:484 bad line: connection message in case duplicates are received. Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251112062908.2711007-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_cm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index 1fa3786f82f4..4808a355de41 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -271,7 +271,7 @@ struct ib_cm_event { #define CM_APR_ATTR_ID cpu_to_be16(0x001A) /** - * ib_cm_handler - User-defined callback to process communication events. + * typedef ib_cm_handler - User-defined callback to process communication events. * @cm_id: Communication identifier associated with the reported event. * @event: Information about the communication event. * @@ -482,7 +482,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, /** * ib_prepare_cm_mra - Prepares to send a message receipt acknowledgment to a - connection message in case duplicates are received.
+ * connection message in case duplicates are received. * @cm_id: Connection identifier associated with the connection message. */ int ib_prepare_cm_mra(struct ib_cm_id *cm_id); From a49a9f4555bde0b008f2225620ae1b2480d2b627 Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Wed, 12 Nov 2025 20:02:53 +0800 Subject: [PATCH 28/65] RDMA/irdma: Remove redundant NULL check of udata in irdma_create_user_ah() The variable udata cannot be NULL because irdma_create_user_ah() always receives it. Therefore, the if() check can be safely removed. Signed-off-by: Tuo Li Link: https://patch.msgid.link/20251112120253.68945-1-islituo@gmail.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 66ecaa0f5be6..5fb6014ab8cb 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -5210,7 +5210,7 @@ static int irdma_create_user_ah(struct ib_ah *ibah, struct irdma_ah *parent_ah; int err; - if (udata && udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN) + if (udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN) return -EINVAL; err = irdma_setup_ah(ibah, attr); From d43358cda7c4696e08880aaa58a7df82e471fa7c Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 13 Nov 2025 16:24:57 +0530 Subject: [PATCH 29/65] RDMA/restrack: Fix typos in the comments Fix a couple of occurrences of the misspelled word "reource" in the comments with the correct spelling "resource". Signed-off-by: Kalesh AP Link: https://patch.msgid.link/20251113105457.879903-1-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/restrack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index a7de6f403fca..b097cfcade1c 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -175,7 +175,7 @@ void rdma_restrack_new(struct rdma_restrack_entry *res, EXPORT_SYMBOL(rdma_restrack_new); /** - * rdma_restrack_add() - add object to the reource tracking database + * rdma_restrack_add() - add object to the resource tracking database * @res: resource entry */ void rdma_restrack_add(struct rdma_restrack_entry *res) @@ -277,7 +277,7 @@ int rdma_restrack_put(struct rdma_restrack_entry *res) EXPORT_SYMBOL(rdma_restrack_put); /** - * rdma_restrack_del() - delete object from the reource tracking database + * rdma_restrack_del() - delete object from the resource tracking database * @res: resource entry */ void rdma_restrack_del(struct rdma_restrack_entry *res) From d056bc45b62b5981ebcd18c4303a915490b8ebe9 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 13 Nov 2025 17:53:17 +0800 Subject: [PATCH 30/65] RDMA/core: Prevent soft lockup during large user memory region cleanup When a process exits with numerous large, pinned memory regions consisting of 4KB pages, the cleanup of the memory region through __ib_umem_release() may cause soft lockups. This is because unpin_user_page_range_dirty_lock() is called in a tight loop to unpin and release pages without yielding the CPU. watchdog: BUG: soft lockup - CPU#44 stuck for 26s! [python3:73464] Kernel panic - not syncing: softlockup: hung tasks CPU: 44 PID: 73464 Comm: python3 Tainted: G OEL asm_sysvec_apic_timer_interrupt+0x1b/0x20 RIP: 0010:free_unref_page+0xff/0x190 ?
free_unref_page+0xe3/0x190 __put_page+0x77/0xe0 put_compound_head+0xed/0x100 unpin_user_page_range_dirty_lock+0xb2/0x180 __ib_umem_release+0x57/0xb0 [ib_core] ib_umem_release+0x3f/0xd0 [ib_core] mlx5_ib_dereg_mr+0x2e9/0x440 [mlx5_ib] ib_dereg_mr_user+0x43/0xb0 [ib_core] uverbs_free_mr+0x15/0x20 [ib_uverbs] destroy_hw_idr_uobject+0x21/0x60 [ib_uverbs] uverbs_destroy_uobject+0x38/0x1b0 [ib_uverbs] __uverbs_cleanup_ufile+0xd1/0x150 [ib_uverbs] uverbs_destroy_ufile_hw+0x3f/0x100 [ib_uverbs] ib_uverbs_close+0x1f/0xb0 [ib_uverbs] __fput+0x9c/0x280 ____fput+0xe/0x20 task_work_run+0x6a/0xb0 do_exit+0x217/0x3c0 do_group_exit+0x3b/0xb0 get_signal+0x150/0x900 arch_do_signal_or_restart+0xde/0x100 exit_to_user_mode_loop+0xc4/0x160 exit_to_user_mode_prepare+0xa0/0xb0 syscall_exit_to_user_mode+0x27/0x50 do_syscall_64+0x63/0xb0 Fix the soft lockup by incorporating cond_resched() calls within __ib_umem_release(). Since SG entries are typically grouped in 2MB chunks on x86_64, adding cond_resched() should have minimal performance impact. Signed-off-by: Li RongQing Link: https://patch.msgid.link/20251113095317.2628-1-lirongqing@baidu.com Acked-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c5b686394760..8fd84aa37289 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -55,9 +55,11 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt, DMA_BIDIRECTIONAL, 0); - for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) + for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) { unpin_user_page_range_dirty_lock(sg_page(sg), DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); + cond_resched(); + } sg_free_append_table(&umem->sgt_append); } From fecaa0c74f66643d5bd55422b426aefeaaaa25f2 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 17 Nov 2025 11:43:06 +0530 Subject: [PATCH 31/65] RDMA/bnxt_re: Fix wrong check for CQ coalescing support The driver is not creating the debugfs hooks for the CQ coalescing parameters because of a wrong check. Fix the condition check inside bnxt_re_init_cq_coal_debugfs(). Fixes: cf2749079011 ("RDMA/bnxt_re: Add a debugfs entry for CQE coalescing tuning") Signed-off-by: Kalesh AP Link: https://patch.msgid.link/20251117061306.1140588-1-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index d03f5bb0d890..88817c86ae24 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -454,7 +454,7 @@ static void bnxt_re_init_cq_coal_debugfs(struct bnxt_re_dev *rdev) struct bnxt_re_dbg_cq_coal_params *dbg_cq_coal_params; int i; - if (_is_cq_coalescing_supported(rdev->dev_attr->dev_cap_flags2)) + if (!_is_cq_coalescing_supported(rdev->dev_attr->dev_cap_flags2)) return; dbg_cq_coal_params = kzalloc(sizeof(*dbg_cq_coal_params), GFP_KERNEL); From 8ac050ec3b1c0dcb5e89cf86fe2ebe0afcc73554 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 17 Nov 2025 17:11:19 +0000 Subject: [PATCH 32/65] bng_en: Add RoCE aux device support Add an auxiliary (aux) device to support RoCE.
The base driver is responsible for creating the auxiliary device and allocating the required resources to it, which will be owned by the bnge RoCE driver in future patches. Signed-off-by: Vikas Gupta Link: https://patch.msgid.link/20251117171136.128193-2-siva.kallam@broadcom.com Reviewed-by: Siva Reddy Kallam Acked-by: Jakub Kicinski Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/broadcom/bnge/Makefile | 3 +- drivers/net/ethernet/broadcom/bnge/bnge.h | 10 + .../net/ethernet/broadcom/bnge/bnge_auxr.c | 258 ++++++++++++++++++ .../net/ethernet/broadcom/bnge/bnge_auxr.h | 84 ++++++ .../net/ethernet/broadcom/bnge/bnge_core.c | 18 +- .../net/ethernet/broadcom/bnge/bnge_hwrm.c | 40 +++ .../net/ethernet/broadcom/bnge/bnge_hwrm.h | 2 + .../net/ethernet/broadcom/bnge/bnge_resc.c | 12 + .../net/ethernet/broadcom/bnge/bnge_resc.h | 1 + 9 files changed, 426 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/broadcom/bnge/bnge_auxr.c create mode 100644 drivers/net/ethernet/broadcom/bnge/bnge_auxr.h diff --git a/drivers/net/ethernet/broadcom/bnge/Makefile b/drivers/net/ethernet/broadcom/bnge/Makefile index 6142d9c57f49..ea6596854e5c 100644 --- a/drivers/net/ethernet/broadcom/bnge/Makefile +++ b/drivers/net/ethernet/broadcom/bnge/Makefile @@ -9,4 +9,5 @@ bng_en-y := bnge_core.o \ bnge_rmem.o \ bnge_resc.o \ bnge_netdev.o \ - bnge_ethtool.o + bnge_ethtool.o \ + bnge_auxr.o diff --git a/drivers/net/ethernet/broadcom/bnge/bnge.h b/drivers/net/ethernet/broadcom/bnge/bnge.h index 7aed5f81cd51..411744894349 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge.h +++ b/drivers/net/ethernet/broadcom/bnge/bnge.h @@ -11,6 +11,7 @@ #include #include "bnge_rmem.h" #include "bnge_resc.h" +#include "bnge_auxr.h" #define DRV_VER_MAJ 1 #define DRV_VER_MIN 15 @@ -22,6 +23,12 @@ enum board_idx { BCM57708, }; +struct bnge_auxr_priv { + struct auxiliary_device aux_dev; + struct bnge_auxr_dev *auxr_dev; + int id; +}; + struct bnge_pf_info { u16 fw_fid; u16 port_id; @@ -197,6 +204,9 @@ struct bnge_dev { struct bnge_irq *irq_tbl; u16 irqs_acquired; + + struct bnge_auxr_priv *aux_priv; + struct bnge_auxr_dev *auxr_dev; }; static inline bool bnge_is_roce_en(struct bnge_dev *bd) diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c new file mode 100644 index 000000000000..d64592b64e17 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. 
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bnge.h" +#include "bnge_hwrm.h" +#include "bnge_auxr.h" + +static DEFINE_IDA(bnge_aux_dev_ids); + +static void bnge_fill_msix_vecs(struct bnge_dev *bd, + struct bnge_msix_info *info) +{ + struct bnge_auxr_dev *auxr_dev = bd->auxr_dev; + int num_msix, i; + + if (!auxr_dev->auxr_info->msix_requested) { + dev_warn(bd->dev, "Requested MSI-X vectors not allocated\n"); + return; + } + num_msix = auxr_dev->auxr_info->msix_requested; + for (i = 0; i < num_msix; i++) { + info[i].vector = bd->irq_tbl[i].vector; + info[i].db_offset = bd->db_offset; + info[i].ring_idx = i; + } +} + +int bnge_register_dev(struct bnge_auxr_dev *auxr_dev, + void *handle) +{ + struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev); + struct bnge_auxr_info *auxr_info; + int rc = 0; + + netdev_lock(bd->netdev); + mutex_lock(&auxr_dev->auxr_dev_lock); + if (!bd->irq_tbl) { + rc = -ENODEV; + goto exit; + } + + if (!bnge_aux_has_enough_resources(bd)) { + rc = -ENOMEM; + goto exit; + } + + auxr_info = auxr_dev->auxr_info; + auxr_info->handle = handle; + + auxr_info->msix_requested = bd->aux_num_msix; + + bnge_fill_msix_vecs(bd, bd->auxr_dev->msix_info); + auxr_dev->flags |= BNGE_ARDEV_MSIX_ALLOC; + +exit: + mutex_unlock(&auxr_dev->auxr_dev_lock); + netdev_unlock(bd->netdev); + return rc; +} +EXPORT_SYMBOL(bnge_register_dev); + +void bnge_unregister_dev(struct bnge_auxr_dev *auxr_dev) +{ + struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev); + struct bnge_auxr_info *auxr_info; + + auxr_info = auxr_dev->auxr_info; + netdev_lock(bd->netdev); + mutex_lock(&auxr_dev->auxr_dev_lock); + if (auxr_info->msix_requested) + auxr_dev->flags &= ~BNGE_ARDEV_MSIX_ALLOC; + auxr_info->msix_requested = 0; + + mutex_unlock(&auxr_dev->auxr_dev_lock); + netdev_unlock(bd->netdev); +} +EXPORT_SYMBOL(bnge_unregister_dev); + +int bnge_send_msg(struct bnge_auxr_dev *auxr_dev, struct bnge_fw_msg *fw_msg) +{ + struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev); + struct output *resp; + struct input *req; + u32 resp_len; + int rc; + + rc = bnge_hwrm_req_init(bd, req, 0 /* don't care */); + if (rc) + return rc; + + rc = bnge_hwrm_req_replace(bd, req, fw_msg->msg, fw_msg->msg_len); + if (rc) + goto drop_req; + + bnge_hwrm_req_timeout(bd, req, fw_msg->timeout); + resp = bnge_hwrm_req_hold(bd, req); + rc = bnge_hwrm_req_send(bd, req); + resp_len = le16_to_cpu(resp->resp_len); + if (resp_len) { + if (fw_msg->resp_max_len < resp_len) + resp_len = fw_msg->resp_max_len; + + memcpy(fw_msg->resp, resp, resp_len); + } +drop_req: + bnge_hwrm_req_drop(bd, req); + return rc; +} +EXPORT_SYMBOL(bnge_send_msg); + +void bnge_rdma_aux_device_uninit(struct bnge_dev *bd) +{ + struct bnge_auxr_priv *aux_priv; + struct auxiliary_device *adev; + + /* Skip if no auxiliary device init was done. 
*/ + if (!bd->aux_priv) + return; + + aux_priv = bd->aux_priv; + adev = &aux_priv->aux_dev; + auxiliary_device_uninit(adev); +} + +static void bnge_aux_dev_release(struct device *dev) +{ + struct bnge_auxr_priv *aux_priv = + container_of(dev, struct bnge_auxr_priv, aux_dev.dev); + struct bnge_dev *bd = pci_get_drvdata(aux_priv->auxr_dev->pdev); + + ida_free(&bnge_aux_dev_ids, aux_priv->id); + kfree(aux_priv->auxr_dev->auxr_info); + bd->auxr_dev = NULL; + kfree(aux_priv->auxr_dev); + kfree(aux_priv); + bd->aux_priv = NULL; +} + +void bnge_rdma_aux_device_del(struct bnge_dev *bd) +{ + if (!bd->auxr_dev) + return; + + auxiliary_device_delete(&bd->aux_priv->aux_dev); +} + +static void bnge_set_auxr_dev_info(struct bnge_auxr_dev *auxr_dev, + struct bnge_dev *bd) +{ + auxr_dev->pdev = bd->pdev; + auxr_dev->l2_db_size = bd->db_size; + auxr_dev->l2_db_size_nc = bd->db_size; + auxr_dev->l2_db_offset = bd->db_offset; + mutex_init(&auxr_dev->auxr_dev_lock); + + if (bd->flags & BNGE_EN_ROCE_V1) + auxr_dev->flags |= BNGE_ARDEV_ROCEV1_SUPP; + if (bd->flags & BNGE_EN_ROCE_V2) + auxr_dev->flags |= BNGE_ARDEV_ROCEV2_SUPP; + + auxr_dev->chip_num = bd->chip_num; + auxr_dev->hw_ring_stats_size = bd->hw_ring_stats_size; + auxr_dev->pf_port_id = bd->pf.port_id; + auxr_dev->en_state = bd->state; + auxr_dev->bar0 = bd->bar0; +} + +void bnge_rdma_aux_device_add(struct bnge_dev *bd) +{ + struct auxiliary_device *aux_dev; + int rc; + + if (!bd->auxr_dev) + return; + + aux_dev = &bd->aux_priv->aux_dev; + rc = auxiliary_device_add(aux_dev); + if (rc) { + dev_warn(bd->dev, "Failed to add auxiliary device for ROCE\n"); + auxiliary_device_uninit(aux_dev); + bd->flags &= ~BNGE_EN_ROCE; + } + + bd->auxr_dev->net = bd->netdev; +} + +void bnge_rdma_aux_device_init(struct bnge_dev *bd) +{ + struct auxiliary_device *aux_dev; + struct bnge_auxr_info *auxr_info; + struct bnge_auxr_priv *aux_priv; + struct bnge_auxr_dev *auxr_dev; + int rc; + + if (!bnge_is_roce_en(bd)) + return; + + aux_priv = kzalloc(sizeof(*aux_priv), GFP_KERNEL); + if (!aux_priv) + goto exit; + + aux_priv->id = ida_alloc(&bnge_aux_dev_ids, GFP_KERNEL); + if (aux_priv->id < 0) { + dev_warn(bd->dev, "ida alloc failed for aux device\n"); + kfree(aux_priv); + goto exit; + } + + aux_dev = &aux_priv->aux_dev; + aux_dev->id = aux_priv->id; + aux_dev->name = "rdma"; + aux_dev->dev.parent = &bd->pdev->dev; + aux_dev->dev.release = bnge_aux_dev_release; + + rc = auxiliary_device_init(aux_dev); + if (rc) { + ida_free(&bnge_aux_dev_ids, aux_priv->id); + kfree(aux_priv); + goto exit; + } + bd->aux_priv = aux_priv; + + auxr_dev = kzalloc(sizeof(*auxr_dev), GFP_KERNEL); + if (!auxr_dev) + goto aux_dev_uninit; + + aux_priv->auxr_dev = auxr_dev; + + auxr_info = kzalloc(sizeof(*auxr_info), GFP_KERNEL); + if (!auxr_info) + goto aux_dev_uninit; + + auxr_dev->auxr_info = auxr_info; + bd->auxr_dev = auxr_dev; + bnge_set_auxr_dev_info(auxr_dev, bd); + + return; + +aux_dev_uninit: + auxiliary_device_uninit(aux_dev); +exit: + bd->flags &= ~BNGE_EN_ROCE; +} diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h new file mode 100644 index 000000000000..6c5c15ef2b0a --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Broadcom */ + +#ifndef _BNGE_AUXR_H_ +#define _BNGE_AUXR_H_ + +#include + +#define BNGE_MIN_ROCE_CP_RINGS 2 +#define BNGE_MIN_ROCE_STAT_CTXS 1 + +#define BNGE_MAX_ROCE_MSIX 64 + +struct hwrm_async_event_cmpl; +struct bnge; + 
+struct bnge_msix_info { + u32 vector; + u32 ring_idx; + u32 db_offset; +}; + +struct bnge_fw_msg { + void *msg; + int msg_len; + void *resp; + int resp_max_len; + int timeout; +}; + +struct bnge_auxr_info { + void *handle; + u16 msix_requested; +}; + +enum { + BNGE_ARDEV_ROCEV1_SUPP = BIT(0), + BNGE_ARDEV_ROCEV2_SUPP = BIT(1), + BNGE_ARDEV_MSIX_ALLOC = BIT(2), +}; + +#define BNGE_ARDEV_ROCE_SUPP (BNGE_ARDEV_ROCEV1_SUPP | \ + BNGE_ARDEV_ROCEV2_SUPP) + +struct bnge_auxr_dev { + struct net_device *net; + struct pci_dev *pdev; + void __iomem *bar0; + + struct bnge_msix_info msix_info[BNGE_MAX_ROCE_MSIX]; + + u32 flags; + + struct bnge_auxr_info *auxr_info; + + /* Doorbell BAR size in bytes mapped by L2 driver. */ + int l2_db_size; + /* Doorbell BAR size in bytes mapped as non-cacheable. */ + int l2_db_size_nc; + /* Doorbell offset in bytes within l2_db_size_nc. */ + int l2_db_offset; + + u16 chip_num; + u16 hw_ring_stats_size; + u16 pf_port_id; + unsigned long en_state; + + u16 auxr_num_msix_vec; + u16 auxr_num_ctxs; + + /* serialize auxr operations */ + struct mutex auxr_dev_lock; +}; + +void bnge_rdma_aux_device_uninit(struct bnge_dev *bdev); +void bnge_rdma_aux_device_del(struct bnge_dev *bdev); +void bnge_rdma_aux_device_add(struct bnge_dev *bdev); +void bnge_rdma_aux_device_init(struct bnge_dev *bdev); +int bnge_register_dev(struct bnge_auxr_dev *adev, + void *handle); +void bnge_unregister_dev(struct bnge_auxr_dev *adev); +int bnge_send_msg(struct bnge_auxr_dev *adev, struct bnge_fw_msg *fw_msg); + +#endif /* _BNGE_AUXR_H_ */ diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_core.c b/drivers/net/ethernet/broadcom/bnge/bnge_core.c index 2c72dd34d50d..c94e132bebc8 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_core.c +++ b/drivers/net/ethernet/broadcom/bnge/bnge_core.c @@ -41,6 +41,11 @@ static void bnge_print_device_info(struct pci_dev *pdev, enum board_idx idx) bool bnge_aux_registered(struct bnge_dev *bd) { + struct bnge_auxr_dev *ba_dev = bd->auxr_dev; + + if (ba_dev && ba_dev->auxr_info->msix_requested) + return true; + return false; } @@ -312,16 +317,20 @@ static int bnge_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent) spin_lock_init(&bd->db_lock); #endif + bnge_rdma_aux_device_init(bd); + rc = bnge_alloc_irqs(bd); if (rc) { dev_err(&pdev->dev, "Error IRQ allocation rc = %d\n", rc); - goto err_config_uninit; + goto err_uninit_auxr; } rc = bnge_netdev_alloc(bd, max_irqs); if (rc) goto err_free_irq; + bnge_rdma_aux_device_add(bd); + pci_save_state(pdev); return 0; @@ -329,6 +338,9 @@ static int bnge_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent) err_free_irq: bnge_free_irqs(bd); +err_uninit_auxr: + bnge_rdma_aux_device_uninit(bd); + err_config_uninit: bnge_net_uninit_dflt_config(bd); @@ -354,10 +366,14 @@ static void bnge_remove_one(struct pci_dev *pdev) { struct bnge_dev *bd = pci_get_drvdata(pdev); + bnge_rdma_aux_device_del(bd); + bnge_netdev_free(bd); bnge_free_irqs(bd); + bnge_rdma_aux_device_uninit(bd); + bnge_net_uninit_dflt_config(bd); bnge_devlink_unregister(bd); diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c index 0f971af24142..c3087e5cd875 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c +++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c @@ -98,6 +98,46 @@ void bnge_hwrm_req_alloc_flags(struct bnge_dev *bd, void *req, gfp_t gfp) ctx->gfp = gfp; } +int bnge_hwrm_req_replace(struct bnge_dev *bd, void *req, void *new_req, + u32 len) +{ + struct bnge_hwrm_ctx 
*ctx = __hwrm_ctx_get(bd, req); + struct input *internal_req = req; + u16 req_type; + + if (!ctx) + return -EINVAL; + + if (len > BNGE_HWRM_CTX_OFFSET) + return -E2BIG; + + /* free any existing slices */ + ctx->allocated = BNGE_HWRM_DMA_SIZE - BNGE_HWRM_CTX_OFFSET; + if (ctx->slice_addr) { + dma_free_coherent(bd->dev, ctx->slice_size, + ctx->slice_addr, ctx->slice_handle); + ctx->slice_addr = NULL; + } + ctx->gfp = GFP_KERNEL; + + if ((bd->fw_cap & BNGE_FW_CAP_SHORT_CMD) || len > BNGE_HWRM_MAX_REQ_LEN) { + memcpy(internal_req, new_req, len); + } else { + internal_req->req_type = ((struct input *)new_req)->req_type; + ctx->req = new_req; + } + + ctx->req_len = len; + ctx->req->resp_addr = cpu_to_le64(ctx->dma_handle + + BNGE_HWRM_RESP_OFFSET); + + /* update sentinel for potentially new request type */ + req_type = le16_to_cpu(internal_req->req_type); + ctx->sentinel = bnge_cal_sentinel(ctx, req_type); + + return 0; +} + void bnge_hwrm_req_flags(struct bnge_dev *bd, void *req, enum bnge_hwrm_ctx_flags flags) { diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h index 83794a12cc81..6df629761d95 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h +++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h @@ -107,4 +107,6 @@ int bnge_hwrm_req_send_silent(struct bnge_dev *bd, void *req); void bnge_hwrm_req_alloc_flags(struct bnge_dev *bd, void *req, gfp_t flags); void *bnge_hwrm_req_dma_slice(struct bnge_dev *bd, void *req, u32 size, dma_addr_t *dma); +int bnge_hwrm_req_replace(struct bnge_dev *bd, void *req, void *new_req, + u32 len); #endif /* _BNGE_HWRM_H_ */ diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_resc.c b/drivers/net/ethernet/broadcom/bnge/bnge_resc.c index 62ebe03a0dcf..943df5f60f01 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_resc.c +++ b/drivers/net/ethernet/broadcom/bnge/bnge_resc.c @@ -34,6 +34,18 @@ static unsigned int bnge_get_max_func_stat_ctxs(struct bnge_dev *bd) return bd->hw_resc.max_stat_ctxs; } +bool bnge_aux_has_enough_resources(struct bnge_dev *bd) +{ + unsigned int max_stat_ctxs; + + max_stat_ctxs = bnge_get_max_func_stat_ctxs(bd); + if (max_stat_ctxs <= BNGE_MIN_ROCE_STAT_CTXS || + bd->nq_nr_rings == max_stat_ctxs) + return false; + + return true; +} + static unsigned int bnge_get_max_func_cp_rings(struct bnge_dev *bd) { return bd->hw_resc.max_cp_rings; diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_resc.h b/drivers/net/ethernet/broadcom/bnge/bnge_resc.h index 0d6213b27580..b62a634669f6 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_resc.h +++ b/drivers/net/ethernet/broadcom/bnge/bnge_resc.h @@ -74,6 +74,7 @@ void bnge_net_uninit_dflt_config(struct bnge_dev *bd); void bnge_aux_init_dflt_config(struct bnge_dev *bd); u32 bnge_get_rxfh_indir_size(struct bnge_dev *bd); int bnge_cal_nr_rss_ctxs(u16 rx_rings); +bool bnge_aux_has_enough_resources(struct bnge_dev *bd); static inline u32 bnge_adjust_pow_two(u32 total_ent, u16 ent_per_blk) From d0da769c19d0b2da5146cad226e16ab7f282adde Mon Sep 17 00:00:00 2001 From: Siva Reddy Kallam Date: Mon, 17 Nov 2025 17:11:20 +0000 Subject: [PATCH 33/65] RDMA/bng_re: Add Auxiliary interface Add basic Auxiliary interface to the driver which supports the BCM5770X NIC family. 
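The driver binds to the "rdma" auxiliary device created by the bnge
Ethernet driver, matching on the BNG_RE_ADEV_NAME ".rdma" id table
entry. Probe allocates the ib_device with ib_alloc_device() and stashes
it in the per-device driver data; remove tears it down with
ib_dealloc_device(). Later patches in this series build the firmware
channel on top of this scaffolding.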
Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20251117171136.128193-3-siva.kallam@broadcom.com Reviewed-by: Usman Ansari Signed-off-by: Leon Romanovsky --- MAINTAINERS | 7 ++ drivers/infiniband/Kconfig | 1 + drivers/infiniband/hw/Makefile | 1 + drivers/infiniband/hw/bng_re/Kconfig | 10 ++ drivers/infiniband/hw/bng_re/Makefile | 7 ++ drivers/infiniband/hw/bng_re/bng_dev.c | 137 +++++++++++++++++++++++++ drivers/infiniband/hw/bng_re/bng_re.h | 26 +++++ 7 files changed, 189 insertions(+) create mode 100644 drivers/infiniband/hw/bng_re/Kconfig create mode 100644 drivers/infiniband/hw/bng_re/Makefile create mode 100644 drivers/infiniband/hw/bng_re/bng_dev.c create mode 100644 drivers/infiniband/hw/bng_re/bng_re.h diff --git a/MAINTAINERS b/MAINTAINERS index 46126ce2f968..1f7aa2a3bec1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5191,6 +5191,13 @@ W: http://www.broadcom.com F: drivers/infiniband/hw/bnxt_re/ F: include/uapi/rdma/bnxt_re-abi.h +BROADCOM 800 GIGABIT ROCE DRIVER +M: Siva Reddy Kallam +L: linux-rdma@vger.kernel.org +S: Supported +W: http://www.broadcom.com +F: drivers/infiniband/hw/bng_re/ + BROADCOM NVRAM DRIVER M: Rafał Miłecki L: linux-mips@vger.kernel.org diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index f0323f1d6f01..794b9778816b 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -80,6 +80,7 @@ config INFINIBAND_VIRT_DMA if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS if !UML source "drivers/infiniband/hw/bnxt_re/Kconfig" +source "drivers/infiniband/hw/bng_re/Kconfig" source "drivers/infiniband/hw/cxgb4/Kconfig" source "drivers/infiniband/hw/efa/Kconfig" source "drivers/infiniband/hw/erdma/Kconfig" diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index b706dc0d0263..c42b22ac3303 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -13,5 +13,6 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ +obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re/ obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ obj-$(CONFIG_INFINIBAND_IONIC) += ionic/ diff --git a/drivers/infiniband/hw/bng_re/Kconfig b/drivers/infiniband/hw/bng_re/Kconfig new file mode 100644 index 000000000000..85845f72c64d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_BNG_RE + tristate "Broadcom Next generation RoCE HCA support" + depends on 64BIT + depends on INET && DCB && BNGE + help + This driver supports Broadcom Next generation + 50/100/200/400/800 gigabit RoCE HCAs. The module + will be called bng_re. To compile this driver + as a module, choose M here. diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile new file mode 100644 index 000000000000..f854dae25b1c --- /dev/null +++ b/drivers/infiniband/hw/bng_re/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge + +obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o + +bng_re-y := bng_dev.o diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c new file mode 100644 index 000000000000..a4388a87d828 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. 
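+// Module init/exit and auxiliary bus probe/remove glue for the
+// bng_re RoCE driver.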
+ +#include +#include +#include + +#include + +#include "bng_re.h" +#include "bnge.h" +#include "bnge_auxr.h" + +MODULE_AUTHOR("Siva Reddy Kallam "); +MODULE_DESCRIPTION(BNG_RE_DESC); +MODULE_LICENSE("Dual BSD/GPL"); + +static struct bng_re_dev *bng_re_dev_add(struct auxiliary_device *adev, + struct bnge_auxr_dev *aux_dev) +{ + struct bng_re_dev *rdev; + + /* Allocate bng_re_dev instance */ + rdev = ib_alloc_device(bng_re_dev, ibdev); + if (!rdev) { + pr_err("%s: bng_re_dev allocation failure!", KBUILD_MODNAME); + return NULL; + } + + /* Assign auxiliary device specific data */ + rdev->netdev = aux_dev->net; + rdev->aux_dev = aux_dev; + rdev->adev = adev; + rdev->fn_id = rdev->aux_dev->pdev->devfn; + + return rdev; +} + +static int bng_re_add_device(struct auxiliary_device *adev) +{ + struct bnge_auxr_priv *auxr_priv = + container_of(adev, struct bnge_auxr_priv, aux_dev); + struct bng_re_en_dev_info *dev_info; + struct bng_re_dev *rdev; + int rc; + + dev_info = auxiliary_get_drvdata(adev); + + rdev = bng_re_dev_add(adev, auxr_priv->auxr_dev); + if (!rdev) { + rc = -ENOMEM; + goto exit; + } + + dev_info->rdev = rdev; + + return 0; +exit: + return rc; +} + + +static void bng_re_remove_device(struct bng_re_dev *rdev, + struct auxiliary_device *aux_dev) +{ + ib_dealloc_device(&rdev->ibdev); +} + + +static int bng_re_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct bnge_auxr_priv *aux_priv = + container_of(adev, struct bnge_auxr_priv, aux_dev); + struct bng_re_en_dev_info *en_info; + int rc; + + en_info = kzalloc(sizeof(*en_info), GFP_KERNEL); + if (!en_info) + return -ENOMEM; + + en_info->auxr_dev = aux_priv->auxr_dev; + + auxiliary_set_drvdata(adev, en_info); + + rc = bng_re_add_device(adev); + if (rc) + kfree(en_info); + return rc; +} + +static void bng_re_remove(struct auxiliary_device *adev) +{ + struct bng_re_en_dev_info *dev_info = auxiliary_get_drvdata(adev); + struct bng_re_dev *rdev; + + rdev = dev_info->rdev; + + if (rdev) + bng_re_remove_device(rdev, adev); + kfree(dev_info); +} + +static const struct auxiliary_device_id bng_re_id_table[] = { + { .name = BNG_RE_ADEV_NAME ".rdma", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, bng_re_id_table); + +static struct auxiliary_driver bng_re_driver = { + .name = "rdma", + .probe = bng_re_probe, + .remove = bng_re_remove, + .id_table = bng_re_id_table, +}; + +static int __init bng_re_mod_init(void) +{ + int rc; + + + rc = auxiliary_driver_register(&bng_re_driver); + if (rc) { + pr_err("%s: Failed to register auxiliary driver\n", + KBUILD_MODNAME); + } + return rc; +} + +static void __exit bng_re_mod_exit(void) +{ + auxiliary_driver_unregister(&bng_re_driver); +} + +module_init(bng_re_mod_init); +module_exit(bng_re_mod_exit); diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h new file mode 100644 index 000000000000..5341ffe22724 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. + +#ifndef __BNG_RE_H__ +#define __BNG_RE_H__ + +#define BNG_RE_ADEV_NAME "bng_en" + +#define BNG_RE_DESC "Broadcom 800G RoCE Driver" + +#define rdev_to_dev(rdev) ((rdev) ? 
(&(rdev)->ibdev.dev) : NULL) + +struct bng_re_en_dev_info { + struct bng_re_dev *rdev; + struct bnge_auxr_dev *auxr_dev; +}; + +struct bng_re_dev { + struct ib_device ibdev; + struct net_device *netdev; + struct auxiliary_device *adev; + struct bnge_auxr_dev *aux_dev; + int fn_id; +}; + +#endif From 745065770c2dc9636f33ec5fb065ffb7d227f4ad Mon Sep 17 00:00:00 2001 From: Siva Reddy Kallam Date: Mon, 17 Nov 2025 17:11:21 +0000 Subject: [PATCH 34/65] RDMA/bng_re: Register and get the resources from bnge driver Register and get the basic required resources from bnge driver. Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20251117171136.128193-4-siva.kallam@broadcom.com Reviewed-by: Usman Ansari Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/bng_dev.c | 149 +++++++++++++++++++++++++ drivers/infiniband/hw/bng_re/bng_re.h | 5 + drivers/infiniband/hw/bng_re/bng_res.h | 16 +++ 3 files changed, 170 insertions(+) create mode 100644 drivers/infiniband/hw/bng_re/bng_res.h diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c index a4388a87d828..77d33c875837 100644 --- a/drivers/infiniband/hw/bng_re/bng_dev.c +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -7,8 +7,10 @@ #include +#include "bng_res.h" #include "bng_re.h" #include "bnge.h" +#include "bnge_hwrm.h" #include "bnge_auxr.h" MODULE_AUTHOR("Siva Reddy Kallam "); @@ -36,6 +38,144 @@ static struct bng_re_dev *bng_re_dev_add(struct auxiliary_device *adev, return rdev; } + +static int bng_re_register_netdev(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev; + + aux_dev = rdev->aux_dev; + return bnge_register_dev(aux_dev, rdev->adev); +} + +static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev) +{ + struct bng_re_chip_ctx *chip_ctx; + + if (!rdev->chip_ctx) + return; + + chip_ctx = rdev->chip_ctx; + rdev->chip_ctx = NULL; + kfree(chip_ctx); +} + +static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev) +{ + struct bng_re_chip_ctx *chip_ctx; + struct bnge_auxr_dev *aux_dev; + + aux_dev = rdev->aux_dev; + + chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); + if (!chip_ctx) + return -ENOMEM; + chip_ctx->chip_num = aux_dev->chip_num; + chip_ctx->hw_stats_size = aux_dev->hw_ring_stats_size; + + rdev->chip_ctx = chip_ctx; + + return 0; +} + +static void bng_re_init_hwrm_hdr(struct input *hdr, u16 opcd) +{ + hdr->req_type = cpu_to_le16(opcd); + hdr->cmpl_ring = cpu_to_le16(-1); + hdr->target_id = cpu_to_le16(-1); +} + +static void bng_re_fill_fw_msg(struct bnge_fw_msg *fw_msg, void *msg, + int msg_len, void *resp, int resp_max_len, + int timeout) +{ + fw_msg->msg = msg; + fw_msg->msg_len = msg_len; + fw_msg->resp = resp; + fw_msg->resp_max_len = resp_max_len; + fw_msg->timeout = timeout; +} + +static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_ver_get_output ver_get_resp = {}; + struct hwrm_ver_get_input ver_get_req = {}; + struct bng_re_chip_ctx *cctx; + struct bnge_fw_msg fw_msg = {}; + int rc; + + bng_re_init_hwrm_hdr((void *)&ver_get_req, HWRM_VER_GET); + ver_get_req.hwrm_intf_maj = HWRM_VERSION_MAJOR; + ver_get_req.hwrm_intf_min = HWRM_VERSION_MINOR; + ver_get_req.hwrm_intf_upd = HWRM_VERSION_UPDATE; + bng_re_fill_fw_msg(&fw_msg, (void *)&ver_get_req, sizeof(ver_get_req), + (void *)&ver_get_resp, sizeof(ver_get_resp), + BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x", + rc); + return; + } + 
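+	/*
+	 * Pack the HWRM interface version into one 64-bit value,
+	 * 16 bits per component: e.g. interface version 1.10.2.0
+	 * becomes 0x0001000a00020000.
+	 */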
+ cctx = rdev->chip_ctx; + cctx->hwrm_intf_ver = + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_major) << 48 | + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_minor) << 32 | + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_build) << 16 | + le16_to_cpu(ver_get_resp.hwrm_intf_patch); + + cctx->hwrm_cmd_max_timeout = le16_to_cpu(ver_get_resp.max_req_timeout); + + if (!cctx->hwrm_cmd_max_timeout) + cctx->hwrm_cmd_max_timeout = BNG_ROCE_FW_MAX_TIMEOUT; +} + +static int bng_re_dev_init(struct bng_re_dev *rdev) +{ + int rc; + + /* Registered a new RoCE device instance to netdev */ + rc = bng_re_register_netdev(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to register with netedev: %#x\n", rc); + return -EINVAL; + } + + set_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + + if (rdev->aux_dev->auxr_info->msix_requested < BNG_RE_MIN_MSIX) { + ibdev_err(&rdev->ibdev, + "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n", + rdev->aux_dev->auxr_info->msix_requested); + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return -EINVAL; + } + ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", + rdev->aux_dev->auxr_info->msix_requested); + + rc = bng_re_setup_chip_ctx(rdev); + if (rc) { + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + ibdev_err(&rdev->ibdev, "Failed to get chip context\n"); + return -EINVAL; + } + + bng_re_query_hwrm_version(rdev); + + return 0; +} + +static void bng_re_dev_uninit(struct bng_re_dev *rdev) +{ + bng_re_destroy_chip_ctx(rdev); + if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnge_unregister_dev(rdev->aux_dev); +} + static int bng_re_add_device(struct auxiliary_device *adev) { struct bnge_auxr_priv *auxr_priv = @@ -54,7 +194,14 @@ static int bng_re_add_device(struct auxiliary_device *adev) dev_info->rdev = rdev; + rc = bng_re_dev_init(rdev); + if (rc) + goto re_dev_dealloc; + return 0; + +re_dev_dealloc: + ib_dealloc_device(&rdev->ibdev); exit: return rc; } @@ -63,6 +210,7 @@ static int bng_re_add_device(struct auxiliary_device *adev) static void bng_re_remove_device(struct bng_re_dev *rdev, struct auxiliary_device *aux_dev) { + bng_re_dev_uninit(rdev); ib_dealloc_device(&rdev->ibdev); } @@ -86,6 +234,7 @@ static int bng_re_probe(struct auxiliary_device *adev, rc = bng_re_add_device(adev); if (rc) kfree(en_info); + return rc; } diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h index 5341ffe22724..f31eec133fa1 100644 --- a/drivers/infiniband/hw/bng_re/bng_re.h +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -10,6 +10,8 @@ #define rdev_to_dev(rdev) ((rdev) ? (&(rdev)->ibdev.dev) : NULL) +#define BNG_RE_MIN_MSIX 2 + struct bng_re_en_dev_info { struct bng_re_dev *rdev; struct bnge_auxr_dev *auxr_dev; @@ -17,9 +19,12 @@ struct bng_re_en_dev_info { struct bng_re_dev { struct ib_device ibdev; + unsigned long flags; +#define BNG_RE_FLAG_NETDEV_REGISTERED 0 struct net_device *netdev; struct auxiliary_device *adev; struct bnge_auxr_dev *aux_dev; + struct bng_re_chip_ctx *chip_ctx; int fn_id; }; diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h new file mode 100644 index 000000000000..d64833498e2a --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_res.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. 
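+/*
+ * Chip context populated from the bnge auxiliary device and the
+ * HWRM version query.
+ */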
+ +#ifndef __BNG_RES_H__ +#define __BNG_RES_H__ + +#define BNG_ROCE_FW_MAX_TIMEOUT 60 + +struct bng_re_chip_ctx { + u16 chip_num; + u16 hw_stats_size; + u64 hwrm_intf_ver; + u16 hwrm_cmd_max_timeout; +}; + +#endif From 53310b698f3cf601dfdc6c3b2b60c7cb275b1199 Mon Sep 17 00:00:00 2001 From: Siva Reddy Kallam Date: Mon, 17 Nov 2025 17:11:22 +0000 Subject: [PATCH 35/65] RDMA/bng_re: Allocate required memory resources for Firmware channel Allocate required memory resources for Firmware channel. Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20251117171136.128193-5-siva.kallam@broadcom.com Reviewed-by: Usman Ansari Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/Makefile | 6 +- drivers/infiniband/hw/bng_re/bng_dev.c | 34 +++- drivers/infiniband/hw/bng_re/bng_fw.c | 70 +++++++ drivers/infiniband/hw/bng_re/bng_fw.h | 69 +++++++ drivers/infiniband/hw/bng_re/bng_re.h | 2 + drivers/infiniband/hw/bng_re/bng_res.c | 250 +++++++++++++++++++++++++ drivers/infiniband/hw/bng_re/bng_res.h | 76 ++++++++ 7 files changed, 496 insertions(+), 11 deletions(-) create mode 100644 drivers/infiniband/hw/bng_re/bng_fw.c create mode 100644 drivers/infiniband/hw/bng_re/bng_fw.h create mode 100644 drivers/infiniband/hw/bng_re/bng_res.c diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile index f854dae25b1c..1b957defbabc 100644 --- a/drivers/infiniband/hw/bng_re/Makefile +++ b/drivers/infiniband/hw/bng_re/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 - -ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge +ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge -I $(srctree)/drivers/infiniband/hw/bnxt_re obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o -bng_re-y := bng_dev.o +bng_re-y := bng_dev.o bng_fw.o \ + bng_res.o diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c index 77d33c875837..d0126bbecbec 100644 --- a/drivers/infiniband/hw/bng_re/bng_dev.c +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -8,6 +8,7 @@ #include #include "bng_res.h" +#include "bng_fw.h" #include "bng_re.h" #include "bnge.h" #include "bnge_hwrm.h" @@ -56,6 +57,9 @@ static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev) chip_ctx = rdev->chip_ctx; rdev->chip_ctx = NULL; + rdev->rcfw.res = NULL; + rdev->bng_res.cctx = NULL; + rdev->bng_res.pdev = NULL; kfree(chip_ctx); } @@ -65,7 +69,8 @@ static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev) struct bnge_auxr_dev *aux_dev; aux_dev = rdev->aux_dev; - + rdev->bng_res.pdev = aux_dev->pdev; + rdev->rcfw.res = &rdev->bng_res; chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); if (!chip_ctx) return -ENOMEM; @@ -73,6 +78,7 @@ static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev) chip_ctx->hw_stats_size = aux_dev->hw_ring_stats_size; rdev->chip_ctx = chip_ctx; + rdev->bng_res.cctx = rdev->chip_ctx; return 0; } @@ -131,6 +137,14 @@ static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) cctx->hwrm_cmd_max_timeout = BNG_ROCE_FW_MAX_TIMEOUT; } +static void bng_re_dev_uninit(struct bng_re_dev *rdev) +{ + bng_re_free_rcfw_channel(&rdev->rcfw); + bng_re_destroy_chip_ctx(rdev); + if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnge_unregister_dev(rdev->aux_dev); +} + static int bng_re_dev_init(struct bng_re_dev *rdev) { int rc; @@ -166,14 +180,18 @@ static int bng_re_dev_init(struct bng_re_dev *rdev) bng_re_query_hwrm_version(rdev); - return 0; -} + rc = bng_re_alloc_fw_channel(&rdev->bng_res, &rdev->rcfw); + if (rc) { + 
ibdev_err(&rdev->ibdev, + "Failed to allocate RCFW Channel: %#x\n", rc); + goto fail; + } -static void bng_re_dev_uninit(struct bng_re_dev *rdev) -{ - bng_re_destroy_chip_ctx(rdev); - if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) - bnge_unregister_dev(rdev->aux_dev); + return 0; + +fail: + bng_re_dev_uninit(rdev); + return rc; } static int bng_re_add_device(struct auxiliary_device *adev) diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c new file mode 100644 index 000000000000..bf7bbcf9b56e --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_fw.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include + +#include "bng_res.h" +#include "bng_fw.h" + +void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw) +{ + kfree(rcfw->crsqe_tbl); + bng_re_free_hwq(rcfw->res, &rcfw->cmdq.hwq); + bng_re_free_hwq(rcfw->res, &rcfw->creq.hwq); + rcfw->pdev = NULL; +} + +int bng_re_alloc_fw_channel(struct bng_re_res *res, + struct bng_re_rcfw *rcfw) +{ + struct bng_re_hwq_attr hwq_attr = {}; + struct bng_re_sg_info sginfo = {}; + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_creq_ctx *creq; + + rcfw->pdev = res->pdev; + cmdq = &rcfw->cmdq; + creq = &rcfw->creq; + rcfw->res = res; + + sginfo.pgsize = PAGE_SIZE; + sginfo.pgshft = PAGE_SHIFT; + + hwq_attr.sginfo = &sginfo; + hwq_attr.res = rcfw->res; + hwq_attr.depth = BNG_FW_CREQE_MAX_CNT; + hwq_attr.stride = BNG_FW_CREQE_UNITS; + hwq_attr.type = BNG_HWQ_TYPE_QUEUE; + + if (bng_re_alloc_init_hwq(&creq->hwq, &hwq_attr)) { + dev_err(&rcfw->pdev->dev, + "HW channel CREQ allocation failed\n"); + goto fail; + } + + rcfw->cmdq_depth = BNG_FW_CMDQE_MAX_CNT; + + sginfo.pgsize = bng_fw_cmdqe_page_size(rcfw->cmdq_depth); + hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF; + hwq_attr.stride = BNG_FW_CMDQE_UNITS; + hwq_attr.type = BNG_HWQ_TYPE_CTX; + if (bng_re_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) { + dev_err(&rcfw->pdev->dev, + "HW channel CMDQ allocation failed\n"); + goto fail; + } + + rcfw->crsqe_tbl = kcalloc(cmdq->hwq.max_elements, + sizeof(*rcfw->crsqe_tbl), GFP_KERNEL); + if (!rcfw->crsqe_tbl) + goto fail; + + spin_lock_init(&rcfw->tbl_lock); + + rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; + + return 0; + +fail: + bng_re_free_rcfw_channel(rcfw); + return -ENOMEM; +} diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h new file mode 100644 index 000000000000..351f73baa9df --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_fw.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. 
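+/* RoCE firmware channel (CMDQ/CREQ) structures and helpers. */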
+ +#ifndef __BNG_FW_H__ +#define __BNG_FW_H__ + +/* CREQ */ +#define BNG_FW_CREQE_MAX_CNT (64 * 1024) +#define BNG_FW_CREQE_UNITS 16 + +/* CMDQ */ +struct bng_fw_cmdqe { + u8 data[16]; +}; + +#define BNG_FW_CMDQE_MAX_CNT 8192 +#define BNG_FW_CMDQE_UNITS sizeof(struct bng_fw_cmdqe) +#define BNG_FW_CMDQE_BYTES(depth) ((depth) * BNG_FW_CMDQE_UNITS) + +static inline u32 bng_fw_cmdqe_npages(u32 depth) +{ + u32 npages; + + npages = BNG_FW_CMDQE_BYTES(depth) / PAGE_SIZE; + if (BNG_FW_CMDQE_BYTES(depth) % PAGE_SIZE) + npages++; + return npages; +} + +static inline u32 bng_fw_cmdqe_page_size(u32 depth) +{ + return (bng_fw_cmdqe_npages(depth) * PAGE_SIZE); +} + +/* HWQ */ +struct bng_re_cmdq_ctx { + struct bng_re_hwq hwq; +}; + +struct bng_re_creq_ctx { + struct bng_re_hwq hwq; +}; + +struct bng_re_crsqe { + struct creq_qp_event *resp; + u32 req_size; + /* Free slots at the time of submission */ + u32 free_slots; + u8 opcode; +}; + +/* RoCE FW Communication Channels */ +struct bng_re_rcfw { + struct pci_dev *pdev; + struct bng_re_res *res; + struct bng_re_cmdq_ctx cmdq; + struct bng_re_creq_ctx creq; + struct bng_re_crsqe *crsqe_tbl; + /* To synchronize the qp-handle hash table */ + spinlock_t tbl_lock; + u32 cmdq_depth; + /* cached from chip cctx for quick reference in slow path */ + u16 max_timeout; +}; + +void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw); +int bng_re_alloc_fw_channel(struct bng_re_res *res, + struct bng_re_rcfw *rcfw); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h index f31eec133fa1..170434a75dea 100644 --- a/drivers/infiniband/hw/bng_re/bng_re.h +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -26,6 +26,8 @@ struct bng_re_dev { struct bnge_auxr_dev *aux_dev; struct bng_re_chip_ctx *chip_ctx; int fn_id; + struct bng_re_res bng_res; + struct bng_re_rcfw rcfw; }; #endif diff --git a/drivers/infiniband/hw/bng_re/bng_res.c b/drivers/infiniband/hw/bng_re/bng_res.c new file mode 100644 index 000000000000..2119d1f39b65 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_res.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. 
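+// Page-buffer-list (PBL) and hardware queue (HWQ) allocation helpers.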
+ +#include +#include +#include + +#include "bng_res.h" +#include "roce_hsi.h" + +static void bng_free_pbl(struct bng_re_res *res, struct bng_re_pbl *pbl) +{ + struct pci_dev *pdev = res->pdev; + int i; + + for (i = 0; i < pbl->pg_count; i++) { + if (pbl->pg_arr[i]) + dma_free_coherent(&pdev->dev, pbl->pg_size, + (void *)((unsigned long) + pbl->pg_arr[i] & + PAGE_MASK), + pbl->pg_map_arr[i]); + else + dev_warn(&pdev->dev, + "PBL free pg_arr[%d] empty?!\n", i); + pbl->pg_arr[i] = NULL; + } + + vfree(pbl->pg_arr); + pbl->pg_arr = NULL; + vfree(pbl->pg_map_arr); + pbl->pg_map_arr = NULL; + pbl->pg_count = 0; + pbl->pg_size = 0; +} + +static int bng_alloc_pbl(struct bng_re_res *res, + struct bng_re_pbl *pbl, + struct bng_re_sg_info *sginfo) +{ + struct pci_dev *pdev = res->pdev; + u32 pages; + int i; + + if (sginfo->nopte) + return 0; + pages = sginfo->npages; + + /* page ptr arrays */ + pbl->pg_arr = vmalloc_array(pages, sizeof(void *)); + if (!pbl->pg_arr) + return -ENOMEM; + + pbl->pg_map_arr = vmalloc_array(pages, sizeof(dma_addr_t)); + if (!pbl->pg_map_arr) { + vfree(pbl->pg_arr); + pbl->pg_arr = NULL; + return -ENOMEM; + } + pbl->pg_count = 0; + pbl->pg_size = sginfo->pgsize; + + for (i = 0; i < pages; i++) { + pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev, + pbl->pg_size, + &pbl->pg_map_arr[i], + GFP_KERNEL); + if (!pbl->pg_arr[i]) + goto fail; + pbl->pg_count++; + } + + return 0; +fail: + bng_free_pbl(res, pbl); + return -ENOMEM; +} + +void bng_re_free_hwq(struct bng_re_res *res, + struct bng_re_hwq *hwq) +{ + int i; + + if (!hwq->max_elements) + return; + if (hwq->level >= BNG_PBL_LVL_MAX) + return; + + for (i = 0; i < hwq->level + 1; i++) + bng_free_pbl(res, &hwq->pbl[i]); + + hwq->level = BNG_PBL_LVL_MAX; + hwq->max_elements = 0; + hwq->element_size = 0; + hwq->prod = 0; + hwq->cons = 0; +} + +/* All HWQs are power of 2 in size */ +int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq, + struct bng_re_hwq_attr *hwq_attr) +{ + u32 npages, pg_size; + struct bng_re_sg_info sginfo = {}; + u32 depth, stride, npbl, npde; + dma_addr_t *src_phys_ptr, **dst_virt_ptr; + struct bng_re_res *res; + struct pci_dev *pdev; + int i, rc, lvl; + + res = hwq_attr->res; + pdev = res->pdev; + pg_size = hwq_attr->sginfo->pgsize; + hwq->level = BNG_PBL_LVL_MAX; + + depth = roundup_pow_of_two(hwq_attr->depth); + stride = roundup_pow_of_two(hwq_attr->stride); + + npages = (depth * stride) / pg_size; + if ((depth * stride) % pg_size) + npages++; + if (!npages) + return -EINVAL; + hwq_attr->sginfo->npages = npages; + + if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) { + /* This request is Level 0, map PTE */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_0; + goto done; + } + + if (npages >= MAX_PBL_LVL_0_PGS) { + if (npages > MAX_PBL_LVL_1_PGS) { + u32 flag = PTU_PTE_VALID; + /* 2 levels of indirection */ + npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT; + if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT)) + npbl++; + npde = npbl >> MAX_PDL_LVL_SHIFT; + if (npbl % BIT(MAX_PDL_LVL_SHIFT)) + npde++; + /* Alloc PDE pages */ + sginfo.pgsize = npde * pg_size; + sginfo.npages = 1; + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo); + if (rc) + goto fail; + + /* Alloc PBL pages */ + sginfo.npages = npbl; + sginfo.pgsize = PAGE_SIZE; + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], &sginfo); + if (rc) + goto fail; + /* Fill PDL with PBL page pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr; + src_phys_ptr = 
hwq->pbl[BNG_PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++) + dst_virt_ptr[0][i] = src_phys_ptr[i] | flag; + + /* Alloc or init PTEs */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_2], + hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_2; + if (hwq_attr->sginfo->nopte) + goto done; + /* Fill PBLs with PTE pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_1].pg_arr; + src_phys_ptr = hwq->pbl[BNG_PBL_LVL_2].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_2].pg_count; i++) { + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | PTU_PTE_VALID; + } + if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[BNG_PBL_LVL_2].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + } else { /* pages < 512 npbl = 1, npde = 0 */ + u32 flag = PTU_PTE_VALID; + + /* 1 level of indirection */ + npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT; + if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT)) + npbl++; + sginfo.npages = npbl; + sginfo.pgsize = PAGE_SIZE; + /* Alloc PBL page */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo); + if (rc) + goto fail; + /* Alloc or init PTEs */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], + hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_1; + if (hwq_attr->sginfo->nopte) + goto done; + /* Fill PBL with PTE pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++) + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | flag; + if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[BNG_PBL_LVL_1].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + } + } +done: + hwq->prod = 0; + hwq->cons = 0; + hwq->pdev = pdev; + hwq->depth = hwq_attr->depth; + hwq->max_elements = hwq->depth; + hwq->element_size = stride; + /* For direct access to the elements */ + lvl = hwq->level; + if (hwq_attr->sginfo->nopte && hwq->level) + lvl = hwq->level - 1; + hwq->pbl_ptr = hwq->pbl[lvl].pg_arr; + hwq->pbl_dma_ptr = hwq->pbl[lvl].pg_map_arr; + spin_lock_init(&hwq->lock); + + return 0; +fail: + bng_re_free_hwq(res, hwq); + return -ENOMEM; +} diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h index d64833498e2a..e6123abadfad 100644 --- a/drivers/infiniband/hw/bng_re/bng_res.h +++ b/drivers/infiniband/hw/bng_re/bng_res.h @@ -6,6 +6,18 @@ #define BNG_ROCE_FW_MAX_TIMEOUT 60 +#define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) +#define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1) +#define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG) +#define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG) + +#define MAX_PBL_LVL_0_PGS 1 +#define MAX_PBL_LVL_1_PGS 512 +#define MAX_PBL_LVL_1_PGS_SHIFT 9 +#define MAX_PBL_LVL_1_PGS_FOR_LVL_2 256 +#define MAX_PBL_LVL_2_PGS (256 * 512) +#define MAX_PDL_LVL_SHIFT 9 + struct bng_re_chip_ctx { u16 chip_num; u16 hw_stats_size; @@ -13,4 +25,68 @@ struct bng_re_chip_ctx { u16 hwrm_cmd_max_timeout; }; +struct bng_re_pbl { + u32 pg_count; + u32 pg_size; + void **pg_arr; + dma_addr_t *pg_map_arr; +}; + +enum bng_re_pbl_lvl { + BNG_PBL_LVL_0, + BNG_PBL_LVL_1, + BNG_PBL_LVL_2, + BNG_PBL_LVL_MAX +}; + +enum 
bng_re_hwq_type { + BNG_HWQ_TYPE_CTX, + BNG_HWQ_TYPE_QUEUE +}; + +struct bng_re_sg_info { + u32 npages; + u32 pgshft; + u32 pgsize; + bool nopte; +}; + +struct bng_re_hwq_attr { + struct bng_re_res *res; + struct bng_re_sg_info *sginfo; + enum bng_re_hwq_type type; + u32 depth; + u32 stride; + u32 aux_stride; + u32 aux_depth; +}; + +struct bng_re_hwq { + struct pci_dev *pdev; + /* lock to protect hwq */ + spinlock_t lock; + struct bng_re_pbl pbl[BNG_PBL_LVL_MAX + 1]; + /* Valid values: 0, 1, 2 */ + enum bng_re_pbl_lvl level; + /* PBL entries */ + void **pbl_ptr; + /* PBL dma_addr */ + dma_addr_t *pbl_dma_ptr; + u32 max_elements; + u32 depth; + u16 element_size; + u32 prod; + u32 cons; +}; + +struct bng_re_res { + struct pci_dev *pdev; + struct bng_re_chip_ctx *cctx; +}; + +void bng_re_free_hwq(struct bng_re_res *res, + struct bng_re_hwq *hwq); + +int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq, + struct bng_re_hwq_attr *hwq_attr); #endif From 4f830cd8d7fe3e98fc12d25f347ed461e11fc1de Mon Sep 17 00:00:00 2001 From: Siva Reddy Kallam Date: Mon, 17 Nov 2025 17:11:23 +0000 Subject: [PATCH 36/65] RDMA/bng_re: Add infrastructure for enabling Firmware channel Add infrastructure for enabling Firmware channel. Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20251117171136.128193-6-siva.kallam@broadcom.com Reviewed-by: Usman Ansari Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/bng_dev.c | 120 +++++++- drivers/infiniband/hw/bng_re/bng_fw.c | 361 ++++++++++++++++++++++++- drivers/infiniband/hw/bng_re/bng_fw.h | 133 ++++++++- drivers/infiniband/hw/bng_re/bng_re.h | 45 +++ drivers/infiniband/hw/bng_re/bng_res.c | 2 + drivers/infiniband/hw/bng_re/bng_res.h | 102 +++++++ drivers/infiniband/hw/bng_re/bng_tlv.h | 128 +++++++++ 7 files changed, 885 insertions(+), 6 deletions(-) create mode 100644 drivers/infiniband/hw/bng_re/bng_tlv.h diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c index d0126bbecbec..12f86a0e7985 100644 --- a/drivers/infiniband/hw/bng_re/bng_dev.c +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -9,10 +9,10 @@ #include "bng_res.h" #include "bng_fw.h" -#include "bng_re.h" #include "bnge.h" -#include "bnge_hwrm.h" #include "bnge_auxr.h" +#include "bng_re.h" +#include "bnge_hwrm.h" MODULE_AUTHOR("Siva Reddy Kallam "); MODULE_DESCRIPTION(BNG_RE_DESC); @@ -101,6 +101,69 @@ static void bng_re_fill_fw_msg(struct bnge_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } +static int bng_re_net_ring_free(struct bng_re_dev *rdev, + u16 fw_ring_id, int type) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_ring_free_input req = {}; + struct hwrm_ring_free_output resp; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!rdev) + return rc; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE); + req.ring_type = type; + req.ring_id = cpu_to_le16(fw_ring_id); + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) + ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x", + req.ring_id, rc); + return rc; +} + +static int bng_re_net_ring_alloc(struct bng_re_dev *rdev, + struct bng_re_ring_attr *ring_attr, + u16 *fw_ring_id) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_ring_alloc_input req = {}; + struct hwrm_ring_alloc_output resp; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!aux_dev) + return rc; + + 
bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC); + req.enables = 0; + req.page_tbl_addr = cpu_to_le64(ring_attr->dma_arr[0]); + if (ring_attr->pages > 1) { + /* Page size is in log2 units */ + req.page_size = BNGE_PAGE_SHIFT; + req.page_tbl_depth = 1; + } + req.fbo = 0; + /* Association of ring index with doorbell index and MSIX number */ + req.logical_id = cpu_to_le16(ring_attr->lrid); + req.length = cpu_to_le32(ring_attr->depth + 1); + req.ring_type = ring_attr->type; + req.int_mode = ring_attr->mode; + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (!rc) + *fw_ring_id = le16_to_cpu(resp.ring_id); + + return rc; +} + static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) { struct bnge_auxr_dev *aux_dev = rdev->aux_dev; @@ -139,7 +202,13 @@ static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) static void bng_re_dev_uninit(struct bng_re_dev *rdev) { + bng_re_disable_rcfw_channel(&rdev->rcfw); + bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, + RING_ALLOC_REQ_RING_TYPE_NQ); bng_re_free_rcfw_channel(&rdev->rcfw); + + kfree(rdev->nqr); + rdev->nqr = NULL; bng_re_destroy_chip_ctx(rdev); if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) bnge_unregister_dev(rdev->aux_dev); @@ -147,6 +216,11 @@ static void bng_re_dev_uninit(struct bng_re_dev *rdev) static int bng_re_dev_init(struct bng_re_dev *rdev) { + struct bng_re_ring_attr rattr = {}; + struct bng_re_creq_ctx *creq; + u32 db_offt; + int vid; + u8 type; int rc; /* Registered a new RoCE device instance to netdev */ @@ -187,8 +261,48 @@ static int bng_re_dev_init(struct bng_re_dev *rdev) goto fail; } - return 0; + /* Allocate nq record memory */ + rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL); + if (!rdev->nqr) { + bng_re_destroy_chip_ctx(rdev); + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return -ENOMEM; + } + rdev->nqr->num_msix = rdev->aux_dev->auxr_info->msix_requested; + memcpy(rdev->nqr->msix_entries, rdev->aux_dev->msix_info, + sizeof(struct bnge_msix_info) * rdev->nqr->num_msix); + + type = RING_ALLOC_REQ_RING_TYPE_NQ; + creq = &rdev->rcfw.creq; + rattr.dma_arr = creq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr; + rattr.pages = creq->hwq.pbl[creq->hwq.level].pg_count; + rattr.type = type; + rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; + rattr.depth = BNG_FW_CREQE_MAX_CNT - 1; + rattr.lrid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].ring_idx; + rc = bng_re_net_ring_alloc(rdev, &rattr, &creq->ring_id); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc); + goto free_rcfw; + } + db_offt = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].db_offset; + vid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].vector; + + rc = bng_re_enable_fw_channel(&rdev->rcfw, + vid, db_offt); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n", + rc); + goto free_ring; + } + + return 0; +free_ring: + bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type); +free_rcfw: + bng_re_free_rcfw_channel(&rdev->rcfw); fail: bng_re_dev_uninit(rdev); return rc; diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c index bf7bbcf9b56e..1bfed58e6703 100644 --- a/drivers/infiniband/hw/bng_re/bng_fw.c +++ b/drivers/infiniband/hw/bng_re/bng_fw.c @@ -2,6 +2,7 @@ // Copyright (c) 2025 Broadcom. 
#include +#include "roce_hsi.h" #include "bng_res.h" #include "bng_fw.h" @@ -61,10 +62,368 @@ int bng_re_alloc_fw_channel(struct bng_re_res *res, spin_lock_init(&rcfw->tbl_lock); rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; - return 0; fail: bng_re_free_rcfw_channel(rcfw); return -ENOMEM; } + +static int bng_re_process_qp_event(struct bng_re_rcfw *rcfw, + struct creq_qp_event *qp_event, + u32 *num_wait) +{ + struct bng_re_hwq *hwq = &rcfw->cmdq.hwq; + struct bng_re_crsqe *crsqe; + u32 req_size; + u16 cookie; + bool is_waiter_alive; + struct pci_dev *pdev; + u32 wait_cmds = 0; + int rc = 0; + + pdev = rcfw->pdev; + switch (qp_event->event) { + case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: + dev_err(&pdev->dev, "Received QP error notification\n"); + break; + default: + /* + * Command Response + * cmdq->lock needs to be acquired to synchronie + * the command send and completion reaping. This function + * is always called with creq->lock held. Using + * the nested variant of spin_lock. + * + */ + + spin_lock_nested(&hwq->lock, SINGLE_DEPTH_NESTING); + cookie = le16_to_cpu(qp_event->cookie); + cookie &= BNG_FW_MAX_COOKIE_VALUE; + crsqe = &rcfw->crsqe_tbl[cookie]; + + if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED, + &rcfw->cmdq.flags), + "Unreponsive rcfw channel detected.!!")) { + dev_info(&pdev->dev, + "rcfw timedout: cookie = %#x, free_slots = %d", + cookie, crsqe->free_slots); + spin_unlock(&hwq->lock); + return rc; + } + + if (crsqe->is_waiter_alive) { + if (crsqe->resp) { + memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); + /* Insert write memory barrier to ensure that + * response data is copied before clearing the + * flags + */ + smp_wmb(); + } + } + + wait_cmds++; + + req_size = crsqe->req_size; + is_waiter_alive = crsqe->is_waiter_alive; + + crsqe->req_size = 0; + if (!is_waiter_alive) + crsqe->resp = NULL; + + crsqe->is_in_used = false; + + hwq->cons += req_size; + + spin_unlock(&hwq->lock); + } + *num_wait += wait_cmds; + return rc; +} + +/* function events */ +static int bng_re_process_func_event(struct bng_re_rcfw *rcfw, + struct creq_func_event *func_event) +{ + switch (func_event->event) { + case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: + case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: + case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR: + case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR: + case CREQ_FUNC_EVENT_EVENT_CQ_ERROR: + case CREQ_FUNC_EVENT_EVENT_TQM_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR: + case CREQ_FUNC_EVENT_EVENT_TIM_ERROR: + case CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST: + case CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED: + break; + default: + return -EINVAL; + } + + return 0; +} + + + +/* CREQ Completion handlers */ +static void bng_re_service_creq(struct tasklet_struct *t) +{ + struct bng_re_rcfw *rcfw = from_tasklet(rcfw, t, creq.creq_tasklet); + struct bng_re_creq_ctx *creq = &rcfw->creq; + u32 type, budget = BNG_FW_CREQ_ENTRY_POLL_BUDGET; + struct bng_re_hwq *hwq = &creq->hwq; + struct creq_base *creqe; + u32 num_wakeup = 0; + u32 hw_polled = 0; + + /* Service the CREQ until budget is over */ + spin_lock_bh(&hwq->lock); + while (budget > 0) { + creqe = bng_re_get_qe(hwq, hwq->cons, NULL); + if (!BNG_FW_CREQ_CMP_VALID(creqe, creq->creq_db.dbinfo.flags)) + break; + /* The valid test of the entry must be done first before + * reading any further. 
+ */ + dma_rmb(); + + type = creqe->type & CREQ_BASE_TYPE_MASK; + switch (type) { + case CREQ_BASE_TYPE_QP_EVENT: + bng_re_process_qp_event + (rcfw, (struct creq_qp_event *)creqe, + &num_wakeup); + creq->stats.creq_qp_event_processed++; + break; + case CREQ_BASE_TYPE_FUNC_EVENT: + if (!bng_re_process_func_event + (rcfw, (struct creq_func_event *)creqe)) + creq->stats.creq_func_event_processed++; + else + dev_warn(&rcfw->pdev->dev, + "aeqe:%#x Not handled\n", type); + break; + default: + if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT) + dev_warn(&rcfw->pdev->dev, + "creqe with event 0x%x not handled\n", + type); + break; + } + budget--; + hw_polled++; + bng_re_hwq_incr_cons(hwq->max_elements, &hwq->cons, + 1, &creq->creq_db.dbinfo.flags); + } + + if (hw_polled) + bng_re_ring_nq_db(&creq->creq_db.dbinfo, + rcfw->res->cctx, true); + spin_unlock_bh(&hwq->lock); + if (num_wakeup) + wake_up_nr(&rcfw->cmdq.waitq, num_wakeup); +} + +static int bng_re_map_cmdq_mbox(struct bng_re_rcfw *rcfw) +{ + struct bng_re_cmdq_mbox *mbox; + resource_size_t bar_reg; + struct pci_dev *pdev; + + pdev = rcfw->pdev; + mbox = &rcfw->cmdq.cmdq_mbox; + + mbox->reg.bar_id = BNG_FW_COMM_PCI_BAR_REGION; + mbox->reg.len = BNG_FW_COMM_SIZE; + mbox->reg.bar_base = pci_resource_start(pdev, mbox->reg.bar_id); + if (!mbox->reg.bar_base) { + dev_err(&pdev->dev, + "CMDQ BAR region %d resc start is 0!\n", + mbox->reg.bar_id); + return -ENOMEM; + } + + bar_reg = mbox->reg.bar_base + BNG_FW_COMM_BASE_OFFSET; + mbox->reg.len = BNG_FW_COMM_SIZE; + mbox->reg.bar_reg = ioremap(bar_reg, mbox->reg.len); + if (!mbox->reg.bar_reg) { + dev_err(&pdev->dev, + "CMDQ BAR region %d mapping failed\n", + mbox->reg.bar_id); + return -ENOMEM; + } + + mbox->prod = (void __iomem *)(mbox->reg.bar_reg + + BNG_FW_PF_VF_COMM_PROD_OFFSET); + mbox->db = (void __iomem *)(mbox->reg.bar_reg + BNG_FW_COMM_TRIG_OFFSET); + return 0; +} + +static irqreturn_t bng_re_creq_irq(int irq, void *dev_instance) +{ + struct bng_re_rcfw *rcfw = dev_instance; + struct bng_re_creq_ctx *creq; + struct bng_re_hwq *hwq; + u32 sw_cons; + + creq = &rcfw->creq; + hwq = &creq->hwq; + /* Prefetch the CREQ element */ + sw_cons = HWQ_CMP(hwq->cons, hwq); + prefetch(bng_re_get_qe(hwq, sw_cons, NULL)); + + tasklet_schedule(&creq->creq_tasklet); + + return IRQ_HANDLED; +} + +int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector, + bool need_init) +{ + struct bng_re_creq_ctx *creq; + struct bng_re_res *res; + int rc; + + creq = &rcfw->creq; + res = rcfw->res; + + if (creq->irq_handler_avail) + return -EFAULT; + + creq->msix_vec = msix_vector; + if (need_init) + tasklet_setup(&creq->creq_tasklet, bng_re_service_creq); + else + tasklet_enable(&creq->creq_tasklet); + + creq->irq_name = kasprintf(GFP_KERNEL, "bng_re-creq@pci:%s", + pci_name(res->pdev)); + if (!creq->irq_name) + return -ENOMEM; + rc = request_irq(creq->msix_vec, bng_re_creq_irq, 0, + creq->irq_name, rcfw); + if (rc) { + kfree(creq->irq_name); + creq->irq_name = NULL; + tasklet_disable(&creq->creq_tasklet); + return rc; + } + creq->irq_handler_avail = true; + + bng_re_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true); + atomic_inc(&rcfw->rcfw_intr_enabled); + + return 0; +} + +static int bng_re_map_creq_db(struct bng_re_rcfw *rcfw, u32 reg_offt) +{ + struct bng_re_creq_db *creq_db; + resource_size_t bar_reg; + struct pci_dev *pdev; + + pdev = rcfw->pdev; + creq_db = &rcfw->creq.creq_db; + + creq_db->dbinfo.flags = 0; + creq_db->reg.bar_id = BNG_FW_COMM_CONS_PCI_BAR_REGION; + creq_db->reg.bar_base = 
pci_resource_start(pdev, creq_db->reg.bar_id); + if (!creq_db->reg.bar_id) + dev_err(&pdev->dev, + "CREQ BAR region %d resc start is 0!", + creq_db->reg.bar_id); + + bar_reg = creq_db->reg.bar_base + reg_offt; + + creq_db->reg.len = BNG_FW_CREQ_DB_LEN; + creq_db->reg.bar_reg = ioremap(bar_reg, creq_db->reg.len); + if (!creq_db->reg.bar_reg) { + dev_err(&pdev->dev, + "CREQ BAR region %d mapping failed", + creq_db->reg.bar_id); + return -ENOMEM; + } + creq_db->dbinfo.db = creq_db->reg.bar_reg; + creq_db->dbinfo.hwq = &rcfw->creq.hwq; + creq_db->dbinfo.xid = rcfw->creq.ring_id; + return 0; +} + +void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill) +{ + struct bng_re_creq_ctx *creq; + + creq = &rcfw->creq; + + if (!creq->irq_handler_avail) + return; + + creq->irq_handler_avail = false; + /* Mask h/w interrupts */ + bng_re_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false); + /* Sync with last running IRQ-handler */ + synchronize_irq(creq->msix_vec); + free_irq(creq->msix_vec, rcfw); + kfree(creq->irq_name); + creq->irq_name = NULL; + atomic_set(&rcfw->rcfw_intr_enabled, 0); + if (kill) + tasklet_kill(&creq->creq_tasklet); + tasklet_disable(&creq->creq_tasklet); +} + +void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw) +{ + struct bng_re_creq_ctx *creq; + struct bng_re_cmdq_ctx *cmdq; + + creq = &rcfw->creq; + cmdq = &rcfw->cmdq; + /* Make sure the HW channel is stopped! */ + bng_re_rcfw_stop_irq(rcfw, true); + + iounmap(cmdq->cmdq_mbox.reg.bar_reg); + iounmap(creq->creq_db.reg.bar_reg); + + cmdq->cmdq_mbox.reg.bar_reg = NULL; + creq->creq_db.reg.bar_reg = NULL; + creq->msix_vec = 0; +} + +int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off) +{ + struct bng_re_cmdq_ctx *cmdq; + int rc; + + cmdq = &rcfw->cmdq; + + /* Assign defaults */ + cmdq->seq_num = 0; + set_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags); + init_waitqueue_head(&cmdq->waitq); + + rc = bng_re_map_cmdq_mbox(rcfw); + if (rc) + return rc; + + rc = bng_re_map_creq_db(rcfw, cp_bar_reg_off); + if (rc) + return rc; + + rc = bng_re_rcfw_start_irq(rcfw, msix_vector, true); + if (rc) { + dev_err(&rcfw->pdev->dev, + "Failed to request IRQ for CREQ rc = 0x%x\n", rc); + bng_re_disable_rcfw_channel(rcfw); + return rc; + } + + return 0; +} diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h index 351f73baa9df..d1773832b592 100644 --- a/drivers/infiniband/hw/bng_re/bng_fw.h +++ b/drivers/infiniband/hw/bng_re/bng_fw.h @@ -4,9 +4,26 @@ #ifndef __BNG_FW_H__ #define __BNG_FW_H__ +#include "bng_tlv.h" + +/* FW DB related */ +#define BNG_FW_CMDQ_TRIG_VAL 1 +#define BNG_FW_COMM_PCI_BAR_REGION 0 +#define BNG_FW_COMM_CONS_PCI_BAR_REGION 2 +#define BNG_FW_COMM_SIZE 0x104 +#define BNG_FW_COMM_BASE_OFFSET 0x600 +#define BNG_FW_COMM_TRIG_OFFSET 0x100 +#define BNG_FW_PF_VF_COMM_PROD_OFFSET 0xc +#define BNG_FW_CREQ_DB_LEN 8 + /* CREQ */ -#define BNG_FW_CREQE_MAX_CNT (64 * 1024) -#define BNG_FW_CREQE_UNITS 16 +#define BNG_FW_CREQE_MAX_CNT (64 * 1024) +#define BNG_FW_CREQE_UNITS 16 +#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100 +#define BNG_FW_CREQ_CMP_VALID(hdr, pass) \ + (!!((hdr)->v & CREQ_BASE_V) == \ + !((pass) & BNG_RE_FLAG_EPOCH_CONS_MASK)) +#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100 /* CMDQ */ struct bng_fw_cmdqe { @@ -17,6 +34,15 @@ struct bng_fw_cmdqe { #define BNG_FW_CMDQE_UNITS sizeof(struct bng_fw_cmdqe) #define BNG_FW_CMDQE_BYTES(depth) ((depth) * BNG_FW_CMDQE_UNITS) +#define BNG_FW_MAX_COOKIE_VALUE (BNG_FW_CMDQE_MAX_CNT - 1) +#define 
BNG_FW_CMD_IS_BLOCKING 0x8000 + +/* Crsq buf is 1024-Byte */ +struct bng_re_crsbe { + u8 data[1024]; +}; + + static inline u32 bng_fw_cmdqe_npages(u32 depth) { u32 npages; @@ -31,14 +57,43 @@ static inline u32 bng_fw_cmdqe_page_size(u32 depth) { return (bng_fw_cmdqe_npages(depth) * PAGE_SIZE); } +struct bng_re_cmdq_mbox { + struct bng_re_reg_desc reg; + void __iomem *prod; + void __iomem *db; +}; /* HWQ */ struct bng_re_cmdq_ctx { struct bng_re_hwq hwq; + struct bng_re_cmdq_mbox cmdq_mbox; + unsigned long flags; +#define FIRMWARE_INITIALIZED_FLAG (0) +#define FIRMWARE_STALL_DETECTED (3) +#define FIRMWARE_FIRST_FLAG (31) + wait_queue_head_t waitq; + u32 seq_num; +}; + +struct bng_re_creq_db { + struct bng_re_reg_desc reg; + struct bng_re_db_info dbinfo; +}; + +struct bng_re_creq_stat { + u64 creq_qp_event_processed; + u64 creq_func_event_processed; }; struct bng_re_creq_ctx { struct bng_re_hwq hwq; + struct bng_re_creq_db creq_db; + struct bng_re_creq_stat stats; + struct tasklet_struct creq_tasklet; + u16 ring_id; + int msix_vec; + bool irq_handler_avail; + char *irq_name; }; struct bng_re_crsqe { @@ -47,6 +102,14 @@ struct bng_re_crsqe { /* Free slots at the time of submission */ u32 free_slots; u8 opcode; + bool is_waiter_alive; + bool is_in_used; +}; + +struct bng_re_rcfw_sbuf { + void *sb; + dma_addr_t dma_addr; + u32 size; }; /* RoCE FW Communication Channels */ @@ -61,9 +124,75 @@ struct bng_re_rcfw { u32 cmdq_depth; /* cached from chip cctx for quick reference in slow path */ u16 max_timeout; + atomic_t rcfw_intr_enabled; }; +struct bng_re_cmdqmsg { + struct cmdq_base *req; + struct creq_base *resp; + void *sb; + u32 req_sz; + u32 res_sz; + u8 block; +}; + +static inline void bng_re_fill_cmdqmsg(struct bng_re_cmdqmsg *msg, + void *req, void *resp, void *sb, + u32 req_sz, u32 res_sz, u8 block) +{ + msg->req = req; + msg->resp = resp; + msg->sb = sb; + msg->req_sz = req_sz; + msg->res_sz = res_sz; + msg->block = block; +} + +/* Get the number of command units required for the req. 
The + * function returns correct value only if called before + * setting using bng_re_set_cmd_slots + */ +static inline u32 bng_re_get_cmd_slots(struct cmdq_base *req) +{ + u32 cmd_units = 0; + + if (HAS_TLV_HEADER(req)) { + struct roce_tlv *tlv_req = (struct roce_tlv *)req; + + cmd_units = tlv_req->total_size; + } else { + cmd_units = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) / + BNG_FW_CMDQE_UNITS; + } + + return cmd_units; +} + +static inline u32 bng_re_set_cmd_slots(struct cmdq_base *req) +{ + u32 cmd_byte = 0; + + if (HAS_TLV_HEADER(req)) { + struct roce_tlv *tlv_req = (struct roce_tlv *)req; + + cmd_byte = tlv_req->total_size * BNG_FW_CMDQE_UNITS; + } else { + cmd_byte = req->cmd_size; + req->cmd_size = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) / + BNG_FW_CMDQE_UNITS; + } + + return cmd_byte; +} + void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw); int bng_re_alloc_fw_channel(struct bng_re_res *res, struct bng_re_rcfw *rcfw); +int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off); +void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw); +int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector, + bool need_init); +void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill); #endif diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h index 170434a75dea..fe003f4705a1 100644 --- a/drivers/infiniband/hw/bng_re/bng_re.h +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -4,6 +4,8 @@ #ifndef __BNG_RE_H__ #define __BNG_RE_H__ +#include "bng_res.h" + #define BNG_RE_ADEV_NAME "bng_en" #define BNG_RE_DESC "Broadcom 800G RoCE Driver" @@ -11,12 +13,54 @@ #define rdev_to_dev(rdev) ((rdev) ? (&(rdev)->ibdev.dev) : NULL) #define BNG_RE_MIN_MSIX 2 +#define BNG_RE_MAX_MSIX BNGE_MAX_ROCE_MSIX + +#define BNG_RE_CREQ_NQ_IDX 0 +/* NQ specific structures */ +struct bng_re_nq_db { + struct bng_re_reg_desc reg; + struct bng_re_db_info dbinfo; +}; + +struct bng_re_nq { + struct pci_dev *pdev; + struct bng_re_res *res; + char *name; + struct bng_re_hwq hwq; + struct bng_re_nq_db nq_db; + u16 ring_id; + int msix_vec; + cpumask_t mask; + struct tasklet_struct nq_tasklet; + bool requested; + int budget; + u32 load; + + struct workqueue_struct *cqn_wq; +}; + +struct bng_re_nq_record { + struct bnge_msix_info msix_entries[BNG_RE_MAX_MSIX]; + struct bng_re_nq nq[BNG_RE_MAX_MSIX]; + int num_msix; + /* serialize NQ access */ + struct mutex load_lock; +}; struct bng_re_en_dev_info { struct bng_re_dev *rdev; struct bnge_auxr_dev *auxr_dev; }; +struct bng_re_ring_attr { + dma_addr_t *dma_arr; + int pages; + int type; + u32 depth; + u32 lrid; /* Logical ring id */ + u8 mode; +}; + struct bng_re_dev { struct ib_device ibdev; unsigned long flags; @@ -28,6 +72,7 @@ struct bng_re_dev { int fn_id; struct bng_re_res bng_res; struct bng_re_rcfw rcfw; + struct bng_re_nq_record *nqr; }; #endif diff --git a/drivers/infiniband/hw/bng_re/bng_res.c b/drivers/infiniband/hw/bng_re/bng_res.c index 2119d1f39b65..cb42c0fd2cdf 100644 --- a/drivers/infiniband/hw/bng_re/bng_res.c +++ b/drivers/infiniband/hw/bng_re/bng_res.c @@ -5,6 +5,7 @@ #include #include +#include #include "bng_res.h" #include "roce_hsi.h" @@ -235,6 +236,7 @@ int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq, hwq->depth = hwq_attr->depth; hwq->max_elements = hwq->depth; hwq->element_size = stride; + hwq->qe_ppg = pg_size / stride; /* For direct access to the elements */ lvl = hwq->level; if (hwq_attr->sginfo->nopte && hwq->level) diff --git 
a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h index e6123abadfad..f40f3477125f 100644 --- a/drivers/infiniband/hw/bng_re/bng_res.h +++ b/drivers/infiniband/hw/bng_re/bng_res.h @@ -4,6 +4,8 @@ #ifndef __BNG_RES_H__ #define __BNG_RES_H__ +#include "roce_hsi.h" + #define BNG_ROCE_FW_MAX_TIMEOUT 60 #define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) @@ -11,6 +13,12 @@ #define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG) #define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG) +#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1)) +#define HWQ_FREE_SLOTS(hwq) (hwq->max_elements - \ + ((HWQ_CMP(hwq->prod, hwq)\ + - HWQ_CMP(hwq->cons, hwq))\ + & (hwq->max_elements - 1))) + #define MAX_PBL_LVL_0_PGS 1 #define MAX_PBL_LVL_1_PGS 512 #define MAX_PBL_LVL_1_PGS_SHIFT 9 @@ -18,6 +26,41 @@ #define MAX_PBL_LVL_2_PGS (256 * 512) #define MAX_PDL_LVL_SHIFT 9 +#define BNG_RE_DBR_VALID (0x1UL << 26) +#define BNG_RE_DBR_EPOCH_SHIFT 24 +#define BNG_RE_DBR_TOGGLE_SHIFT 25 + + +struct bng_re_reg_desc { + u8 bar_id; + resource_size_t bar_base; + unsigned long offset; + void __iomem *bar_reg; + size_t len; +}; + +struct bng_re_db_info { + void __iomem *db; + void __iomem *priv_db; + struct bng_re_hwq *hwq; + u32 xid; + u32 max_slot; + u32 flags; + u8 toggle; +}; + +enum bng_re_db_info_flags_mask { + BNG_RE_FLAG_EPOCH_CONS_SHIFT = 0x0UL, + BNG_RE_FLAG_EPOCH_PROD_SHIFT = 0x1UL, + BNG_RE_FLAG_EPOCH_CONS_MASK = 0x1UL, + BNG_RE_FLAG_EPOCH_PROD_MASK = 0x2UL, +}; + +enum bng_re_db_epoch_flag_shift { + BNG_RE_DB_EPOCH_CONS_SHIFT = BNG_RE_DBR_EPOCH_SHIFT, + BNG_RE_DB_EPOCH_PROD_SHIFT = (BNG_RE_DBR_EPOCH_SHIFT - 1), +}; + struct bng_re_chip_ctx { u16 chip_num; u16 hw_stats_size; @@ -77,6 +120,8 @@ struct bng_re_hwq { u16 element_size; u32 prod; u32 cons; + /* queue entry per page */ + u16 qe_ppg; }; struct bng_re_res { @@ -84,6 +129,63 @@ struct bng_re_res { struct bng_re_chip_ctx *cctx; }; +static inline void *bng_re_get_qe(struct bng_re_hwq *hwq, + u32 indx, u64 *pg) +{ + u32 pg_num, pg_idx; + + pg_num = (indx / hwq->qe_ppg); + pg_idx = (indx % hwq->qe_ppg); + if (pg) + *pg = (u64)&hwq->pbl_ptr[pg_num]; + return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx); +} + +#define BNG_RE_INIT_DBHDR(xid, type, indx, toggle) \ + (((u64)(((xid) & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | \ + (type) | BNG_RE_DBR_VALID) << 32) | (indx) | \ + (((u32)(toggle)) << (BNG_RE_DBR_TOGGLE_SHIFT))) + +static inline void bng_re_ring_db(struct bng_re_db_info *info, + u32 type) +{ + u64 key = 0; + u32 indx; + u8 toggle = 0; + + if (type == DBC_DBC_TYPE_CQ_ARMALL || + type == DBC_DBC_TYPE_CQ_ARMSE) + toggle = info->toggle; + + indx = (info->hwq->cons & DBC_DBC_INDEX_MASK) | + ((info->flags & BNG_RE_FLAG_EPOCH_CONS_MASK) << + BNG_RE_DB_EPOCH_CONS_SHIFT); + + key = BNG_RE_INIT_DBHDR(info->xid, type, indx, toggle); + writeq(key, info->db); +} + +static inline void bng_re_ring_nq_db(struct bng_re_db_info *info, + struct bng_re_chip_ctx *cctx, + bool arm) +{ + u32 type; + + type = arm ? 
DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ; + bng_re_ring_db(info, type); +} + +static inline void bng_re_hwq_incr_cons(u32 max_elements, u32 *cons, u32 cnt, + u32 *dbinfo_flags) +{ + /* move cons and update toggle/epoch if wrap around */ + *cons += cnt; + if (*cons >= max_elements) { + *cons %= max_elements; + *dbinfo_flags ^= 1UL << BNG_RE_FLAG_EPOCH_CONS_SHIFT; + } +} + void bng_re_free_hwq(struct bng_re_res *res, struct bng_re_hwq *hwq); diff --git a/drivers/infiniband/hw/bng_re/bng_tlv.h b/drivers/infiniband/hw/bng_re/bng_tlv.h new file mode 100644 index 000000000000..278f4922962d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_tlv.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +#ifndef __BNG_TLV_H__ +#define __BNG_TLV_H__ + +#include "roce_hsi.h" + +struct roce_tlv { + struct tlv tlv; + u8 total_size; // in units of 16 byte chunks + u8 unused[7]; // for 16 byte alignment +}; + +/* + * TLV size in units of 16 byte chunks + */ +#define TLV_SIZE ((sizeof(struct roce_tlv) + 15) / 16) +/* + * TLV length in bytes + */ +#define TLV_BYTES (TLV_SIZE * 16) + +#define HAS_TLV_HEADER(msg) (le16_to_cpu(((struct tlv *)(msg))->cmd_discr) == CMD_DISCR_TLV_ENCAP) +#define GET_TLV_DATA(tlv) ((void *)&((uint8_t *)(tlv))[TLV_BYTES]) + +static inline u8 __get_cmdq_base_opcode(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->opcode; + else + return req->opcode; +} + +static inline void __set_cmdq_base_opcode(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->opcode = val; + else + req->opcode = val; +} + +static inline __le16 __get_cmdq_base_cookie(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->cookie; + else + return req->cookie; +} + +static inline void __set_cmdq_base_cookie(struct cmdq_base *req, + u32 size, __le16 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->cookie = val; + else + req->cookie = val; +} + +static inline __le64 __get_cmdq_base_resp_addr(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr; + else + return req->resp_addr; +} + +static inline void __set_cmdq_base_resp_addr(struct cmdq_base *req, + u32 size, __le64 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr = val; + else + req->resp_addr = val; +} + +static inline u8 __get_cmdq_base_resp_size(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size; + else + return req->resp_size; +} + +static inline void __set_cmdq_base_resp_size(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size = val; + else + req->resp_size = val; +} + +static inline u8 __get_cmdq_base_cmd_size(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct roce_tlv *)(req))->total_size; + else + return req->cmd_size; +} + +static inline void __set_cmdq_base_cmd_size(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->cmd_size = val; + else + req->cmd_size = val; +} + +static inline 
__le16 __get_cmdq_base_flags(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->flags; + else + return req->flags; +} + +static inline void __set_cmdq_base_flags(struct cmdq_base *req, + u32 size, __le16 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->flags = val; + else + req->flags = val; +} + +#endif /* __BNG_TLV_H__ */ From 53c6ee7d7f68a0679f8ddc703338aa2550d24a17 Mon Sep 17 00:00:00 2001 From: Siva Reddy Kallam Date: Mon, 17 Nov 2025 17:11:24 +0000 Subject: [PATCH 37/65] RDMA/bng_re: Enable Firmware channel and query device attributes Enable Firmware channel and query device attributes Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20251117171136.128193-7-siva.kallam@broadcom.com Reviewed-by: Usman Ansari Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/Makefile | 2 +- drivers/infiniband/hw/bng_re/bng_dev.c | 18 ++ drivers/infiniband/hw/bng_re/bng_fw.c | 279 ++++++++++++++++++++++++- drivers/infiniband/hw/bng_re/bng_fw.h | 9 + drivers/infiniband/hw/bng_re/bng_re.h | 2 + drivers/infiniband/hw/bng_re/bng_res.h | 7 + drivers/infiniband/hw/bng_re/bng_sp.c | 131 ++++++++++++ drivers/infiniband/hw/bng_re/bng_sp.h | 47 +++++ 8 files changed, 491 insertions(+), 4 deletions(-) create mode 100644 drivers/infiniband/hw/bng_re/bng_sp.c create mode 100644 drivers/infiniband/hw/bng_re/bng_sp.h diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile index 1b957defbabc..556b763b43f9 100644 --- a/drivers/infiniband/hw/bng_re/Makefile +++ b/drivers/infiniband/hw/bng_re/Makefile @@ -4,4 +4,4 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge -I $(srctree)/driv obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o bng_re-y := bng_dev.o bng_fw.o \ - bng_res.o + bng_res.o bng_sp.o diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c index 12f86a0e7985..dceb973be7cb 100644 --- a/drivers/infiniband/hw/bng_re/bng_dev.c +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -8,6 +8,7 @@ #include #include "bng_res.h" +#include "bng_sp.h" #include "bng_fw.h" #include "bnge.h" #include "bnge_auxr.h" @@ -55,6 +56,9 @@ static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev) if (!rdev->chip_ctx) return; + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; + chip_ctx = rdev->chip_ctx; rdev->chip_ctx = NULL; rdev->rcfw.res = NULL; @@ -67,6 +71,7 @@ static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev) { struct bng_re_chip_ctx *chip_ctx; struct bnge_auxr_dev *aux_dev; + int rc = -ENOMEM; aux_dev = rdev->aux_dev; rdev->bng_res.pdev = aux_dev->pdev; @@ -79,8 +84,16 @@ static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev) rdev->chip_ctx = chip_ctx; rdev->bng_res.cctx = rdev->chip_ctx; + rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL); + if (!rdev->dev_attr) + goto free_chip_ctx; + rdev->bng_res.dattr = rdev->dev_attr; return 0; +free_chip_ctx: + kfree(rdev->chip_ctx); + rdev->chip_ctx = NULL; + return rc; } static void bng_re_init_hwrm_hdr(struct input *hdr, u16 opcd) @@ -298,7 +311,12 @@ static int bng_re_dev_init(struct bng_re_dev *rdev) goto free_ring; } + rc = bng_re_get_dev_attr(&rdev->rcfw); + if (rc) + goto disable_rcfw; return 0; +disable_rcfw: + bng_re_disable_rcfw_channel(&rdev->rcfw); free_ring: bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type); free_rcfw: diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c 
b/drivers/infiniband/hw/bng_re/bng_fw.c
index 1bfed58e6703..f16fd21dfbce 100644
--- a/drivers/infiniband/hw/bng_re/bng_fw.c
+++ b/drivers/infiniband/hw/bng_re/bng_fw.c
@@ -6,6 +6,49 @@
 #include "bng_res.h"
 #include "bng_fw.h"
 
+/**
+ * bng_re_map_rc - map return type based on opcode
+ * @opcode: roce slow path opcode
+ *
+ * case #1
+ * Firmware-initiated error recovery is a safe state machine, and the
+ * driver can consider all the underlying rdma resources free.
+ * In this state, it is safe to return success for opcodes related to
+ * destroying rdma resources (like destroy qp, destroy cq etc.).
+ *
+ * case #2
+ * If the driver detects a potential firmware stall, the state machine is
+ * not safe, and the driver cannot consider the underlying rdma resources
+ * freed.
+ * In this state, it is not safe to return success for opcodes related to
+ * destroying rdma resources (like destroy qp, destroy cq etc.).
+ *
+ * The scope of this helper function is limited to case #1.
+ *
+ * Returns:
+ * 0 to communicate success to the caller.
+ * Non-zero error code to communicate failure to the caller.
+ */
+static int bng_re_map_rc(u8 opcode)
+{
+	switch (opcode) {
+	case CMDQ_BASE_OPCODE_DESTROY_QP:
+	case CMDQ_BASE_OPCODE_DESTROY_SRQ:
+	case CMDQ_BASE_OPCODE_DESTROY_CQ:
+	case CMDQ_BASE_OPCODE_DEALLOCATE_KEY:
+	case CMDQ_BASE_OPCODE_DEREGISTER_MR:
+	case CMDQ_BASE_OPCODE_DELETE_GID:
+	case CMDQ_BASE_OPCODE_DESTROY_QP1:
+	case CMDQ_BASE_OPCODE_DESTROY_AH:
+	case CMDQ_BASE_OPCODE_DEINITIALIZE_FW:
+	case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC:
+	case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE:
+		return 0;
+	default:
+		return -ETIMEDOUT;
+	}
+}
+
 void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw)
 {
 	kfree(rcfw->crsqe_tbl);
@@ -168,8 +211,6 @@ static int bng_re_process_func_event(struct bng_re_rcfw *rcfw,
 	return 0;
 }
 
-
-
 /* CREQ Completion handlers */
 static void bng_re_service_creq(struct tasklet_struct *t)
 {
@@ -229,6 +270,214 @@ static void bng_re_service_creq(struct tasklet_struct *t)
 	wake_up_nr(&rcfw->cmdq.waitq, num_wakeup);
 }
 
+static int __send_message_basic_sanity(struct bng_re_rcfw *rcfw,
+				       struct bng_re_cmdqmsg *msg,
+				       u8 opcode)
+{
+	struct bng_re_cmdq_ctx *cmdq;
+
+	cmdq = &rcfw->cmdq;
+
+	if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
+		return -ETIMEDOUT;
+
+	if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
+	    opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
+		dev_err(&rcfw->pdev->dev, "RCFW already initialized!");
+		return -EINVAL;
+	}
+
+	if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
+	    (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
+	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
+	     opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
+		dev_err(&rcfw->pdev->dev,
+			"RCFW not initialized, reject opcode 0x%x",
+			opcode);
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int __send_message(struct bng_re_rcfw *rcfw,
+			  struct bng_re_cmdqmsg *msg, u8 opcode)
+{
+	u32 bsize, free_slots, required_slots;
+	struct bng_re_cmdq_ctx *cmdq;
+	struct bng_re_crsqe *crsqe;
+	struct bng_fw_cmdqe *cmdqe;
+	struct bng_re_hwq *hwq;
+	u32 sw_prod, cmdq_prod;
+	struct pci_dev *pdev;
+	u16 cookie;
+	u8 *preq;
+
+	cmdq = &rcfw->cmdq;
+	hwq = &cmdq->hwq;
+	pdev = rcfw->pdev;
+
+	/* Cmdq entries are in 16-byte units; each request can consume
+	 * 1 or more cmdqe
+	 */
+	spin_lock_bh(&hwq->lock);
+	required_slots = bng_re_get_cmd_slots(msg->req);
+	free_slots = HWQ_FREE_SLOTS(hwq);
+	cookie = cmdq->seq_num & BNG_FW_MAX_COOKIE_VALUE;
+	crsqe = &rcfw->crsqe_tbl[cookie];
+
+	if (required_slots >= free_slots) {
+		dev_info_ratelimited(&pdev->dev,
+				     "CMDQ is full req/free %d/%d!",
+				     required_slots, free_slots);
+		spin_unlock_bh(&hwq->lock);
+		return -EAGAIN;
+	}
+	__set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie));
+
+	bsize = bng_re_set_cmd_slots(msg->req);
+	crsqe->free_slots = free_slots;
+	crsqe->resp = (struct creq_qp_event *)msg->resp;
+	crsqe->is_waiter_alive = true;
+	crsqe->is_in_used = true;
+	crsqe->opcode = opcode;
+
+	crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz);
+	if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) {
+		struct bng_re_rcfw_sbuf *sbuf = msg->sb;
+
+		__set_cmdq_base_resp_addr(msg->req, msg->req_sz,
+					  cpu_to_le64(sbuf->dma_addr));
+		__set_cmdq_base_resp_size(msg->req, msg->req_sz,
+					  ALIGN(sbuf->size,
+						BNG_FW_CMDQE_UNITS) /
+					  BNG_FW_CMDQE_UNITS);
+	}
+
+	preq = (u8 *)msg->req;
+	do {
+		/* Locate the next cmdq slot */
+		sw_prod = HWQ_CMP(hwq->prod, hwq);
+		cmdqe = bng_re_get_qe(hwq, sw_prod, NULL);
+		/* Copy a segment of the req cmd to the cmdq */
+		memset(cmdqe, 0, sizeof(*cmdqe));
+		memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe)));
+		preq += min_t(u32, bsize, sizeof(*cmdqe));
+		bsize -= min_t(u32, bsize, sizeof(*cmdqe));
+		hwq->prod++;
+	} while (bsize > 0);
+	cmdq->seq_num++;
+
+	cmdq_prod = hwq->prod & 0xFFFF;
+	if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) {
+		/* The very first doorbell write
+		 * is required to set this flag
+		 * which prompts the FW to reset
+		 * its internal pointers
+		 */
+		cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG);
+		clear_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+	}
+	/* ring CMDQ DB */
+	wmb();
+	writel(cmdq_prod, cmdq->cmdq_mbox.prod);
+	writel(BNG_FW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db);
+	spin_unlock_bh(&hwq->lock);
+	/* The completion is reported back through CREQ */
+	return 0;
+}
+
+/**
+ * __wait_for_resp - Don't hold the CPU context; wait for the response
+ * @rcfw: rcfw channel instance of rdev
+ * @cookie: cookie to track the command
+ *
+ * Wait for command completion in sleepable context.
+ *
+ * Returns:
+ * 0 if the command is completed by firmware.
+ * Non-zero error code otherwise.
+ */
+static int __wait_for_resp(struct bng_re_rcfw *rcfw, u16 cookie)
+{
+	struct bng_re_cmdq_ctx *cmdq;
+	struct bng_re_crsqe *crsqe;
+
+	cmdq = &rcfw->cmdq;
+	crsqe = &rcfw->crsqe_tbl[cookie];
+
+	do {
+		wait_event_timeout(cmdq->waitq,
+				   !crsqe->is_in_used,
+				   secs_to_jiffies(rcfw->max_timeout));
+
+		if (!crsqe->is_in_used)
+			return 0;
+
+		bng_re_service_creq(&rcfw->creq.creq_tasklet);
+
+		if (!crsqe->is_in_used)
+			return 0;
+	} while (true);
+}
+
+/**
+ * bng_re_rcfw_send_message - interface to send
+ * and complete an rcfw command.
+ * @rcfw: rcfw channel instance of rdev
+ * @msg: message to send
+ *
+ * This function does not account for the shadow queue depth. It will send
+ * all commands unconditionally as long as the send queue is not full.
+ *
+ * Returns:
+ * 0 if the command is completed by firmware.
+ * Non-zero if the command is not completed by firmware.
+ */
+int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw,
+			     struct bng_re_cmdqmsg *msg)
+{
+	struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp;
+	struct bng_re_crsqe *crsqe;
+	u16 cookie;
+	int rc;
+	u8 opcode;
+
+	opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz);
+
+	rc = __send_message_basic_sanity(rcfw, msg, opcode);
+	if (rc)
+		return rc == -ENXIO ?
bng_re_map_rc(opcode) : rc; + + rc = __send_message(rcfw, msg, opcode); + if (rc) + return rc; + + cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) + & BNG_FW_MAX_COOKIE_VALUE; + + rc = __wait_for_resp(rcfw, cookie); + + if (rc) { + spin_lock_bh(&rcfw->cmdq.hwq.lock); + crsqe = &rcfw->crsqe_tbl[cookie]; + crsqe->is_waiter_alive = false; + if (rc == -ENODEV) + set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); + spin_unlock_bh(&rcfw->cmdq.hwq.lock); + return -ETIMEDOUT; + } + + if (evnt->status) { + /* failed with status */ + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", + cookie, opcode, evnt->status); + rc = -EIO; + } + + return rc; +} + static int bng_re_map_cmdq_mbox(struct bng_re_rcfw *rcfw) { struct bng_re_cmdq_mbox *mbox; @@ -278,7 +527,6 @@ static irqreturn_t bng_re_creq_irq(int irq, void *dev_instance) prefetch(bng_re_get_qe(hwq, sw_cons, NULL)); tasklet_schedule(&creq->creq_tasklet); - return IRQ_HANDLED; } @@ -395,6 +643,30 @@ void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw) creq->msix_vec = 0; } +static void bng_re_start_rcfw(struct bng_re_rcfw *rcfw) +{ + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_creq_ctx *creq; + struct bng_re_cmdq_mbox *mbox; + struct cmdq_init init = {0}; + + cmdq = &rcfw->cmdq; + creq = &rcfw->creq; + mbox = &cmdq->cmdq_mbox; + + init.cmdq_pbl = cpu_to_le64(cmdq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr[0]); + init.cmdq_size_cmdq_lvl = + cpu_to_le16(((rcfw->cmdq_depth << + CMDQ_INIT_CMDQ_SIZE_SFT) & + CMDQ_INIT_CMDQ_SIZE_MASK) | + ((cmdq->hwq.level << + CMDQ_INIT_CMDQ_LVL_SFT) & + CMDQ_INIT_CMDQ_LVL_MASK)); + init.creq_ring_id = cpu_to_le16(creq->ring_id); + /* Write to the mailbox register */ + __iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4); +} + int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw, int msix_vector, int cp_bar_reg_off) @@ -425,5 +697,6 @@ int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw, return rc; } + bng_re_start_rcfw(rcfw); return 0; } diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h index d1773832b592..88476d6c1d07 100644 --- a/drivers/infiniband/hw/bng_re/bng_fw.h +++ b/drivers/infiniband/hw/bng_re/bng_fw.h @@ -136,6 +136,13 @@ struct bng_re_cmdqmsg { u8 block; }; +static inline void bng_re_rcfw_cmd_prep(struct cmdq_base *req, + u8 opcode, u8 cmd_size) +{ + req->opcode = opcode; + req->cmd_size = cmd_size; +} + static inline void bng_re_fill_cmdqmsg(struct bng_re_cmdqmsg *msg, void *req, void *resp, void *sb, u32 req_sz, u32 res_sz, u8 block) @@ -195,4 +202,6 @@ void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw); int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector, bool need_init); void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill); +int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg); #endif diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h index fe003f4705a1..48968beed46e 100644 --- a/drivers/infiniband/hw/bng_re/bng_re.h +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -73,6 +73,8 @@ struct bng_re_dev { struct bng_re_res bng_res; struct bng_re_rcfw rcfw; struct bng_re_nq_record *nqr; + /* Device Resources */ + struct bng_re_dev_attr *dev_attr; }; #endif diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h index f40f3477125f..7315db347aa6 100644 --- a/drivers/infiniband/hw/bng_re/bng_res.h +++ b/drivers/infiniband/hw/bng_re/bng_res.h @@ -30,6 +30,7 @@ #define BNG_RE_DBR_EPOCH_SHIFT 24 
#define BNG_RE_DBR_TOGGLE_SHIFT 25 +#define BNG_MAX_TQM_ALLOC_REQ 48 struct bng_re_reg_desc { u8 bar_id; @@ -127,6 +128,7 @@ struct bng_re_hwq { struct bng_re_res { struct pci_dev *pdev; struct bng_re_chip_ctx *cctx; + struct bng_re_dev_attr *dattr; }; static inline void *bng_re_get_qe(struct bng_re_hwq *hwq, @@ -186,6 +188,11 @@ static inline void bng_re_hwq_incr_cons(u32 max_elements, u32 *cons, u32 cnt, } } +static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) +{ + return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); +} + void bng_re_free_hwq(struct bng_re_res *res, struct bng_re_hwq *hwq); diff --git a/drivers/infiniband/hw/bng_re/bng_sp.c b/drivers/infiniband/hw/bng_re/bng_sp.c new file mode 100644 index 000000000000..83099e05328d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_sp.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include +#include + +#include "bng_res.h" +#include "bng_fw.h" +#include "bng_sp.h" +#include "bng_tlv.h" + +static bool bng_re_is_atomic_cap(struct bng_re_rcfw *rcfw) +{ + u16 pcie_ctl2 = 0; + + pcie_capability_read_word(rcfw->pdev, PCI_EXP_DEVCTL2, &pcie_ctl2); + return (pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ); +} + +static void bng_re_query_version(struct bng_re_rcfw *rcfw, + char *fw_ver) +{ + struct creq_query_version_resp resp = {}; + struct bng_re_cmdqmsg msg = {}; + struct cmdq_query_version req = {}; + int rc; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_QUERY_VERSION, + sizeof(req)); + + bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + return; + fw_ver[0] = resp.fw_maj; + fw_ver[1] = resp.fw_minor; + fw_ver[2] = resp.fw_bld; + fw_ver[3] = resp.fw_rsvd; +} + +int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw) +{ + struct bng_re_dev_attr *attr = rcfw->res->dattr; + struct creq_query_func_resp resp = {}; + struct bng_re_cmdqmsg msg = {}; + struct creq_query_func_resp_sb *sb; + struct bng_re_rcfw_sbuf sbuf; + struct cmdq_query_func req = {}; + u8 *tqm_alloc; + int i, rc; + u32 temp; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_QUERY_FUNC, + sizeof(req)); + + sbuf.size = ALIGN(sizeof(*sb), BNG_FW_CMDQE_UNITS); + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + sb = sbuf.sb; + req.resp_size = sbuf.size / BNG_FW_CMDQE_UNITS; + bng_re_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + goto bail; + /* Extract the context from the side buffer */ + attr->max_qp = le32_to_cpu(sb->max_qp); + /* max_qp value reported by FW doesn't include the QP1 */ + attr->max_qp += 1; + attr->max_qp_rd_atom = + sb->max_qp_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ? + BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom; + attr->max_qp_init_rd_atom = + sb->max_qp_init_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ? 
+			BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom;
+	attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1;
+
+	/* Adjust max_qp_wqes for variable wqe */
+	attr->max_qp_wqes = min_t(u32, attr->max_qp_wqes, BNG_VAR_MAX_WQE - 1);
+
+	attr->max_qp_sges = min_t(u32, sb->max_sge_var_wqe, BNG_VAR_MAX_SGE);
+	attr->max_cq = le32_to_cpu(sb->max_cq);
+	attr->max_cq_wqes = le32_to_cpu(sb->max_cqe);
+	attr->max_cq_sges = attr->max_qp_sges;
+	attr->max_mr = le32_to_cpu(sb->max_mr);
+	attr->max_mw = le32_to_cpu(sb->max_mw);
+
+	attr->max_mr_size = le64_to_cpu(sb->max_mr_size);
+	attr->max_pd = 64 * 1024;
+	attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp);
+	attr->max_ah = le32_to_cpu(sb->max_ah);
+
+	attr->max_srq = le16_to_cpu(sb->max_srq);
+	attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
+	attr->max_srq_sges = sb->max_srq_sge;
+	attr->max_pkey = 1;
+	attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
+	/*
+	 * Read the max gid supported by HW.
+	 * For each entry in the HW GID table, we consume 2
+	 * GID entries in the kernel GID table. So the max_gid reported
+	 * to the stack can be up to twice the value reported by the HW,
+	 * capped at 256 GIDs.
+	 */
+	attr->max_sgid = le32_to_cpu(sb->max_gid);
+	attr->max_sgid = min_t(u32, BNG_RE_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid);
+	attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags);
+	attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2);
+
+	if (_is_max_srq_ext_supported(attr->dev_cap_flags2))
+		attr->max_srq += le16_to_cpu(sb->max_srq_ext);
+
+	bng_re_query_version(rcfw, attr->fw_ver);
+	for (i = 0; i < BNG_MAX_TQM_ALLOC_REQ / 4; i++) {
+		temp = le32_to_cpu(sb->tqm_alloc_reqs[i]);
+		tqm_alloc = (u8 *)&temp;
+		attr->tqm_alloc_reqs[i * 4] = *tqm_alloc;
+		attr->tqm_alloc_reqs[i * 4 + 1] = *(++tqm_alloc);
+		attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc);
+		attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc);
+	}
+
+	attr->max_dpi = le32_to_cpu(sb->max_dpi);
+	attr->is_atomic = bng_re_is_atomic_cap(rcfw);
+bail:
+	dma_free_coherent(&rcfw->pdev->dev, sbuf.size,
+			  sbuf.sb, sbuf.dma_addr);
+	return rc;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_sp.h b/drivers/infiniband/hw/bng_re/bng_sp.h
new file mode 100644
index 000000000000..e15190515ed1
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_sp.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_SP_H__
+#define __BNG_SP_H__
+
+#include "bng_fw.h"
+
+#define BNG_VAR_MAX_WQE 4352
+#define BNG_VAR_MAX_SGE 13
+
+struct bng_re_dev_attr {
+#define FW_VER_ARR_LEN 4
+	u8 fw_ver[FW_VER_ARR_LEN];
+#define BNG_RE_NUM_GIDS_SUPPORTED 256
+	u16 max_sgid;
+	u16 max_mrw;
+	u32 max_qp;
+#define BNG_RE_MAX_OUT_RD_ATOM 126
+	u32 max_qp_rd_atom;
+	u32 max_qp_init_rd_atom;
+	u32 max_qp_wqes;
+	u32 max_qp_sges;
+	u32 max_cq;
+	u32 max_cq_wqes;
+	u32 max_cq_sges;
+	u32 max_mr;
+	u64 max_mr_size;
+	u32 max_pd;
+	u32 max_mw;
+	u32 max_raw_ethy_qp;
+	u32 max_ah;
+	u32 max_srq;
+	u32 max_srq_wqes;
+	u32 max_srq_sges;
+	u32 max_pkey;
+	u32 max_inline_data;
+	u32 l2_db_size;
+	u8 tqm_alloc_reqs[BNG_MAX_TQM_ALLOC_REQ];
+	bool is_atomic;
+	u16 dev_cap_flags;
+	u16 dev_cap_flags2;
+	u32 max_dpi;
+};
+
+int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw);
+#endif

From 99e4e102833765483ae12027719be09c472f0ca9 Mon Sep 17 00:00:00 2001
From: Siva Reddy Kallam
Date: Mon, 17 Nov 2025 17:11:25 +0000
Subject: [PATCH 38/65] RDMA/bng_re: Add basic debugfs infrastructure

Add basic debugfs infrastructure for the Broadcom next-generation
controller.
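The per-device directory is created empty by this patch; subsequent
work can hang files off rdev->dbg_root. As a rough sketch of such a
consumer (the "info" attribute below is hypothetical and not part of
this series; it would additionally need linux/seq_file.h):

static int bng_re_info_show(struct seq_file *s, void *unused)
{
	struct bng_re_dev *rdev = s->private;

	/* dump one identifying field; any per-device state fits here */
	seq_printf(s, "chip_num: 0x%x\n", rdev->chip_ctx->chip_num);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(bng_re_info);

	/* in bng_re_debugfs_add_pdev(), after dbg_root is created */
	debugfs_create_file("info", 0444, rdev->dbg_root, rdev,
			    &bng_re_info_fops);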
Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20251117171136.128193-8-siva.kallam@broadcom.com Reviewed-by: Usman Ansari Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/Makefile | 3 +- drivers/infiniband/hw/bng_re/bng_debugfs.c | 39 ++++++++++++++++++++++ drivers/infiniband/hw/bng_re/bng_debugfs.h | 12 +++++++ drivers/infiniband/hw/bng_re/bng_dev.c | 12 +++++++ drivers/infiniband/hw/bng_re/bng_re.h | 1 + 5 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/bng_re/bng_debugfs.c create mode 100644 drivers/infiniband/hw/bng_re/bng_debugfs.h diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile index 556b763b43f9..c6aaaf853c77 100644 --- a/drivers/infiniband/hw/bng_re/Makefile +++ b/drivers/infiniband/hw/bng_re/Makefile @@ -4,4 +4,5 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge -I $(srctree)/driv obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o bng_re-y := bng_dev.o bng_fw.o \ - bng_res.o bng_sp.o + bng_res.o bng_sp.o \ + bng_debugfs.o diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.c b/drivers/infiniband/hw/bng_re/bng_debugfs.c new file mode 100644 index 000000000000..9ec5a8785250 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_debugfs.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include +#include + +#include + +#include "bng_res.h" +#include "bng_fw.h" +#include "bnge.h" +#include "bnge_auxr.h" +#include "bng_re.h" +#include "bng_debugfs.h" + +static struct dentry *bng_re_debugfs_root; + +void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev) +{ + struct pci_dev *pdev = rdev->aux_dev->pdev; + + rdev->dbg_root = + debugfs_create_dir(dev_name(&pdev->dev), bng_re_debugfs_root); +} + +void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->dbg_root); + rdev->dbg_root = NULL; +} + +void bng_re_register_debugfs(void) +{ + bng_re_debugfs_root = debugfs_create_dir("bng_re", NULL); +} + +void bng_re_unregister_debugfs(void) +{ + debugfs_remove(bng_re_debugfs_root); +} diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.h b/drivers/infiniband/hw/bng_re/bng_debugfs.h new file mode 100644 index 000000000000..baef71df4242 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_debugfs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. 
+ +#ifndef __BNG_RE_DEBUGFS__ +#define __BNG_RE_DEBUGFS__ + +void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev); +void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev); + +void bng_re_register_debugfs(void); +void bng_re_unregister_debugfs(void); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c index dceb973be7cb..9f5efe59d105 100644 --- a/drivers/infiniband/hw/bng_re/bng_dev.c +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -14,6 +14,7 @@ #include "bnge_auxr.h" #include "bng_re.h" #include "bnge_hwrm.h" +#include "bng_debugfs.h" MODULE_AUTHOR("Siva Reddy Kallam "); MODULE_DESCRIPTION(BNG_RE_DESC); @@ -215,6 +216,7 @@ static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) static void bng_re_dev_uninit(struct bng_re_dev *rdev) { + bng_re_debugfs_rem_pdev(rdev); bng_re_disable_rcfw_channel(&rdev->rcfw); bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, RING_ALLOC_REQ_RING_TYPE_NQ); @@ -314,6 +316,9 @@ static int bng_re_dev_init(struct bng_re_dev *rdev) rc = bng_re_get_dev_attr(&rdev->rcfw); if (rc) goto disable_rcfw; + + bng_re_debugfs_add_pdev(rdev); + return 0; disable_rcfw: bng_re_disable_rcfw_channel(&rdev->rcfw); @@ -419,17 +424,24 @@ static int __init bng_re_mod_init(void) int rc; + bng_re_register_debugfs(); + rc = auxiliary_driver_register(&bng_re_driver); if (rc) { pr_err("%s: Failed to register auxiliary driver\n", KBUILD_MODNAME); + goto unreg_debugfs; } + return 0; +unreg_debugfs: + bng_re_unregister_debugfs(); return rc; } static void __exit bng_re_mod_exit(void) { auxiliary_driver_unregister(&bng_re_driver); + bng_re_unregister_debugfs(); } module_init(bng_re_mod_init); diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h index 48968beed46e..2faeb9d3e6e3 100644 --- a/drivers/infiniband/hw/bng_re/bng_re.h +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -75,6 +75,7 @@ struct bng_re_dev { struct bng_re_nq_record *nqr; /* Device Resources */ struct bng_re_dev_attr *dev_attr; + struct dentry *dbg_root; }; #endif From 04e031ff6e60fc2775f18f963ebfe00b2573d0fb Mon Sep 17 00:00:00 2001 From: Siva Reddy Kallam Date: Mon, 17 Nov 2025 17:11:26 +0000 Subject: [PATCH 39/65] RDMA/bng_re: Initialize the Firmware and Hardware Initialize the firmware and hardware with HWRM command. 
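For reference, the bring-up order added by this patch and its
mirror-image teardown, as a sketch (error unwinding omitted; see
bng_re_dev_init()/bng_re_dev_uninit() in the diff for the real flow):

	/* bring-up */
	bng_re_alloc_stats_ctx_mem(pdev, chip_ctx, &rdev->stats_ctx);
	bng_re_stats_ctx_alloc(rdev);               /* HWRM_STAT_CTX_ALLOC */
	bng_re_init_rcfw(&rdev->rcfw, &rdev->stats_ctx); /* INITIALIZE_FW */

	/* teardown, in reverse order */
	bng_re_deinit_rcfw(&rdev->rcfw);            /* DEINITIALIZE_FW */
	bng_re_stats_ctx_free(rdev);                /* HWRM_STAT_CTX_FREE */
	bng_re_free_stats_ctx_mem(pdev, &rdev->stats_ctx);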
Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20251117171136.128193-9-siva.kallam@broadcom.com Reviewed-by: Usman Ansari Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/bng_dev.c | 92 +++++++++++++++++++++++++- drivers/infiniband/hw/bng_re/bng_fw.c | 65 ++++++++++++++++++ drivers/infiniband/hw/bng_re/bng_fw.h | 4 ++ drivers/infiniband/hw/bng_re/bng_re.h | 4 ++ drivers/infiniband/hw/bng_re/bng_res.c | 27 ++++++++ drivers/infiniband/hw/bng_re/bng_res.h | 14 ++++ 6 files changed, 203 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c index 9f5efe59d105..d8f8d7f7075f 100644 --- a/drivers/infiniband/hw/bng_re/bng_dev.c +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -178,6 +178,56 @@ static int bng_re_net_ring_alloc(struct bng_re_dev *rdev, return rc; } +static int bng_re_stats_ctx_free(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_stat_ctx_free_input req = {}; + struct hwrm_stat_ctx_free_output resp = {}; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE); + req.stat_ctx_id = cpu_to_le32(rdev->stats_ctx.fw_id); + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) + ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x", + rc); + + return rc; +} + +static int bng_re_stats_ctx_alloc(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct bng_re_stats *stats = &rdev->stats_ctx; + struct hwrm_stat_ctx_alloc_output resp = {}; + struct hwrm_stat_ctx_alloc_input req = {}; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + stats->fw_id = BNGE_INVALID_STATS_CTX_ID; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC); + req.update_period_ms = cpu_to_le32(1000); + req.stats_dma_addr = cpu_to_le64(stats->dma_map); + req.stats_dma_length = cpu_to_le16(rdev->chip_ctx->hw_stats_size); + req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE; + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (!rc) + stats->fw_id = le32_to_cpu(resp.stat_ctx_id); + return rc; +} + static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) { struct bnge_auxr_dev *aux_dev = rdev->aux_dev; @@ -216,11 +266,21 @@ static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) static void bng_re_dev_uninit(struct bng_re_dev *rdev) { + int rc; bng_re_debugfs_rem_pdev(rdev); - bng_re_disable_rcfw_channel(&rdev->rcfw); - bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, + + if (test_and_clear_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { + rc = bng_re_deinit_rcfw(&rdev->rcfw); + if (rc) + ibdev_warn(&rdev->ibdev, + "Failed to deinitialize RCFW: %#x", rc); + bng_re_stats_ctx_free(rdev); + bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx); + bng_re_disable_rcfw_channel(&rdev->rcfw); + bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, RING_ALLOC_REQ_RING_TYPE_NQ); - bng_re_free_rcfw_channel(&rdev->rcfw); + bng_re_free_rcfw_channel(&rdev->rcfw); + } kfree(rdev->nqr); rdev->nqr = NULL; @@ -318,8 +378,34 @@ static int bng_re_dev_init(struct bng_re_dev *rdev) goto disable_rcfw; bng_re_debugfs_add_pdev(rdev); + rc = bng_re_alloc_stats_ctx_mem(rdev->bng_res.pdev, 
rdev->chip_ctx,
+				       &rdev->stats_ctx);
+	if (rc) {
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate stats context: %#x\n", rc);
+		goto disable_rcfw;
+	}
+
+	rc = bng_re_stats_ctx_alloc(rdev);
+	if (rc) {
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate HW stats context: %#x\n", rc);
+		goto free_stats_ctx;
+	}
+
+	rc = bng_re_init_rcfw(&rdev->rcfw, &rdev->stats_ctx);
+	if (rc) {
+		ibdev_err(&rdev->ibdev,
+			  "Failed to initialize RCFW: %#x\n", rc);
+		goto free_sctx;
+	}
+	set_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags);
 
 	return 0;
+free_sctx:
+	bng_re_stats_ctx_free(rdev);
+free_stats_ctx:
+	bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx);
 disable_rcfw:
 	bng_re_disable_rcfw_channel(&rdev->rcfw);
 free_ring:
diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c
index f16fd21dfbce..803610fb9c58 100644
--- a/drivers/infiniband/hw/bng_re/bng_fw.c
+++ b/drivers/infiniband/hw/bng_re/bng_fw.c
@@ -5,6 +5,7 @@
 #include "roce_hsi.h"
 #include "bng_res.h"
 #include "bng_fw.h"
+#include "bng_sp.h"
 
 /**
  * bng_re_map_rc - map return type based on opcode
@@ -700,3 +701,67 @@ int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw,
 	bng_re_start_rcfw(rcfw);
 	return 0;
 }
+
+int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw)
+{
+	struct creq_deinitialize_fw_resp resp = {};
+	struct cmdq_deinitialize_fw req = {};
+	struct bng_re_cmdqmsg msg = {};
+	int rc;
+
+	bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+			     CMDQ_BASE_OPCODE_DEINITIALIZE_FW,
+			     sizeof(req));
+	bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL,
+			    sizeof(req), sizeof(resp), 0);
+	rc = bng_re_rcfw_send_message(rcfw, &msg);
+	if (rc)
+		return rc;
+
+	clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
+	return 0;
+}
+static inline bool _is_hw_retx_supported(u16 dev_cap_flags)
+{
+	return dev_cap_flags &
+	       (CREQ_QUERY_FUNC_RESP_SB_HW_REQUESTER_RETX_ENABLED |
+		CREQ_QUERY_FUNC_RESP_SB_HW_RESPONDER_RETX_ENABLED);
+}
+
+#define BNG_RE_HW_RETX(a) _is_hw_retx_supported((a))
+static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2)
+{
+	return dev_cap_ext_flags2 &
+	       CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED;
+}
+
+int bng_re_init_rcfw(struct bng_re_rcfw *rcfw,
+		     struct bng_re_stats *stats_ctx)
+{
+	struct creq_initialize_fw_resp resp = {};
+	struct cmdq_initialize_fw req = {};
+	struct bng_re_cmdqmsg msg = {};
+	int rc;
+	u16 flags = 0;
+
+	bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+			     CMDQ_BASE_OPCODE_INITIALIZE_FW,
+			     sizeof(req));
+	/* Supply (log-base-2-of-host-page-size - base-page-shift)
+	 * to bono to adjust the doorbell page sizes.
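+	 * For example, with 4K kernel pages (PAGE_SHIFT = 12) and the
+	 * 4K doorbell base page (BNG_FW_DBR_BASE_PAGE_SHIFT = 12) this
+	 * evaluates to 0; with 64K kernel pages it evaluates to 4.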
+ */ + req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT - + BNG_FW_DBR_BASE_PAGE_SHIFT); + if (BNG_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED; + if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; + req.flags |= cpu_to_le16(flags); + req.stat_ctx_id = cpu_to_le32(stats_ctx->fw_id); + bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + return rc; + set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags); + return 0; +} diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h index 88476d6c1d07..c89c926ec2fc 100644 --- a/drivers/infiniband/hw/bng_re/bng_fw.h +++ b/drivers/infiniband/hw/bng_re/bng_fw.h @@ -10,6 +10,7 @@ #define BNG_FW_CMDQ_TRIG_VAL 1 #define BNG_FW_COMM_PCI_BAR_REGION 0 #define BNG_FW_COMM_CONS_PCI_BAR_REGION 2 +#define BNG_FW_DBR_BASE_PAGE_SHIFT 12 #define BNG_FW_COMM_SIZE 0x104 #define BNG_FW_COMM_BASE_OFFSET 0x600 #define BNG_FW_COMM_TRIG_OFFSET 0x100 @@ -204,4 +205,7 @@ int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector, void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill); int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw, struct bng_re_cmdqmsg *msg); +int bng_re_init_rcfw(struct bng_re_rcfw *rcfw, + struct bng_re_stats *stats_ctx); +int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw); #endif diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h index 2faeb9d3e6e3..dae4862621a7 100644 --- a/drivers/infiniband/hw/bng_re/bng_re.h +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -16,6 +16,8 @@ #define BNG_RE_MAX_MSIX BNGE_MAX_ROCE_MSIX #define BNG_RE_CREQ_NQ_IDX 0 + +#define BNGE_INVALID_STATS_CTX_ID -1 /* NQ specific structures */ struct bng_re_nq_db { struct bng_re_reg_desc reg; @@ -65,6 +67,7 @@ struct bng_re_dev { struct ib_device ibdev; unsigned long flags; #define BNG_RE_FLAG_NETDEV_REGISTERED 0 +#define BNG_RE_FLAG_RCFW_CHANNEL_EN 1 struct net_device *netdev; struct auxiliary_device *adev; struct bnge_auxr_dev *aux_dev; @@ -76,6 +79,7 @@ struct bng_re_dev { /* Device Resources */ struct bng_re_dev_attr *dev_attr; struct dentry *dbg_root; + struct bng_re_stats stats_ctx; }; #endif diff --git a/drivers/infiniband/hw/bng_re/bng_res.c b/drivers/infiniband/hw/bng_re/bng_res.c index cb42c0fd2cdf..c50823758b53 100644 --- a/drivers/infiniband/hw/bng_re/bng_res.c +++ b/drivers/infiniband/hw/bng_re/bng_res.c @@ -9,6 +9,33 @@ #include "bng_res.h" #include "roce_hsi.h" +/* Stats */ +void bng_re_free_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_stats *stats) +{ + if (stats->dma) { + dma_free_coherent(&pdev->dev, stats->size, + stats->dma, stats->dma_map); + } + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; +} + +int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_chip_ctx *cctx, + struct bng_re_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; + stats->size = cctx->hw_stats_size; + stats->dma = dma_alloc_coherent(&pdev->dev, stats->size, + &stats->dma_map, GFP_KERNEL); + if (!stats->dma) + return -ENOMEM; + + return 0; +} + static void bng_free_pbl(struct bng_re_res *res, struct bng_re_pbl *pbl) { struct pci_dev *pdev = res->pdev; diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h index 7315db347aa6..9997f86d6a0e 100644 --- a/drivers/infiniband/hw/bng_re/bng_res.h 
+++ b/drivers/infiniband/hw/bng_re/bng_res.h @@ -125,6 +125,13 @@ struct bng_re_hwq { u16 qe_ppg; }; +struct bng_re_stats { + dma_addr_t dma_map; + void *dma; + u32 size; + u32 fw_id; +}; + struct bng_re_res { struct pci_dev *pdev; struct bng_re_chip_ctx *cctx; @@ -198,4 +205,11 @@ void bng_re_free_hwq(struct bng_re_res *res, int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq, struct bng_re_hwq_attr *hwq_attr); + +void bng_re_free_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_stats *stats); + +int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_chip_ctx *cctx, + struct bng_re_stats *stats); #endif From cdb3a6f1833ae4ab729d07dceee59e0fba213c24 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 12 Nov 2025 17:35:03 +0800 Subject: [PATCH 40/65] RDMA/hns: Add helpers to obtain netdev and bus_num from hr_dev Add helpers to obtain netdev and bus_num from hr_dev. Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20251112093510.3696363-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_ah.c | 1 - drivers/infiniband/hw/hns/hns_roce_device.h | 12 ++++++++++++ drivers/infiniband/hw/hns/hns_roce_main.c | 19 ++++++++++--------- drivers/infiniband/hw/hns/hns_roce_pd.c | 1 - drivers/infiniband/hw/hns/hns_roce_qp.c | 5 +++-- drivers/infiniband/hw/hns/hns_roce_srq.c | 1 - 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 307c35888b30..0c1c32d23c88 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include #include #include "hns_roce_device.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 78ee04a48a74..5ae37832059f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -33,6 +33,7 @@ #ifndef _HNS_ROCE_DEVICE_H #define _HNS_ROCE_DEVICE_H +#include #include #include #include "hns_roce_debugfs.h" @@ -1165,6 +1166,17 @@ static inline u8 get_tclass(const struct ib_global_route *grh) grh->traffic_class >> DSCP_SHIFT : grh->traffic_class; } +static inline struct net_device *get_hr_netdev(struct hns_roce_dev *hr_dev, + u8 port) +{ + return hr_dev->iboe.netdevs[port]; +} + +static inline u8 get_hr_bus_num(struct hns_roce_dev *hr_dev) +{ + return hr_dev->pci_dev->bus->number; +} + void hns_roce_init_uar_table(struct hns_roce_dev *dev); int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar); diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d50f36f8a110..8bca0b10c69e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -32,7 +32,6 @@ */ #include #include -#include #include #include #include @@ -148,12 +147,13 @@ static int hns_roce_netdev_event(struct notifier_block *self, static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev) { + struct net_device *net_dev; int ret; u8 i; for (i = 0; i < hr_dev->caps.num_ports; i++) { - ret = hns_roce_set_mac(hr_dev, i, - hr_dev->iboe.netdevs[i]->dev_addr); + net_dev = get_hr_netdev(hr_dev, i); + ret = hns_roce_set_mac(hr_dev, i, net_dev->dev_addr); if (ret) return ret; } @@ -246,7 +246,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, spin_lock_irqsave(&hr_dev->iboe.lock, flags); - net_dev = hr_dev->iboe.netdevs[port]; + net_dev = 
get_hr_netdev(hr_dev, port); if (!net_dev) { spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); dev_err(dev, "find netdev %u failed!\n", port); @@ -704,11 +704,12 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = { static int hns_roce_register_device(struct hns_roce_dev *hr_dev) { - int ret; struct hns_roce_ib_iboe *iboe = NULL; - struct ib_device *ib_dev = NULL; struct device *dev = hr_dev->dev; + struct ib_device *ib_dev = NULL; + struct net_device *net_dev; unsigned int i; + int ret; iboe = &hr_dev->iboe; spin_lock_init(&iboe->lock); @@ -744,11 +745,11 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, &hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops); for (i = 0; i < hr_dev->caps.num_ports; i++) { - if (!hr_dev->iboe.netdevs[i]) + net_dev = get_hr_netdev(hr_dev, i); + if (!net_dev) continue; - ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i], - i + 1); + ret = ib_device_set_netdev(ib_dev, net_dev, i + 1); if (ret) return ret; } diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index d35cf59d0f43..225c3e328e0e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include "hns_roce_device.h" void hns_roce_init_pd_table(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 6ff1b8ce580c..e0e28c4ff1ca 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include #include #include #include @@ -1350,11 +1349,13 @@ static int check_mtu_validate(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_attr *attr, int attr_mask) { + struct net_device *net_dev; enum ib_mtu active_mtu; int p; p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port; - active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu); + net_dev = get_hr_netdev(hr_dev, p); + active_mtu = iboe_get_mtu(net_dev->mtu); if ((hr_dev->caps.max_mtu >= IB_MTU_2048 && attr->path_mtu > hr_dev->caps.max_mtu) || diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 1090051f493b..8a6efb6b9c9e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -3,7 +3,6 @@ * Copyright (c) 2018 Hisilicon Limited. */ -#include #include #include #include "hns_roce_device.h" From b37ad2e290fc52e575572b04803a4f93f584df6c Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 12 Nov 2025 17:35:04 +0800 Subject: [PATCH 41/65] RDMA/hns: Initialize bonding resources Allocate bond_grp resources for each card when the first device in this card is registered. Block the initialization of VF when its PF is a bonded slave, as VF is not supported in this case due to HW constraints. 
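Whether a given device already participates in a tracked bond can later
be answered from this bookkeeping; a hypothetical helper, shown only to
illustrate the lookup path (the real consumers arrive with the event
handlers), could look like:

static bool hns_roce_is_bond_slave(struct hns_roce_dev *hr_dev)
{
	struct net_device *net_dev = get_hr_netdev(hr_dev, 0);

	/* hns_roce_get_bond_grp() returns NULL unless net_dev is a
	 * recorded slave of, or sits under the upper device of, a
	 * bond_grp on this die (keyed by PCI bus number).
	 */
	return !!hns_roce_get_bond_grp(net_dev, get_hr_bus_num(hr_dev));
}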
Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20251112093510.3696363-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/Makefile | 4 +- drivers/infiniband/hw/hns/hns_roce_bond.c | 192 ++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_bond.h | 38 ++++ drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 + drivers/infiniband/hw/hns/hns_roce_main.c | 11 ++ 6 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.h diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index baf592e6f21b..d07ef02c5231 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -4,11 +4,13 @@ # ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 +ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3pf +ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3_common ccflags-y += -I $(src) hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_debugfs.o hns_roce_hw_v2.o + hns_roce_debugfs.o hns_roce_hw_v2.o hns_roce_bond.o obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c new file mode 100644 index 000000000000..918c1382fa65 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 Hisilicon Limited. + */ + +#include "hns_roce_device.h" +#include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" + +static DEFINE_XARRAY(roce_bond_xa); + +static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev) +{ + struct net_device *upper_dev; + + rcu_read_lock(); + upper_dev = netdev_master_upper_dev_get_rcu(net_dev); + dev_hold(upper_dev); + rcu_read_unlock(); + + return upper_dev; +} + +static int get_netdev_bond_slave_id(struct net_device *net_dev, + struct hns_roce_bond_group *bond_grp) +{ + int i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) + if (net_dev == bond_grp->bond_func_info[i].net_dev) + return i; + + return -ENOENT; +} + +struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, + u8 bus_num) +{ + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + struct hns_roce_bond_group *bond_grp; + struct net_device *upper_dev = NULL; + int i; + + if (!die_info) + return NULL; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + return bond_grp; + if (bond_grp->upper_dev) { + upper_dev = get_upper_dev_from_ndev(net_dev); + if (bond_grp->upper_dev == upper_dev) { + dev_put(upper_dev); + return bond_grp; + } + dev_put(upper_dev); + } + } + + return NULL; +} + +static struct hns_roce_die_info *alloc_die_info(int bus_num) +{ + struct hns_roce_die_info *die_info; + int ret; + + die_info = kzalloc(sizeof(*die_info), GFP_KERNEL); + if (!die_info) + return NULL; + + ret = xa_err(xa_store(&roce_bond_xa, bus_num, die_info, GFP_KERNEL)); + if (ret) { + kfree(die_info); + return NULL; + } + + return die_info; +} + +static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num) +{ + 
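/* Reached only from remove_bond_id(), once the die's
+	 * bond_id_mask has been completely cleared.
+	 */
+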
xa_erase(&roce_bond_xa, bus_num); + kfree(die_info); +} + +static int alloc_bond_id(struct hns_roce_bond_group *bond_grp) +{ + u8 bus_num = bond_grp->bus_num; + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + int i; + + if (!die_info) { + die_info = alloc_die_info(bus_num); + if (!die_info) + return -ENOMEM; + } + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + if (die_info->bond_id_mask & BOND_ID(i)) + continue; + + die_info->bond_id_mask |= BOND_ID(i); + die_info->bgrps[i] = bond_grp; + bond_grp->bond_id = i; + + return 0; + } + + return -ENOSPC; +} + +static int remove_bond_id(int bus_num, u8 bond_id) +{ + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + + if (bond_id >= ROCE_BOND_NUM_MAX) + return -EINVAL; + + if (!die_info) + return -ENODEV; + + die_info->bond_id_mask &= ~BOND_ID(bond_id); + die_info->bgrps[bond_id] = NULL; + if (!die_info->bond_id_mask) + dealloc_die_info(die_info, bus_num); + + return 0; +} + +int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX]; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + int ret; + int i; + + if (xa_load(&roce_bond_xa, bus_num)) + return 0; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = kvzalloc(sizeof(*bond_grp), GFP_KERNEL); + if (!bond_grp) { + ret = -ENOMEM; + goto mem_err; + } + + bond_grp->bus_num = bus_num; + + ret = alloc_bond_id(bond_grp); + if (ret) { + dev_err(hr_dev->dev, + "failed to alloc bond ID, ret = %d.\n", ret); + goto alloc_id_err; + } + + bgrps[i] = bond_grp; + } + + return 0; + +alloc_id_err: + kvfree(bond_grp); +mem_err: + for (i--; i >= 0; i--) { + remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id); + kvfree(bgrps[i]); + } + return ret; +} + +void hns_roce_dealloc_bond_grp(void) +{ + struct hns_roce_bond_group *bond_grp; + struct hns_roce_die_info *die_info; + unsigned long id; + int i; + + xa_for_each(&roce_bond_xa, id, die_info) { + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); + kvfree(bond_grp); + } + } +} diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h new file mode 100644 index 000000000000..61c52135588e --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 Hisilicon Limited. 
+ */ + +#ifndef _HNS_ROCE_BOND_H +#define _HNS_ROCE_BOND_H + +#include +#include + +#define ROCE_BOND_FUNC_MAX 4 +#define ROCE_BOND_NUM_MAX 2 + +#define BOND_ID(id) BIT(id) + +struct hns_roce_func_info { + struct net_device *net_dev; +}; + +struct hns_roce_bond_group { + struct net_device *upper_dev; + u8 bond_id; + u8 bus_num; + struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX]; +}; + +struct hns_roce_die_info { + u8 bond_id_mask; + struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX]; +}; + +struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, + u8 bus_num); +int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev); +void hns_roce_dealloc_bond_grp(void); + +#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 5ae37832059f..cc1402fc8943 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -154,6 +154,7 @@ enum { HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), HNS_ROCE_CAP_FLAG_STASH = BIT(17), HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), + HNS_ROCE_CAP_FLAG_BOND = BIT(21), HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22), }; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f82bdd46a917..6750bad9bb53 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -43,11 +43,13 @@ #include #include +#include "hclge_main.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" #define CREATE_TRACE_POINTS #include "hns_roce_trace.h" @@ -2270,6 +2272,9 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev) caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) << HNS_ROCE_CAP_FLAGS_EX_SHIFT; + if (hr_dev->is_vf) + caps->flags &= ~HNS_ROCE_CAP_FLAG_BOND; + caps->num_cqs = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_CQS); caps->gid_table_len[0] = hr_reg_read(resp_c, PF_CAPS_C_MAX_GID); caps->max_cqes = 1 << hr_reg_read(resp_c, PF_CAPS_C_CQ_DEPTH); @@ -7260,6 +7265,7 @@ static int __init hns_roce_hw_v2_init(void) static void __exit hns_roce_hw_v2_exit(void) { + hns_roce_dealloc_bond_grp(); hnae3_unregister_client(&hns_roce_hw_v2_client); hns_roce_cleanup_debugfs(); } diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 8bca0b10c69e..7fa25586ccd8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -40,6 +40,7 @@ #include "hns_roce_device.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -744,6 +745,16 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops); + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + ret = hns_roce_alloc_bond_grp(hr_dev); + if (ret) { + dev_err(dev, "failed to alloc bond_grp for bus %u, ret = %d\n", + get_hr_bus_num(hr_dev), ret); + return ret; + } + } + for (i = 0; i < hr_dev->caps.num_ports; i++) { net_dev = get_hr_netdev(hr_dev, i); if (!net_dev) From d31d410b38e61f89b978d4e1364040537339f559 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 12 Nov 2025 17:35:05 +0800 Subject: [PATCH 42/65] RDMA/hns: Add bonding event handler 
Register netdev notifier for two bonding events NETDEV_CHANGEUPPER
and NETDEV_CHANGELOWERSTATE.

In the NETDEV_CHANGEUPPER event handler, check some rules about the HW
constraints when trying to link a new slave to the master, and store
some bonding information from the notifier. In the unlinking case,
simply check the number of the remaining slaves to decide whether the
bond is still supported.

In the NETDEV_CHANGELOWERSTATE event handler, not much is done. It
simply sets the bond state when the bond is ready; this state will be
used later.

Signed-off-by: Junxian Huang
Link: https://patch.msgid.link/20251112093510.3696363-4-huangjunxian6@hisilicon.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/hns/hns_roce_bond.c | 314 ++++++++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_bond.h | 26 ++
 2 files changed, 340 insertions(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
index 918c1382fa65..ce3646ed3fbf 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.c
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -9,6 +9,17 @@
 
 static DEFINE_XARRAY(roce_bond_xa);
 
+static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev)
+{
+	struct ib_device *ibdev =
+		ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS);
+
+	if (!ibdev)
+		return NULL;
+
+	return container_of(ibdev, struct hns_roce_dev, ib_dev);
+}
+
 static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev)
 {
 	struct net_device *upper_dev;
@@ -131,6 +142,291 @@ static int remove_bond_id(int bus_num, u8 bond_id)
 	return 0;
 }
 
+static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp,
+				  struct net_device *net_dev)
+{
+	struct hns_roce_dev *hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+	bool ret = true;
+
+	if (!hr_dev) {
+		if (bond_grp &&
+		    get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+			return true;
+		else
+			return false;
+	}
+
+	if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) {
+		ret = false;
+		goto out;
+	}
+
+	if (hr_dev->is_vf || pci_num_vf(hr_dev->pci_dev) > 0) {
+		ret = false;
+		goto out;
+	}
+
+	if (bond_grp->bus_num != get_hr_bus_num(hr_dev))
+		ret = false;
+
+out:
+	ib_device_put(&hr_dev->ib_dev);
+	return ret;
+}
+
+static bool check_slave_support(struct hns_roce_bond_group *bond_grp,
+				struct net_device *upper_dev)
+{
+	struct net_device *net_dev;
+	u8 slave_num = 0;
+
+	rcu_read_lock();
+	for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+		if (is_dev_bond_supported(bond_grp, net_dev)) {
+			slave_num++;
+			continue;
+		}
+		rcu_read_unlock();
+		return false;
+	}
+	rcu_read_unlock();
+
+	return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX);
+}
+
+static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp,
+				     struct hns_roce_dev *hr_dev,
+				     struct net_device *upper_dev)
+{
+	bond_grp->upper_dev = upper_dev;
+	bond_grp->main_hr_dev = hr_dev;
+	bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+	bond_grp->bond_ready = false;
+}
+
+static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp,
+				    struct net_device *net_dev)
+{
+	struct hns_roce_bond_group *bond_grp_tmp;
+
+	bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bond_grp->bus_num);
+	return bond_grp_tmp == bond_grp;
+}
+
+static void lowerstate_event_setting(struct hns_roce_bond_group *bond_grp,
+				     struct netdev_notifier_changelowerstate_info *info)
+{
+	mutex_lock(&bond_grp->bond_mutex);
+
+	if (bond_grp->bond_ready &&
+	    bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED)
+		bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGESTATE;
+
+	
mutex_unlock(&bond_grp->bond_mutex); +} + +static bool hns_roce_bond_lowerstate_event(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changelowerstate_info *info) +{ + struct net_device *net_dev = + netdev_notifier_info_to_dev((struct netdev_notifier_info *)info); + + if (!netif_is_lag_port(net_dev)) + return false; + + if (!lowerstate_event_filter(bond_grp, net_dev)) + return false; + + lowerstate_event_setting(bond_grp, info); + + return true; +} + +static bool is_bond_setting_supported(struct netdev_lag_upper_info *bond_info) +{ + if (!bond_info) + return false; + + if (bond_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + bond_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) + return false; + + if (bond_info->tx_type == NETDEV_LAG_TX_TYPE_HASH && + bond_info->hash_type > NETDEV_LAG_HASH_L23) + return false; + + return true; +} + +static void upper_event_setting(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changeupper_info *info) +{ + struct netdev_lag_upper_info *bond_upper_info = NULL; + bool slave_inc = info->linking; + + if (slave_inc) + bond_upper_info = info->upper_info; + + if (bond_upper_info) { + bond_grp->tx_type = bond_upper_info->tx_type; + bond_grp->hash_type = bond_upper_info->hash_type; + } +} + +static bool check_unlinking_bond_support(struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + u8 slave_num = 0; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(bond_grp->upper_dev, net_dev) { + if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + slave_num++; + } + rcu_read_unlock(); + + return (slave_num > 1); +} + +static bool check_linking_bond_support(struct netdev_lag_upper_info *bond_info, + struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev) +{ + if (!is_bond_setting_supported(bond_info)) + return false; + + return check_slave_support(bond_grp, upper_dev); +} + +static enum bond_support_type + check_bond_support(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev, + struct netdev_notifier_changeupper_info *info) +{ + bool bond_grp_exist = false; + bool support; + + if (upper_dev == bond_grp->upper_dev) + bond_grp_exist = true; + + if (!info->linking && !bond_grp_exist) + return BOND_NOT_SUPPORT; + + if (info->linking) + support = check_linking_bond_support(info->upper_info, bond_grp, + upper_dev); + else + support = check_unlinking_bond_support(bond_grp); + + if (support) + return BOND_SUPPORT; + + return bond_grp_exist ? 
BOND_EXISTING_NOT_SUPPORT : BOND_NOT_SUPPORT; +} + +static bool upper_event_filter(struct netdev_notifier_changeupper_info *info, + struct hns_roce_bond_group *bond_grp, + struct net_device *net_dev) +{ + struct net_device *upper_dev = info->upper_dev; + struct hns_roce_bond_group *bond_grp_tmp; + struct hns_roce_dev *hr_dev; + bool ret = true; + u8 bus_num; + + if (!info->linking || + bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED) + return bond_grp->upper_dev == upper_dev; + + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) + return false; + + bus_num = get_hr_bus_num(hr_dev); + if (bond_grp->bus_num != bus_num) { + ret = false; + goto out; + } + + bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp_tmp && bond_grp_tmp != bond_grp) + ret = false; +out: + ib_device_put(&hr_dev->ib_dev); + return ret; +} + +static bool hns_roce_bond_upper_event(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *net_dev = + netdev_notifier_info_to_dev((struct netdev_notifier_info *)info); + struct net_device *upper_dev = info->upper_dev; + enum bond_support_type support = BOND_SUPPORT; + struct hns_roce_dev *hr_dev; + int slave_id; + + if (!upper_dev || !netif_is_lag_master(upper_dev)) + return false; + + if (!upper_event_filter(info, bond_grp, net_dev)) + return false; + + mutex_lock(&bond_grp->bond_mutex); + support = check_bond_support(bond_grp, upper_dev, info); + if (support == BOND_NOT_SUPPORT) { + mutex_unlock(&bond_grp->bond_mutex); + return false; + } + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_ATTACHED) { + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) { + mutex_unlock(&bond_grp->bond_mutex); + return false; + } + hns_roce_attach_bond_grp(bond_grp, hr_dev, upper_dev); + ib_device_put(&hr_dev->ib_dev); + } + + /* In the case of netdev being unregistered, the roce + * instance shouldn't be inited. 
+ */ + if (net_dev->reg_state >= NETREG_UNREGISTERING) { + slave_id = get_netdev_bond_slave_id(net_dev, bond_grp); + if (slave_id >= 0) { + bond_grp->bond_func_info[slave_id].net_dev = NULL; + bond_grp->bond_func_info[slave_id].handle = NULL; + } + } + + if (support == BOND_SUPPORT) { + bond_grp->bond_ready = true; + if (bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED) + bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGE_NUM; + } + mutex_unlock(&bond_grp->bond_mutex); + if (support == BOND_SUPPORT) + upper_event_setting(bond_grp, info); + + return true; +} + +static int hns_roce_bond_event(struct notifier_block *self, + unsigned long event, void *ptr) +{ + struct hns_roce_bond_group *bond_grp = + container_of(self, struct hns_roce_bond_group, bond_nb); + bool changed = false; + + if (event == NETDEV_CHANGEUPPER) + changed = hns_roce_bond_upper_event(bond_grp, ptr); + if (event == NETDEV_CHANGELOWERSTATE) + changed = hns_roce_bond_lowerstate_event(bond_grp, ptr); + + return NOTIFY_DONE; +} + int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev) { struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX]; @@ -149,6 +445,10 @@ int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev) goto mem_err; } + mutex_init(&bond_grp->bond_mutex); + + bond_grp->bond_ready = false; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED; bond_grp->bus_num = bus_num; ret = alloc_bond_id(bond_grp); @@ -158,16 +458,28 @@ int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev) goto alloc_id_err; } + bond_grp->bond_nb.notifier_call = hns_roce_bond_event; + ret = register_netdevice_notifier(&bond_grp->bond_nb); + if (ret) { + ibdev_err(&hr_dev->ib_dev, + "failed to register bond nb, ret = %d.\n", ret); + goto register_nb_err; + } bgrps[i] = bond_grp; } return 0; +register_nb_err: + remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); alloc_id_err: + mutex_destroy(&bond_grp->bond_mutex); kvfree(bond_grp); mem_err: for (i--; i >= 0; i--) { + unregister_netdevice_notifier(&bgrps[i]->bond_nb); remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id); + mutex_destroy(&bgrps[i]->bond_mutex); kvfree(bgrps[i]); } return ret; @@ -185,7 +497,9 @@ void hns_roce_dealloc_bond_grp(void) bond_grp = die_info->bgrps[i]; if (!bond_grp) continue; + unregister_netdevice_notifier(&bond_grp->bond_nb); remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); + mutex_destroy(&bond_grp->bond_mutex); kvfree(bond_grp); } } diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h index 61c52135588e..a11de04c42e9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.h +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -14,15 +14,41 @@ #define BOND_ID(id) BIT(id) +enum bond_support_type { + BOND_NOT_SUPPORT, + /* + * bond_grp already exists, but in the current + * conditions it's no longer supported + */ + BOND_EXISTING_NOT_SUPPORT, + BOND_SUPPORT, +}; + +enum hns_roce_bond_state { + HNS_ROCE_BOND_NOT_ATTACHED, + HNS_ROCE_BOND_NOT_BONDED, + HNS_ROCE_BOND_IS_BONDED, + HNS_ROCE_BOND_SLAVE_CHANGE_NUM, + HNS_ROCE_BOND_SLAVE_CHANGESTATE, +}; + struct hns_roce_func_info { struct net_device *net_dev; + struct hnae3_handle *handle; }; struct hns_roce_bond_group { struct net_device *upper_dev; + struct hns_roce_dev *main_hr_dev; u8 bond_id; u8 bus_num; struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX]; + bool bond_ready; + enum hns_roce_bond_state bond_state; + enum netdev_lag_tx_type tx_type; + enum netdev_lag_hash hash_type; + struct mutex bond_mutex; + struct notifier_block bond_nb; }; struct 
hns_roce_die_info { From 14f0455e4a61112583e61206a3d048cbecd7df86 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 12 Nov 2025 17:35:06 +0800 Subject: [PATCH 43/65] RDMA/hns: Add bonding cmds Add three bonding cmds to configure bonding settings to HW. Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20251112093510.3696363-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_bond.h | 20 ++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 73 ++++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 15 +++++ 3 files changed, 108 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h index a11de04c42e9..84c94cbc397d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.h +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -14,6 +14,17 @@ #define BOND_ID(id) BIT(id) +enum { + BOND_MODE_1, + BOND_MODE_2_4, +}; + +enum hns_roce_bond_hashtype { + BOND_HASH_L2, + BOND_HASH_L34, + BOND_HASH_L23, +}; + enum bond_support_type { BOND_NOT_SUPPORT, /* @@ -32,6 +43,12 @@ enum hns_roce_bond_state { HNS_ROCE_BOND_SLAVE_CHANGESTATE, }; +enum hns_roce_bond_cmd_type { + HNS_ROCE_SET_BOND, + HNS_ROCE_CHANGE_BOND, + HNS_ROCE_CLEAR_BOND, +}; + struct hns_roce_func_info { struct net_device *net_dev; struct hnae3_handle *handle; @@ -40,6 +57,9 @@ struct hns_roce_func_info { struct hns_roce_bond_group { struct net_device *upper_dev; struct hns_roce_dev *main_hr_dev; + u8 active_slave_num; + u32 slave_map; + u32 active_slave_map; u8 bond_id; u8 bus_num; struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX]; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6750bad9bb53..2952684011a9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1431,6 +1431,79 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, return ret; } +static enum hns_roce_opcode_type + get_bond_opcode(enum hns_roce_bond_cmd_type bond_type) +{ + switch (bond_type) { + case HNS_ROCE_SET_BOND: + return HNS_ROCE_OPC_SET_BOND_INFO; + case HNS_ROCE_CHANGE_BOND: + return HNS_ROCE_OPC_CHANGE_ACTIVE_PORT; + case HNS_ROCE_CLEAR_BOND: + return HNS_ROCE_OPC_CLEAR_BOND_INFO; + default: + WARN(true, "Invalid bond type %d!\n", bond_type); + return HNS_ROCE_OPC_SET_BOND_INFO; + } +} + +static enum hns_roce_bond_hashtype + get_bond_hashtype(enum netdev_lag_hash netdev_hashtype) +{ + switch (netdev_hashtype) { + case NETDEV_LAG_HASH_L2: + return BOND_HASH_L2; + case NETDEV_LAG_HASH_L34: + return BOND_HASH_L34; + case NETDEV_LAG_HASH_L23: + return BOND_HASH_L23; + default: + WARN(true, "Invalid hash type %d!\n", netdev_hashtype); + return BOND_HASH_L2; + } +} + +int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, + enum hns_roce_bond_cmd_type bond_type) +{ + enum hns_roce_opcode_type opcode = get_bond_opcode(bond_type); + struct hns_roce_bond_info *slave_info; + struct hns_roce_cmq_desc desc = {}; + int ret; + + slave_info = (struct hns_roce_bond_info *)desc.data; + hns_roce_cmq_setup_basic_desc(&desc, opcode, false); + + slave_info->bond_id = cpu_to_le32(bond_grp->bond_id); + if (bond_type == HNS_ROCE_CLEAR_BOND) + goto out; + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + slave_info->bond_mode = cpu_to_le32(BOND_MODE_1); + if (bond_grp->active_slave_num != 1) + ibdev_warn(&bond_grp->main_hr_dev->ib_dev, + "active slave cnt(%u) in Mode 1 is invalid.\n", + bond_grp->active_slave_num); + } 
else {
+		slave_info->bond_mode = cpu_to_le32(BOND_MODE_2_4);
+		slave_info->hash_policy =
+			cpu_to_le32(get_bond_hashtype(bond_grp->hash_type));
+	}
+
+	slave_info->active_slave_cnt = cpu_to_le32(bond_grp->active_slave_num);
+	slave_info->active_slave_mask = cpu_to_le32(bond_grp->active_slave_map);
+	slave_info->slave_mask = cpu_to_le32(bond_grp->slave_map);
+
+out:
+	ret = hns_roce_cmq_send(bond_grp->main_hr_dev, &desc, 1);
+	if (ret)
+		ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+			  "cmq bond type(%d) failed, ret = %d.\n",
+			  bond_type, ret);
+
+	return ret;
+}
+
 static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev,
 			       dma_addr_t base_addr, u8 cmd, unsigned long tag)
 {
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index e64a04d6f85b..82cec4b38c92 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -35,6 +35,7 @@
 #include
 
 #include "hnae3.h"
+#include "hns_roce_bond.h"
 
 #define HNS_ROCE_V2_MAX_RC_INL_INN_SZ	32
 #define HNS_ROCE_V2_MTT_ENTRY_SZ	64
@@ -228,6 +229,9 @@ enum hns_roce_opcode_type {
 	HNS_ROCE_OPC_CFG_GMV_BT = 0x8510,
 	HNS_ROCE_QUERY_RAM_ECC = 0x8513,
 	HNS_SWITCH_PARAMETER_CFG = 0x1033,
+	HNS_ROCE_OPC_SET_BOND_INFO = 0x8601,
+	HNS_ROCE_OPC_CLEAR_BOND_INFO = 0x8602,
+	HNS_ROCE_OPC_CHANGE_ACTIVE_PORT = 0x8603,
 };
 
 #define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000
@@ -1465,7 +1469,18 @@ struct hns_roce_sccc_clr_done {
 	__le32 rsv[5];
 };
 
+struct hns_roce_bond_info {
+	__le32 bond_id;
+	__le32 bond_mode;
+	__le32 active_slave_cnt;
+	__le32 active_slave_mask;
+	__le32 slave_mask;
+	__le32 hash_policy;
+};
+
 int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+		      enum hns_roce_bond_cmd_type bond_type);
 
 static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
 				    void __iomem *dest)

From d9023e461b73e2918c689e09b2ecf72389632da3 Mon Sep 17 00:00:00 2001
From: Junxian Huang
Date: Wed, 12 Nov 2025 17:35:07 +0800
Subject: [PATCH 44/65] RDMA/hns: Implement bonding init/uninit process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement hns_roce_slave_init() and hns_roce_slave_uninit() for device
init/uninit in bonding cases. The former is used to initialize a slave
ibdev (when the slave is unlinked from a bond) or a bond ibdev, while
the latter does the opposite.

Most of the process is the same as regular device init/uninit, while
some bonding-specific steps below are also added.

In bond device init flow, choose one slave to re-initialize as the
main_hr_dev of the bond, and it will be the only device presented for
multiple slaves. During registration, set an active netdev to the ibdev
based on the link state of the slaves. When this main_hr_dev slave is
being unlinked while the bond is still valid, choose a new slave from
the rest and initialize it as the new bond device.

In uninit flow, add a bond cleanup process to restore all the other
slaves and clean up the bond resource. This is only for the case where
the port of main_hr_dev is directly removed without unlinking it from
the bond.
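
In outline, the hand-off to a new main device works like the standalone
sketch below. This is a simplified model, not driver code: struct
toy_slave, pick_new_main() and the init_one() callback are illustrative
stand-ins for the per-function info, switch_main_dev() and
hns_roce_slave_init() respectively.

	#include <stdbool.h>

	#define ROCE_BOND_FUNC_MAX 4

	struct toy_slave { bool present; };

	/* Walk the slave bitmap and promote the first slave that
	 * initializes successfully as the new main device. */
	static int pick_new_main(struct toy_slave *slaves, unsigned int slave_map,
				 bool (*init_one)(struct toy_slave *))
	{
		int i;

		for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
			if (!(slave_map & (1U << i)) || !slaves[i].present)
				continue;
			if (init_one(&slaves[i]))
				return i;	/* becomes the new main device */
		}

		return -1;	/* no usable slave; -ENODEV in the driver */
	}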
Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20251112093510.3696363-6-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_bond.c | 178 ++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_bond.h | 6 + drivers/infiniband/hw/hns/hns_roce_device.h | 3 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 41 ++++- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 + drivers/infiniband/hw/hns/hns_roce_main.c | 66 ++++++-- 6 files changed, 282 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c index ce3646ed3fbf..e09fe7ef16ee 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.c +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -3,6 +3,7 @@ * Copyright (c) 2025 Hisilicon Limited. */ +#include #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" #include "hns_roce_bond.h" @@ -74,6 +75,143 @@ struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, return NULL; } +static int hns_roce_set_bond_netdev(struct hns_roce_bond_group *bond_grp, + struct hns_roce_dev *hr_dev) +{ + struct net_device *active_dev; + struct net_device *old_dev; + int i, ret = 0; + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + rcu_read_lock(); + active_dev = + bond_option_active_slave_get_rcu(netdev_priv(bond_grp->upper_dev)); + rcu_read_unlock(); + } else { + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + active_dev = bond_grp->bond_func_info[i].net_dev; + if (active_dev && + ib_get_curr_port_state(active_dev) == IB_PORT_ACTIVE) + break; + } + } + + if (!active_dev || i == ROCE_BOND_FUNC_MAX) + active_dev = get_hr_netdev(hr_dev, 0); + + old_dev = ib_device_get_netdev(&hr_dev->ib_dev, 1); + if (old_dev == active_dev) + goto out; + + ret = ib_device_set_netdev(&hr_dev->ib_dev, active_dev, 1); + if (ret) { + dev_err(hr_dev->dev, "failed to set netdev for bond.\n"); + goto out; + } + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + if (old_dev) + roce_del_all_netdev_gids(&hr_dev->ib_dev, 1, old_dev); + rdma_roce_rescan_port(&hr_dev->ib_dev, 1); + } +out: + dev_put(old_dev); + return ret; +} + +bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev) +{ + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED && + bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED) + return true; + + return false; +} + +static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp, + u8 func_idx) +{ + struct hnae3_handle *handle; + + handle = bond_grp->bond_func_info[func_idx].handle; + if (handle->priv) + hns_roce_bond_uninit_client(bond_grp, func_idx); +} + +static struct hns_roce_dev + *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp, + u8 func_idx, bool need_switch); + +static int switch_main_dev(struct hns_roce_bond_group *bond_grp, + u8 main_func_idx) +{ + struct hns_roce_dev *hr_dev; + struct net_device *net_dev; + u8 i; + + bond_grp->main_hr_dev = NULL; + hns_roce_bond_uninit_client(bond_grp, main_func_idx); + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if ((bond_grp->slave_map & (1U << i)) && net_dev) { + /* In case this slave is still being registered as + * a non-bonded PF, uninit it first and then re-init + * it as the main device. 
+ */ + hns_roce_slave_uninit(bond_grp, i); + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + + if (!bond_grp->main_hr_dev) + return -ENODEV; + + return 0; +} + +static struct hns_roce_dev + *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp, + u8 func_idx, bool need_switch) +{ + struct hns_roce_dev *hr_dev = NULL; + struct hnae3_handle *handle; + u8 main_func_idx; + int ret; + + if (need_switch) { + main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); + if (func_idx == main_func_idx) { + ret = switch_main_dev(bond_grp, main_func_idx); + if (ret == -ENODEV) + return NULL; + } + } + + handle = bond_grp->bond_func_info[func_idx].handle; + if (handle) { + if (handle->priv) + return handle->priv; + /* Prevent this device from being initialized as a bond device */ + if (need_switch) + bond_grp->bond_func_info[func_idx].net_dev = NULL; + hr_dev = hns_roce_bond_init_client(bond_grp, func_idx); + if (!hr_dev) + BOND_ERR_LOG("failed to init slave %u.\n", func_idx); + } + + return hr_dev; +} + static struct hns_roce_die_info *alloc_die_info(int bus_num) { struct hns_roce_die_info *die_info; @@ -204,6 +342,35 @@ static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp, bond_grp->bond_ready = false; } +static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp) +{ + mutex_lock(&bond_grp->bond_mutex); + + bond_grp->upper_dev = NULL; + bond_grp->main_hr_dev = NULL; + bond_grp->bond_ready = false; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED; + bond_grp->slave_map = 0; + memset(bond_grp->bond_func_info, 0, sizeof(bond_grp->bond_func_info)); + + mutex_unlock(&bond_grp->bond_mutex); +} + +void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp) +{ + int ret; + + ret = bond_grp->main_hr_dev ? + hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO; + if (ret) + BOND_ERR_LOG("failed to clear RoCE bond, ret = %d.\n", ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE clear bond finished!\n"); + + hns_roce_detach_bond_grp(bond_grp); +} + static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp, struct net_device *net_dev) { @@ -504,3 +671,14 @@ void hns_roce_dealloc_bond_grp(void) } } } + +int hns_roce_bond_init(struct hns_roce_dev *hr_dev) +{ + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + + return hns_roce_set_bond_netdev(bond_grp, hr_dev); +} diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h index 84c94cbc397d..3ef7d28379cc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.h +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -14,6 +14,9 @@ #define BOND_ID(id) BIT(id) +#define BOND_ERR_LOG(fmt, ...) 
\ + pr_err("HNS RoCE Bonding: " fmt, ##__VA_ARGS__) + enum { BOND_MODE_1, BOND_MODE_2_4, @@ -80,5 +83,8 @@ struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, u8 bus_num); int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev); void hns_roce_dealloc_bond_grp(void); +void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp); +bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev); +int hns_roce_bond_init(struct hns_roce_dev *hr_dev); #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index cc1402fc8943..0add49d9664b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -179,6 +179,7 @@ enum hns_roce_instance_state { HNS_ROCE_STATE_INIT, HNS_ROCE_STATE_INITED, HNS_ROCE_STATE_UNINIT, + HNS_ROCE_STATE_BOND_UNINIT, }; enum { @@ -1304,7 +1305,7 @@ void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn); void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type); void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev); int hns_roce_init(struct hns_roce_dev *hr_dev); -void hns_roce_exit(struct hns_roce_dev *hr_dev); +void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup); int hns_roce_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq); int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq); int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2952684011a9..99c9db255346 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7141,7 +7141,7 @@ static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) } static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, - bool reset) + bool reset, bool bond_cleanup) { struct hns_roce_dev *hr_dev = handle->priv; @@ -7153,7 +7153,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); - hns_roce_exit(hr_dev); + hns_roce_exit(hr_dev, bond_cleanup); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } @@ -7209,7 +7209,40 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT; - __hns_roce_hw_v2_uninit_instance(handle, reset); + __hns_roce_hw_v2_uninit_instance(handle, reset, true); + + handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; +} + +struct hns_roce_dev + *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, + int func_idx) +{ + struct hnae3_handle *handle; + int ret; + + handle = bond_grp->bond_func_info[func_idx].handle; + if (!handle || !handle->client) + return NULL; + + ret = hns_roce_hw_v2_init_instance(handle); + if (ret) + return NULL; + + return handle->priv; +} + +void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx) +{ + struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle; + + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) + return; + + handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT; + + __hns_roce_hw_v2_uninit_instance(handle, false, false); handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; } @@ -7278,7 +7311,7 @@ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle) handle->rinfo.reset_state = 
HNS_ROCE_STATE_RST_UNINIT; dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n"); msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY); - __hns_roce_hw_v2_uninit_instance(handle, false); + __hns_roce_hw_v2_uninit_instance(handle, false, false); return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 82cec4b38c92..285fe0875fac 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1478,6 +1478,11 @@ struct hns_roce_bond_info { __le32 hash_policy; }; +struct hns_roce_dev + *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, + int func_idx); +void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx); int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, enum hns_roce_bond_cmd_type bond_type); diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 7fa25586ccd8..4e8807a04298 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -614,9 +614,40 @@ static int hns_roce_get_hw_stats(struct ib_device *device, return num_counters; } -static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) +static void + hns_roce_unregister_bond_cleanup(struct hns_roce_dev *hr_dev, + struct hns_roce_bond_group *bond_grp) { + struct net_device *net_dev; + int i; + + /* To avoid the loss of other slave devices when main_hr_dev + * is unregistered, re-initialize the remaining slaves before + * the bond resources cleanup. + */ + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev && net_dev != get_hr_netdev(hr_dev, 0)) + hns_roce_bond_init_client(bond_grp, i); + } + + hns_roce_cleanup_bond(bond_grp); +} + +static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev, + bool bond_cleanup) +{ + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + + if (bond_cleanup && hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp) + hns_roce_unregister_bond_cleanup(hr_dev, bond_grp); + } hr_dev->active = false; unregister_netdevice_notifier(&iboe->nb); @@ -746,6 +777,8 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, &hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops); + dma_set_max_seg_size(dev, SZ_2G); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { ret = hns_roce_alloc_bond_grp(hr_dev); if (ret) { @@ -755,17 +788,26 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) } } - for (i = 0; i < hr_dev->caps.num_ports; i++) { - net_dev = get_hr_netdev(hr_dev, i); - if (!net_dev) - continue; - - ret = ib_device_set_netdev(ib_dev, net_dev, i + 1); - if (ret) + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND && + hns_roce_bond_is_active(hr_dev)) { + ret = hns_roce_bond_init(hr_dev); + if (ret) { + dev_err(dev, "failed to init bond!\n"); return ret; + } + ret = ib_register_device(ib_dev, "hns_bond_%d", dev); + } else { + for (i = 0; i < hr_dev->caps.num_ports; i++) { + net_dev = get_hr_netdev(hr_dev, i); + if (!net_dev) + continue; + + ret = ib_device_set_netdev(ib_dev, net_dev, i + 1); + if (ret) + return ret; 
+ } + ret = ib_register_device(ib_dev, "hns_%d", dev); } - dma_set_max_seg_size(dev, SZ_2G); - ret = ib_register_device(ib_dev, "hns_%d", dev); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; @@ -1165,10 +1207,10 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) return ret; } -void hns_roce_exit(struct hns_roce_dev *hr_dev) +void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup) { hns_roce_unregister_debugfs(hr_dev); - hns_roce_unregister_device(hr_dev); + hns_roce_unregister_device(hr_dev, bond_cleanup); if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); From 5d91677bbb6435761b51a7154f59dc70af333f4b Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 12 Nov 2025 17:35:08 +0800 Subject: [PATCH 45/65] RDMA/hns: Add delayed work for bonding When conditions are met, schedule a delayed work in bond event handler to perform bonding operation according to the bond state. In the case of changing slave number or link state, re-set the netdev for the bond ibdev after the modification is complete, since these two operations may not call hns_roce_set_bond_netdev() in hns_roce_init(). The delayed work will be paused when there is a driver reset or exit to avoid concurrency. Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20251112093510.3696363-7-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_bond.c | 308 +++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_bond.h | 5 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 13 +- 3 files changed, 325 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c index e09fe7ef16ee..0604ee55011e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.c +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -3,6 +3,7 @@ * Copyright (c) 2025 Hisilicon Limited. */ +#include #include #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" @@ -133,6 +134,32 @@ bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev) return false; } +static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + u32 active_slave_map = 0; + u8 active_slave_num = 0; + bool active; + u8 i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (!net_dev || !(bond_grp->slave_map & (1U << i))) + continue; + + active = (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ? 
+ net_lag_port_dev_txable(net_dev) : + (ib_get_curr_port_state(net_dev) == IB_PORT_ACTIVE); + if (active) { + active_slave_num++; + active_slave_map |= (1U << i); + } + } + + bond_grp->active_slave_num = active_slave_num; + bond_grp->active_slave_map = active_slave_map; +} + static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp, u8 func_idx) { @@ -227,11 +254,14 @@ static struct hns_roce_die_info *alloc_die_info(int bus_num) return NULL; } + mutex_init(&die_info->die_mutex); + return die_info; } static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num) { + mutex_destroy(&die_info->die_mutex); xa_erase(&roce_bond_xa, bus_num); kfree(die_info); } @@ -280,6 +310,168 @@ static int remove_bond_id(int bus_num, u8 bond_id) return 0; } +static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp) +{ + struct hns_roce_dev *hr_dev; + int ret; + int i; + + for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) { + if (bond_grp->slave_map & (1 << i)) + hns_roce_slave_uninit(bond_grp, i); + } + + mutex_lock(&bond_grp->bond_mutex); + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + bond_grp->main_hr_dev = NULL; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + if (bond_grp->slave_map & (1 << i)) { + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + + if (!bond_grp->main_hr_dev) { + ret = -ENODEV; + goto out; + } + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND); + +out: + if (ret) { + BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret); + hns_roce_cleanup_bond(bond_grp); + } else { + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE set bond finished!\n"); + } +} + +static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp) +{ + u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); + struct hns_roce_dev *hr_dev; + u8 i; + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED) + goto out; + + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + bond_grp->main_hr_dev = NULL; + + hns_roce_slave_uninit(bond_grp, main_func_idx); + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) + bond_grp->main_hr_dev = hr_dev; + } + +out: + hns_roce_cleanup_bond(bond_grp); +} + +static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp) +{ + int ret; + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + + mutex_lock(&bond_grp->bond_mutex); + if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGESTATE) + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to change RoCE bond slave state, ret = %d.\n", + ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave changestate finished!\n"); +} + +static void hns_roce_slave_change_num(struct hns_roce_bond_group *bond_grp) +{ + int ret; + u8 i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + if (bond_grp->slave_map & (1U << i)) { + if (i == PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn)) + continue; + hns_roce_slave_uninit(bond_grp, i); + } else { + hns_roce_slave_init(bond_grp, i, true); + if (!bond_grp->main_hr_dev) { + ret = -ENODEV; + goto out; + } + bond_grp->bond_func_info[i].net_dev = NULL; + bond_grp->bond_func_info[i].handle = NULL; + } + } + + hns_roce_bond_get_active_slave(bond_grp); + + ret = 
hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + +out: + if (ret) { + BOND_ERR_LOG("failed to change RoCE bond slave num, ret = %d.\n", ret); + hns_roce_cleanup_bond(bond_grp); + } else { + mutex_lock(&bond_grp->bond_mutex); + if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGE_NUM) + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave change num finished!\n"); + } +} + +static void hns_roce_bond_info_update_nolock(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev) +{ + struct hns_roce_v2_priv *priv; + struct hns_roce_dev *hr_dev; + struct net_device *net_dev; + int func_idx; + + bond_grp->slave_map = 0; + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper_dev, net_dev) { + func_idx = get_netdev_bond_slave_id(net_dev, bond_grp); + if (func_idx < 0) { + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) + continue; + func_idx = PCI_FUNC(hr_dev->pci_dev->devfn); + if (!bond_grp->bond_func_info[func_idx].net_dev) { + priv = hr_dev->priv; + bond_grp->bond_func_info[func_idx].net_dev = + net_dev; + bond_grp->bond_func_info[func_idx].handle = + priv->handle; + } + ib_device_put(&hr_dev->ib_dev); + } + + bond_grp->slave_map |= (1 << func_idx); + } + rcu_read_unlock(); +} + static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp, struct net_device *net_dev) { @@ -332,6 +524,50 @@ static bool check_slave_support(struct hns_roce_bond_group *bond_grp, return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX); } +static void hns_roce_bond_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct hns_roce_bond_group *bond_grp = + container_of(delayed_work, struct hns_roce_bond_group, + bond_work); + enum hns_roce_bond_state bond_state; + bool bond_ready; + + mutex_lock(&bond_grp->bond_mutex); + bond_ready = check_slave_support(bond_grp, bond_grp->upper_dev); + hns_roce_bond_info_update_nolock(bond_grp, bond_grp->upper_dev); + bond_state = bond_grp->bond_state; + bond_grp->bond_ready = bond_ready; + mutex_unlock(&bond_grp->bond_mutex); + + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "bond work: bond_ready - %d, bond_state - %d.\n", + bond_ready, bond_state); + + if (!bond_ready) { + hns_roce_clear_bond(bond_grp); + return; + } + + switch (bond_state) { + case HNS_ROCE_BOND_NOT_BONDED: + hns_roce_set_bond(bond_grp); + /* In set_bond flow, we don't need to set bond netdev here as + * it has been done when bond_grp->main_hr_dev is registered. 
+ */ + return; + case HNS_ROCE_BOND_SLAVE_CHANGESTATE: + hns_roce_slave_changestate(bond_grp); + break; + case HNS_ROCE_BOND_SLAVE_CHANGE_NUM: + hns_roce_slave_change_num(bond_grp); + break; + default: + return; + } + hns_roce_set_bond_netdev(bond_grp, bond_grp->main_hr_dev); +} + static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp, struct hns_roce_dev *hr_dev, struct net_device *upper_dev) @@ -346,6 +582,7 @@ static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp) { mutex_lock(&bond_grp->bond_mutex); + cancel_delayed_work(&bond_grp->bond_work); bond_grp->upper_dev = NULL; bond_grp->main_hr_dev = NULL; bond_grp->bond_ready = false; @@ -591,6 +828,9 @@ static int hns_roce_bond_event(struct notifier_block *self, if (event == NETDEV_CHANGELOWERSTATE) changed = hns_roce_bond_lowerstate_event(bond_grp, ptr); + if (changed) + schedule_delayed_work(&bond_grp->bond_work, HZ); + return NOTIFY_DONE; } @@ -613,6 +853,7 @@ int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev) } mutex_init(&bond_grp->bond_mutex); + INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_bond_work); bond_grp->bond_ready = false; bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED; @@ -645,6 +886,7 @@ int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev) mem_err: for (i--; i >= 0; i--) { unregister_netdevice_notifier(&bgrps[i]->bond_nb); + cancel_delayed_work_sync(&bgrps[i]->bond_work); remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id); mutex_destroy(&bgrps[i]->bond_mutex); kvfree(bgrps[i]); @@ -665,6 +907,7 @@ void hns_roce_dealloc_bond_grp(void) if (!bond_grp) continue; unregister_netdevice_notifier(&bond_grp->bond_nb); + cancel_delayed_work_sync(&bond_grp->bond_work); remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); mutex_destroy(&bond_grp->bond_mutex); kvfree(bond_grp); @@ -682,3 +925,68 @@ int hns_roce_bond_init(struct hns_roce_dev *hr_dev) return hns_roce_set_bond_netdev(bond_grp, hr_dev); } + +void hns_roce_bond_suspend(struct hnae3_handle *handle) +{ + u8 bus_num = handle->pdev->bus->number; + struct hns_roce_bond_group *bond_grp; + struct hns_roce_die_info *die_info; + int i; + + die_info = xa_load(&roce_bond_xa, bus_num); + if (!die_info) + return; + + mutex_lock(&die_info->die_mutex); + + /* + * Avoid duplicated processing when calling this function + * multiple times. 
+ */ + if (die_info->suspend_cnt) + goto out; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + unregister_netdevice_notifier(&bond_grp->bond_nb); + cancel_delayed_work_sync(&bond_grp->bond_work); + } + +out: + die_info->suspend_cnt++; + mutex_unlock(&die_info->die_mutex); +} + +void hns_roce_bond_resume(struct hnae3_handle *handle) +{ + u8 bus_num = handle->pdev->bus->number; + struct hns_roce_bond_group *bond_grp; + struct hns_roce_die_info *die_info; + int i, ret; + + die_info = xa_load(&roce_bond_xa, bus_num); + if (!die_info) + return; + + mutex_lock(&die_info->die_mutex); + + die_info->suspend_cnt--; + if (die_info->suspend_cnt) + goto out; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + ret = register_netdevice_notifier(&bond_grp->bond_nb); + if (ret) + dev_err(&handle->pdev->dev, + "failed to resume bond notifier(bus_num = %u, id = %u), ret = %d.\n", + bus_num, bond_grp->bond_id, ret); + } + +out: + mutex_unlock(&die_info->die_mutex); +} diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h index 3ef7d28379cc..98c295d78ca1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.h +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -72,11 +72,14 @@ struct hns_roce_bond_group { enum netdev_lag_hash hash_type; struct mutex bond_mutex; struct notifier_block bond_nb; + struct delayed_work bond_work; }; struct hns_roce_die_info { u8 bond_id_mask; struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX]; + struct mutex die_mutex; + u8 suspend_cnt; }; struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, @@ -86,5 +89,7 @@ void hns_roce_dealloc_bond_grp(void); void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp); bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev); int hns_roce_bond_init(struct hns_roce_dev *hr_dev); +void hns_roce_bond_suspend(struct hnae3_handle *handle); +void hns_roce_bond_resume(struct hnae3_handle *handle); #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 99c9db255346..4c8c69b8e3ff 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7204,14 +7204,20 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, bool reset) { + /* Suspend bond to avoid concurrency */ + hns_roce_bond_suspend(handle); + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) - return; + goto out; handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT; __hns_roce_hw_v2_uninit_instance(handle, reset, true); handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; + +out: + hns_roce_bond_resume(handle); } struct hns_roce_dev @@ -7251,6 +7257,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) { struct hns_roce_dev *hr_dev; + /* Suspend bond to avoid concurrency */ + hns_roce_bond_suspend(handle); + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) { set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state); return 0; @@ -7281,6 +7290,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle) if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state)) { handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED; + hns_roce_bond_resume(handle); return 0; } @@ -7300,6 +7310,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct 
hnae3_handle *handle) dev_info(dev, "reset done, RoCE client reinit finished.\n"); } + hns_roce_bond_resume(handle); return ret; } From e72d274f8f5b9912c8e6590004a470a1ad8f4983 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 12 Nov 2025 17:35:09 +0800 Subject: [PATCH 46/65] RDMA/hns: Support link state reporting for bond The link state of bond depends on the upper device. Adapt current link state querying flow and ib_event dispatching flow to report correct link state of bond. Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20251112093510.3696363-8-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 8 ++ drivers/infiniband/hw/hns/hns_roce_main.c | 99 ++++++++++++++++------ 2 files changed, 83 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4c8c69b8e3ff..3f2170647061 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7358,6 +7358,14 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, if (linkup || !hr_dev) return; + /* For bond device, the link status depends on the upper netdev, + * and the upper device's link status depends on all the slaves' + * netdev but not only one. So bond device cannot get a correct + * link status from this path. + */ + if (hns_roce_get_bond_grp(netdev, get_hr_bus_num(hr_dev))) + return; + ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev); } diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 4e8807a04298..fc62efdc45eb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -89,30 +89,75 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) return ret; } -static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port, - unsigned long event) +static int hns_roce_get_port_state(struct hns_roce_dev *hr_dev, u32 port_num, + enum ib_port_state *state) { + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + struct net_device *net_dev; + + net_dev = ib_device_get_netdev(&hr_dev->ib_dev, port_num); + if (!net_dev) + return -ENODEV; + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp) { + *state = ib_get_curr_port_state(bond_grp->upper_dev); + goto out; + } + } + + *state = ib_get_curr_port_state(net_dev); +out: + dev_put(net_dev); + return 0; +} + +static int handle_en_event(struct net_device *netdev, + struct hns_roce_dev *hr_dev, + u32 port, unsigned long event) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; struct device *dev = hr_dev->dev; - struct net_device *netdev; + enum ib_port_state curr_state; + struct ib_event ibevent; int ret = 0; - netdev = hr_dev->iboe.netdevs[port]; if (!netdev) { dev_err(dev, "can't find netdev on port(%u)!\n", port); return -ENODEV; } switch (event) { - case NETDEV_UP: - case NETDEV_CHANGE: case NETDEV_REGISTER: case NETDEV_CHANGEADDR: ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); break; + case NETDEV_UP: + case NETDEV_CHANGE: + ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); + if (ret) + return ret; + fallthrough; case NETDEV_DOWN: - /* - * In v1 engine, only support all ports closed together. 
- */ + if (!netif_is_lag_master(netdev)) + break; + curr_state = ib_get_curr_port_state(netdev); + + write_lock_irq(&ibdev->cache_lock); + if (ibdev->port_data[port].cache.last_port_state == curr_state) { + write_unlock_irq(&ibdev->cache_lock); + return 0; + } + ibdev->port_data[port].cache.last_port_state = curr_state; + write_unlock_irq(&ibdev->cache_lock); + + ibevent.event = (curr_state == IB_PORT_DOWN) ? + IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; + ibevent.device = ibdev; + ibevent.element.port_num = port + 1; + ib_dispatch_event(&ibevent); break; default: dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event)); @@ -126,17 +171,25 @@ static int hns_roce_netdev_event(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct hns_roce_bond_group *bond_grp; struct hns_roce_ib_iboe *iboe = NULL; struct hns_roce_dev *hr_dev = NULL; + struct net_device *upper = NULL; int ret; u32 port; hr_dev = container_of(self, struct hns_roce_dev, iboe.nb); iboe = &hr_dev->iboe; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0), + get_hr_bus_num(hr_dev)); + upper = bond_grp ? bond_grp->upper_dev : NULL; + } for (port = 0; port < hr_dev->caps.num_ports; port++) { - if (dev == iboe->netdevs[port]) { - ret = handle_en_event(hr_dev, port, event); + if ((!upper && dev == iboe->netdevs[port]) || + (upper && dev == upper)) { + ret = handle_en_event(dev, hr_dev, port, event); if (ret) return NOTIFY_DONE; break; @@ -222,9 +275,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, struct ib_port_attr *props) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); - struct device *dev = hr_dev->dev; struct net_device *net_dev; - unsigned long flags; enum ib_mtu mtu; u32 port; int ret; @@ -245,26 +296,26 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, if (ret) ibdev_warn(ib_dev, "failed to get speed, ret = %d.\n", ret); - spin_lock_irqsave(&hr_dev->iboe.lock, flags); - - net_dev = get_hr_netdev(hr_dev, port); + net_dev = ib_device_get_netdev(ib_dev, port_num); if (!net_dev) { - spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); - dev_err(dev, "find netdev %u failed!\n", port); + ibdev_err(ib_dev, "find netdev %u failed!\n", port); return -EINVAL; } mtu = iboe_get_mtu(net_dev->mtu); props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256; - props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ? - IB_PORT_ACTIVE : - IB_PORT_DOWN; + + dev_put(net_dev); + + ret = hns_roce_get_port_state(hr_dev, port_num, &props->state); + if (ret) { + ibdev_err(ib_dev, "failed to get port state.\n"); + return ret; + } + props->phys_state = props->state == IB_PORT_ACTIVE ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; - - spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); - return 0; } From d70f30cef2dfa65757d8146d6d0524b6872e9845 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 12 Nov 2025 17:35:10 +0800 Subject: [PATCH 47/65] RDMA/hns: Support reset recovery for bond Re-set bond configuration to HW after HW reset. 
Signed-off-by: Junxian Huang
Link: https://patch.msgid.link/20251112093510.3696363-9-huangjunxian6@hisilicon.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/hns/hns_roce_bond.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
index 0604ee55011e..cc85f3ce1f3e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_bond.c
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -160,6 +160,15 @@ static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp)
 	bond_grp->active_slave_map = active_slave_map;
 }
 
+static int hns_roce_recover_bond(struct hns_roce_bond_group *bond_grp,
+				 struct hns_roce_dev *hr_dev)
+{
+	bond_grp->main_hr_dev = hr_dev;
+	hns_roce_bond_get_active_slave(bond_grp);
+
+	return hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+}
+
 static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp,
 				  u8 func_idx)
 {
@@ -918,11 +927,22 @@ void hns_roce_dealloc_bond_grp(void)
 
 int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
 {
 	struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hns_roce_bond_group *bond_grp;
 	u8 bus_num = get_hr_bus_num(hr_dev);
+	int ret;
 
 	bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
 
+	if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT) {
+		ret = hns_roce_recover_bond(bond_grp, hr_dev);
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"failed to recover RoCE bond, ret = %d.\n", ret);
+			return ret;
+		}
+	}
+
 	return hns_roce_set_bond_netdev(bond_grp, hr_dev);
 }

From 6afe40ff484a1155b71158b911c65299496e35c3 Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Wed, 19 Nov 2025 23:36:54 -0800
Subject: [PATCH 48/65] RDMA/bnxt_re: Fix the inline size for GenP7 devices

Inline size supported by the device is based on the number of SGEs
supported by the adapter. Change the inline size calculation
accordingly: each SQ SGE is 16 bytes (sizeof(struct sq_sge)), so a
device reporting N SGEs supports 16 * N bytes of inline data.

Fixes: de1d364c3815 ("RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters")
Reviewed-by: Kashyap Desai
Signed-off-by: Kalesh AP
Signed-off-by: Selvin Xavier
Link: https://patch.msgid.link/1763624215-10382-1-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/bnxt_re/qplib_sp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 9ef581ed785c..a9afac2cbb7c 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -162,7 +162,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw)
 	attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
 	attr->max_srq_sges = sb->max_srq_sge;
 	attr->max_pkey = 1;
-	attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
+	attr->max_inline_data = attr->max_qp_sges * sizeof(struct sq_sge);
 	if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx))
 		attr->l2_db_size = (sb->l2_db_space_size + 1) *
 				    (0x01 << RCFW_DBR_BASE_PAGE_SHIFT);

From a26c4c7cdb50247b8486f1caa1ea8ab5e5c37edf Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Wed, 19 Nov 2025 23:36:55 -0800
Subject: [PATCH 49/65] RDMA/bnxt_re: Pass correct flag for dma mr creation

DMA MR doesn't use the unified MR model, so the lkey passed on to the
reg_mr command to FW should contain the correct lkey. The driver is
incorrectly overwriting the lkey with the pdid, and firmware commands
fail because of this. Avoid passing the wrong key for cases where the
unified MR registration is not used.
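
The key-selection rule this restores can be shown with a tiny
standalone model (illustrative only; struct toy_mr and reg_mr_key()
loosely mirror the fields of bnxt_qplib_mrw, and the real decision is
made in bnxt_qplib_reg_mr()):

	#include <stdint.h>
	#include <stdio.h>

	struct toy_mr { uint32_t lkey; uint32_t pd_id; };

	/* Unified MR registration lets FW derive the key from the PD id;
	 * a DMA MR must keep the lkey the driver already allocated. */
	static uint32_t reg_mr_key(const struct toy_mr *mr, int unified_mr)
	{
		return unified_mr ? mr->pd_id : mr->lkey;
	}

	int main(void)
	{
		struct toy_mr dma_mr = { .lkey = 0x1234, .pd_id = 7 };

		printf("DMA MR key: 0x%x\n", reg_mr_key(&dma_mr, 0));
		return 0;
	}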
Fixes: f786eebbbefa ("RDMA/bnxt_re: Avoid an extra hwrm per MR creation") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1763624215-10382-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 8 +++++--- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 6 +++--- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 4dab5ca7362b..13e9667ee04a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -601,7 +601,8 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->qplib_mr.va = (u64)(unsigned long)fence->va; mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, - BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE); + BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE, + _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)); if (rc) { ibdev_err(&rdev->ibdev, "Failed to register fence-MR\n"); goto fail; @@ -4032,7 +4033,7 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->qplib_mr.hwq.level = PBL_LVL_MAX; mr->qplib_mr.total_size = -1; /* Infinte length */ rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, 0, - PAGE_SIZE); + PAGE_SIZE, false); if (rc) goto fail_mr; @@ -4262,7 +4263,8 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 umem_pgs = ib_umem_num_dma_blocks(umem, page_size); rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem, - umem_pgs, page_size); + umem_pgs, page_size, + _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)); if (rc) { ibdev_err(&rdev->ibdev, "Failed to register user MR - rc = %d\n", rc); rc = -EIO; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index a9afac2cbb7c..408a34df2667 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -578,7 +578,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, } int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - struct ib_umem *umem, int num_pbls, u32 buf_pg_size) + struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct bnxt_qplib_hwq_attr hwq_attr = {}; @@ -640,7 +640,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, req.access = (mr->access_flags & BNXT_QPLIB_MR_ACCESS_MASK); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); - if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) + if (unified_mr) req.key = cpu_to_le32(mr->pd->id); req.flags = cpu_to_le16(mr->flags); req.mr_size = cpu_to_le64(mr->total_size); @@ -651,7 +651,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, if (rc) goto fail; - if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) { + if (unified_mr) { mr->lkey = le32_to_cpu(resp.xid); mr->rkey = mr->lkey; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 147b5d9c0313..5a45c55c6464 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -341,7 +341,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, bool block); int 
bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - struct ib_umem *umem, int num_pbls, u32 buf_pg_size); + struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr); int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr); int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, int max); From 6dbd547adad534c0daad13ca9e1f862278ca955b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Nov 2025 16:49:28 +0200 Subject: [PATCH 50/65] IB/mlx5: Reduce IMR KSM size when 5-level paging is enabled Enabling 5-level paging (LA57) increases TASK_SIZE on x86_64 from 2^47 to 2^56. This affects implicit ODP, which uses TASK_SIZE to calculate the number of IMR KSM entries. As a result, the number of entries and the memory usage for KSM mkeys increase drastically: - With 2^47 TASK_SIZE: 0x20000 entries (~2MB) - With 2^56 TASK_SIZE: 0x4000000 entries (~1GB) This issue could happen previously on systems with LA57 manually enabled, but now commit 7212b58d6d71 ("x86/mm/64: Make 5-level paging support unconditional") enables LA57 by default on all supported systems, which makes the issue far more widespread. To mitigate this, increase the size each MTT entry maps from 1GB to 16GB when 5-level paging is enabled. This reduces the number of KSM entries and lowers the memory usage on LA57 systems from 1GB to 64MB per IMR. Since 'mlx5_imr_mtt_size' is now larger than 32 bits, use u64 instead of int in populate_klm() to prevent overflow of the 'step' variable. In addition, since populate_klm() actually handles KSM and not KLM (it is used only by implicit ODP), rename it and the internal structures accordingly, dropping the byte_count handling, which is not relevant for KSM. The page size in KSM is fixed for all the entries and comes from the log_page_size of the mkey. Note: On platforms where the calculated value for 'mlx5_imr_ksm_page_shift' is higher than the max firmware cap that can be changed over UMR, or where the calculated value for 'log_va_pages' is higher than expected, the implicit ODP cap is simply turned off. Co-developed-by: Or Har-Toov Signed-off-by: Or Har-Toov Signed-off-by: Yishai Hadas Reviewed-by: Michael Guralnik Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251120-reduce-ksm-v1-1-6864bfc814dc@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 91 ++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 6441abdf1f3b..e71ee3d52eb0 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -97,33 +97,28 @@ struct mlx5_pagefault { * a pagefault. 
*/ #define MMU_NOTIFIER_TIMEOUT 1000 -#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) -#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) -#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) -#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) -#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) - -#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT - static u64 mlx5_imr_ksm_entries; +static u64 mlx5_imr_mtt_entries; +static u64 mlx5_imr_mtt_size; +static u8 mlx5_imr_mtt_shift; +static u8 mlx5_imr_ksm_page_shift; -static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, +static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; - struct mlx5_klm *end = pklm + nentries; - int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + struct mlx5_ksm *end = pksm + nentries; + u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0; __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? cpu_to_be32(imr->null_mmkey.key) : mr_to_mdev(imr)->mkeys.null_mkey; u64 va = - MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; + MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (; pklm != end; pklm++, idx++, va += step) { - pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = key; - pklm->va = cpu_to_be64(va); + for (; pksm != end; pksm++, idx++, va += step) { + pksm->key = key; + pksm->va = cpu_to_be64(va); } return; } @@ -147,16 +142,15 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, */ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); - for (; pklm != end; pklm++, idx++, va += step) { + for (; pksm != end; pksm++, idx++, va += step) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); - pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); if (mtt) { - pklm->key = cpu_to_be32(mtt->ibmr.lkey); - pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); + pksm->key = cpu_to_be32(mtt->ibmr.lkey); + pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size); } else { - pklm->key = key; - pklm->va = cpu_to_be64(va); + pksm->key = key; + pksm->va = cpu_to_be64(va); } } } @@ -201,7 +195,7 @@ int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags) { if (flags & MLX5_IB_UPD_XLT_INDIRECT) { - populate_klm(xlt, idx, nentries, mr, flags); + populate_ksm(xlt, idx, nentries, mr, flags); return 0; } else { return populate_mtt(xlt, idx, nentries, mr, flags); @@ -226,7 +220,7 @@ static void free_implicit_child_mr_work(struct work_struct *work) mutex_lock(&odp_imr->umem_mutex); mlx5r_umr_update_xlt(mr->parent, - ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0, + ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); mutex_unlock(&odp_imr->umem_mutex); mlx5_ib_dereg_mr(&mr->ibmr, NULL); @@ -237,7 +231,7 @@ static void free_implicit_child_mr_work(struct work_struct *work) static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; + unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift; struct mlx5_ib_mr *imr = mr->parent; /* @@ -425,7 +419,10 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && MLX5_CAP_GEN(dev->mdev, null_mkey) && MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && - 
!MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled)) + !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) && + mlx5_imr_ksm_entries != 0 && + !(mlx5_imr_ksm_page_shift > + get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM))) caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; } @@ -476,14 +473,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, int err; odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), - idx * MLX5_IMR_MTT_SIZE, - MLX5_IMR_MTT_SIZE, &mlx5_mn_ops); + idx * mlx5_imr_mtt_size, + mlx5_imr_mtt_size, &mlx5_mn_ops); if (IS_ERR(odp)) return ERR_CAST(odp); mr = mlx5_mr_cache_alloc(dev, imr->access_flags, MLX5_MKC_ACCESS_MODE_MTT, - MLX5_IMR_MTT_ENTRIES); + mlx5_imr_mtt_entries); if (IS_ERR(mr)) { ib_umem_odp_release(odp); return mr; @@ -495,7 +492,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, mr->umem = &odp->umem; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; - mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE; + mr->ibmr.iova = idx * mlx5_imr_mtt_size; mr->parent = imr; odp->private = mr; @@ -506,7 +503,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, refcount_set(&mr->mmkey.usecount, 2); err = mlx5r_umr_update_xlt(mr, 0, - MLX5_IMR_MTT_ENTRIES, + mlx5_imr_mtt_entries, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ENABLE); @@ -611,7 +608,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct mlx5_ib_mr *imr; int err; - if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE)) + if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE)) return ERR_PTR(-EOPNOTSUPP); umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); @@ -647,7 +644,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, err = mlx5r_umr_update_xlt(imr, 0, mlx5_imr_ksm_entries, - MLX5_KSM_PAGE_SHIFT, + mlx5_imr_ksm_page_shift, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ENABLE); @@ -750,20 +747,20 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, struct ib_umem_odp *odp_imr, u64 user_va, size_t bcnt, u32 *bytes_mapped, u32 flags) { - unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; + unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift; unsigned long upd_start_idx = end_idx + 1; unsigned long upd_len = 0; unsigned long npages = 0; int err; int ret; - if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || - mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) + if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size || + mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt)) return -EFAULT; /* Fault each child mr that intersects with our interval. 
*/ while (bcnt) { - unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; + unsigned long idx = user_va >> mlx5_imr_mtt_shift; struct ib_umem_odp *umem_odp; struct mlx5_ib_mr *mtt; u64 len; @@ -1924,9 +1921,25 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) int mlx5_ib_odp_init(void) { - mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - - MLX5_IMR_MTT_BITS); + u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT; + u8 mlx5_imr_mtt_bits; + /* 48 is default ARM64 VA space and covers X86 4-level paging which is 47 */ + if (log_va_pages <= 48 - PAGE_SHIFT) + mlx5_imr_mtt_shift = 30; + /* 56 is x86-64, 5-level paging */ + else if (log_va_pages <= 56 - PAGE_SHIFT) + mlx5_imr_mtt_shift = 34; + else + return 0; + + mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift); + mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT; + mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits); + mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - + mlx5_imr_mtt_bits); + + mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift; return 0; } From 0f1f9b5e47cec229dc2127481807823b75e933b0 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 20 Nov 2025 17:15:15 +0200 Subject: [PATCH 51/65] RDMA/core: Add new IB rate for XDR (8x) support Add the new rates as defined in the Infiniband spec for XDR and 8x link width support. Furthermore, modify the utility conversion methods accordingly. Reference: IB Spec Release 1.8 Reviewed-by: Michael Guralnik Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/20251120-speed-8-v1-1-e6a7efef8cb8@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 3 +++ include/rdma/ib_verbs.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 3a5f81402d2f..11b1a194de44 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -148,6 +148,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) case IB_RATE_400_GBPS: return 160; case IB_RATE_600_GBPS: return 240; case IB_RATE_800_GBPS: return 320; + case IB_RATE_1600_GBPS: return 640; default: return -1; } } @@ -178,6 +179,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) case 160: return IB_RATE_400_GBPS; case 240: return IB_RATE_600_GBPS; case 320: return IB_RATE_800_GBPS; + case 640: return IB_RATE_1600_GBPS; default: return IB_RATE_PORT_CURRENT; } } @@ -208,6 +210,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) case IB_RATE_400_GBPS: return 425000; case IB_RATE_600_GBPS: return 637500; case IB_RATE_800_GBPS: return 850000; + case IB_RATE_1600_GBPS: return 1700000; default: return -1; } } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0a85af610b6b..6aad66bc5dd7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -859,6 +859,7 @@ enum ib_rate { IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, IB_RATE_800_GBPS = 23, + IB_RATE_1600_GBPS = 25, }; /** From 4022c7b6342a4d9a97e1e974e27efca95e79ed20 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 20 Nov 2025 17:15:16 +0200 Subject: [PATCH 52/65] RDMA/mlx5: Add support for 1600_8x lane speed Add a check for 1600G_8X link speed when querying PTYS and report it back correctly when needed. While at it, adjust mlx5 function which maps the speed rate from IB spec values to internal driver values to be able to handle speeds up to 1600Gbps. 
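As a cross-check of the table entries above, the IB rate multiplier is the link rate divided by the 2.5 Gbps SDR 1x base rate, so 1600 Gbps maps to 640, consistent with 800 -> 320 and 600 -> 240. A standalone sketch of that relation for whole-Gbps rates (this is not the in-tree ib_rate_to_mult(), which works from the enum):

        static int gbps_to_ib_mult(unsigned int gbps)
        {
                /* multiplier relative to 2.5 Gbps: 1600 * 10 / 25 == 640 */
                return gbps * 10 / 25;
        }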
Reviewed-by: Michael Guralnik Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/20251120-speed-8-v1-2-e6a7efef8cb8@nvidia.com Reviewed-by: Kalesh AP Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 4 ++++ drivers/infiniband/hw/mlx5/qp.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 90daa58126f4..40284bbb45d6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -511,6 +511,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_XDR; break; + case MLX5E_PROT_MASK(MLX5E_1600TAUI_8_1600TBASE_CR8_KR8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_XDR; + break; default: return -EINVAL; } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 88724d15705d..69af20790481 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3451,10 +3451,11 @@ int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate) { u32 stat_rate_support; - if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS) + if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS || + rate == IB_RATE_1600_GBPS) return 0; - if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS) + if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_1600_GBPS) return -EINVAL; stat_rate_support = MLX5_CAP_GEN(dev->mdev, stat_rate_support); From a521928164433de44fed5aaf5f49aeb3f1fb96f5 Mon Sep 17 00:00:00 2001 From: Krzysztof Czurylo Date: Mon, 24 Nov 2025 20:53:42 -0600 Subject: [PATCH 53/65] RDMA/irdma: Fix data race in irdma_sc_ccq_arm Adds a lock around irdma_sc_ccq_arm body to prevent inter-thread data race. 
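The race being closed is an unserialized read-modify-write of the 64-bit doorbell shadow word, performed concurrently by two CQP completion contexts. A minimal sketch of the pattern, with illustrative names rather than the real irdma structures:

        struct cq_shadow {
                spinlock_t lock;        /* serializes shadow-area updates */
                u64 dbsa;               /* doorbell shadow word */
        };

        static void arm_cq(struct cq_shadow *cq, u64 arm_bits)
        {
                unsigned long flags;

                spin_lock_irqsave(&cq->lock, flags);
                cq->dbsa |= arm_bits;   /* RMW now atomic vs. other armers */
                spin_unlock_irqrestore(&cq->lock, flags);
        }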
Fixes data race in irdma_sc_ccq_arm() reported by KCSAN: BUG: KCSAN: data-race in irdma_sc_ccq_arm [irdma] / irdma_sc_ccq_arm [irdma] read to 0xffff9d51b4034220 of 8 bytes by task 255 on cpu 11: irdma_sc_ccq_arm+0x36/0xd0 [irdma] irdma_cqp_ce_handler+0x300/0x310 [irdma] cqp_compl_worker+0x2a/0x40 [irdma] process_one_work+0x402/0x7e0 worker_thread+0xb3/0x6d0 kthread+0x178/0x1a0 ret_from_fork+0x2c/0x50 write to 0xffff9d51b4034220 of 8 bytes by task 89 on cpu 3: irdma_sc_ccq_arm+0x7e/0xd0 [irdma] irdma_cqp_ce_handler+0x300/0x310 [irdma] irdma_wait_event+0xd4/0x3e0 [irdma] irdma_handle_cqp_op+0xa5/0x220 [irdma] irdma_hw_flush_wqes+0xb1/0x300 [irdma] irdma_flush_wqes+0x22e/0x3a0 [irdma] irdma_cm_disconn_true+0x4c7/0x5d0 [irdma] irdma_disconnect_worker+0x35/0x50 [irdma] process_one_work+0x402/0x7e0 worker_thread+0xb3/0x6d0 kthread+0x178/0x1a0 ret_from_fork+0x2c/0x50 value changed: 0x0000000000024000 -> 0x0000000000034000 Fixes: 3f49d6842569 ("RDMA/irdma: Implement HW Admin Queue OPs") Signed-off-by: Krzysztof Czurylo Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251125025350.180-2-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/ctrl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 57bf6815e71e..c17b1c14dfe2 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -3868,11 +3868,13 @@ int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp) */ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) { + unsigned long flags; u64 temp_val; u16 sw_cq_sel; u8 arm_next_se; u8 arm_seq_num; + spin_lock_irqsave(&ccq->dev->cqp_lock, flags); get_64bit_val(ccq->cq_uk.shadow_area, 32, &temp_val); sw_cq_sel = (u16)FIELD_GET(IRDMA_CQ_DBSA_SW_CQ_SELECT, temp_val); arm_next_se = (u8)FIELD_GET(IRDMA_CQ_DBSA_ARM_NEXT_SE, temp_val); @@ -3883,6 +3885,7 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT_SE, arm_next_se) | FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT, 1); set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val); + spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags); dma_wmb(); /* make sure shadow area is updated before arming */ From 81f44409fb4f027d1e6d54edbeba5156ad94b214 Mon Sep 17 00:00:00 2001 From: Krzysztof Czurylo Date: Mon, 24 Nov 2025 20:53:43 -0600 Subject: [PATCH 54/65] RDMA/irdma: Fix data race in irdma_free_pble Protects pble_rsrc counters with mutex to prevent data race. 
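The counter updates have the same shape: 'freedpbles += palloc->total_cnt' compiles to a load, add, and store, so two concurrent frees can lose an update. A minimal sketch with hypothetical names:

        struct pble_stats {
                struct mutex lock;      /* protects the counters below */
                u64 freed;
                u64 alloc_freed;
        };

        static void account_free(struct pble_stats *s, u32 cnt)
        {
                mutex_lock(&s->lock);
                s->freed += cnt;        /* serialized, no lost updates */
                s->alloc_freed++;
                mutex_unlock(&s->lock);
        }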
Fixes the following data race in irdma_free_pble reported by KCSAN: BUG: KCSAN: data-race in irdma_free_pble [irdma] / irdma_free_pble [irdma] write to 0xffff91430baa0078 of 8 bytes by task 16956 on cpu 5: irdma_free_pble+0x3b/0xb0 [irdma] irdma_dereg_mr+0x108/0x110 [irdma] ib_dereg_mr_user+0x74/0x160 [ib_core] uverbs_free_mr+0x26/0x30 [ib_uverbs] destroy_hw_idr_uobject+0x4a/0x90 [ib_uverbs] uverbs_destroy_uobject+0x7b/0x330 [ib_uverbs] uobj_destroy+0x61/0xb0 [ib_uverbs] ib_uverbs_run_method+0x1f2/0x380 [ib_uverbs] ib_uverbs_cmd_verbs+0x365/0x440 [ib_uverbs] ib_uverbs_ioctl+0x111/0x190 [ib_uverbs] __x64_sys_ioctl+0xc9/0x100 do_syscall_64+0x44/0xa0 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 read to 0xffff91430baa0078 of 8 bytes by task 16953 on cpu 2: irdma_free_pble+0x23/0xb0 [irdma] irdma_dereg_mr+0x108/0x110 [irdma] ib_dereg_mr_user+0x74/0x160 [ib_core] uverbs_free_mr+0x26/0x30 [ib_uverbs] destroy_hw_idr_uobject+0x4a/0x90 [ib_uverbs] uverbs_destroy_uobject+0x7b/0x330 [ib_uverbs] uobj_destroy+0x61/0xb0 [ib_uverbs] ib_uverbs_run_method+0x1f2/0x380 [ib_uverbs] ib_uverbs_cmd_verbs+0x365/0x440 [ib_uverbs] ib_uverbs_ioctl+0x111/0x190 [ib_uverbs] __x64_sys_ioctl+0xc9/0x100 do_syscall_64+0x44/0xa0 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 value changed: 0x0000000000005a62 -> 0x0000000000005a68 Fixes: e8c4dbc2fcac ("RDMA/irdma: Add PBLE resource manager") Signed-off-by: Krzysztof Czurylo Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251125025350.180-3-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/pble.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index 3091f9345f12..4a20ec891059 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -506,12 +506,14 @@ int irdma_get_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, struct irdma_pble_alloc *palloc) { - pble_rsrc->freedpbles += palloc->total_cnt; - if (palloc->level == PBLE_LEVEL_2) free_lvl2(pble_rsrc, palloc); else irdma_prm_return_pbles(&pble_rsrc->pinfo, &palloc->level1.chunkinfo); + + mutex_lock(&pble_rsrc->pble_mutex_lock); + pble_rsrc->freedpbles += palloc->total_cnt; pble_rsrc->stats_alloc_freed++; + mutex_unlock(&pble_rsrc->pble_mutex_lock); } From 9e13d880ebae5da9b39ef2ed83a89737e927173f Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Mon, 24 Nov 2025 20:53:44 -0600 Subject: [PATCH 55/65] RDMA/irdma: Add a missing kfree of struct irdma_pci_f for GEN2 During a refactor of the irdma GEN2 code, the kfree of the irdma_pci_f struct in icrdma_remove(), which was originally introduced upstream as part of commit 80f2ab46c2ee ("irdma: free iwdev->rf after removing MSI-X") was accidentally removed. 
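The restored teardown is plain probe/remove symmetry; a sketch of the intended ordering with hypothetical names (the real code is in icrdma_remove()):

        static void example_remove(struct example_ibdev *iwdev)
        {
                unregister_ib_device(iwdev);    /* stop all users of iwdev->rf */
                deinit_interrupts(iwdev->rf);   /* last consumer of rf */
                kfree(iwdev->rf);               /* balances the probe-time allocation */
        }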
Fixes: 0c2b80cac96e ("RDMA/irdma: Refactor GEN2 auxiliary driver") Signed-off-by: Krzysztof Czurylo Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251125025350.180-4-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/icrdma_if.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c index 27b191f61caf..5d3fd118e4f8 100644 --- a/drivers/infiniband/hw/irdma/icrdma_if.c +++ b/drivers/infiniband/hw/irdma/icrdma_if.c @@ -320,6 +320,8 @@ static void icrdma_remove(struct auxiliary_device *aux_dev) irdma_ib_unregister_device(iwdev); icrdma_deinit_interrupts(iwdev->rf, cdev_info); + kfree(iwdev->rf); + pr_debug("INIT: Gen[%d] func[%d] device remove success\n", rdma_ver, PCI_FUNC(cdev_info->pdev->devfn)); } From 5eff1ecce30143c3f8924d91770d81d44bd5abe5 Mon Sep 17 00:00:00 2001 From: Krzysztof Czurylo Date: Mon, 24 Nov 2025 20:53:45 -0600 Subject: [PATCH 56/65] RDMA/irdma: Fix SIGBUS in AEQ destroy Removes write to IRDMA_PFINT_AEQCTL register prior to destroying AEQ, as this register does not exist in GEN3+ hardware and this kind of IRQ configuration is no longer required. Fixes: b800e82feba7 ("RDMA/irdma: Add GEN3 support for AEQ and CEQ") Signed-off-by: Krzysztof Czurylo Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251125025350.180-5-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/ctrl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index c17b1c14dfe2..ce5cf89c463c 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -4635,7 +4635,8 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch, u64 hdr; dev = aeq->dev; - if (dev->privileged) + + if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2) writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]); cqp = dev->cqp; From 35bd787babd1f5a42641d0b1513edbed5d4e1624 Mon Sep 17 00:00:00 2001 From: Anil Samal Date: Mon, 24 Nov 2025 20:53:46 -0600 Subject: [PATCH 57/65] RDMA/irdma: Add missing mutex destroy Add missing destroy of ah_tbl_lock and vchnl_mutex. 
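As with the kfree above, the point is init/teardown symmetry: every mutex_init() needs a matching mutex_destroy(), including on error-unwind paths, so mutex debugging does not report leaked locks. A sketch with hypothetical names:

        struct example_rf {
                struct mutex tbl_lock;
        };

        static int example_init(struct example_rf *rf)
        {
                int err;

                mutex_init(&rf->tbl_lock);
                err = example_start_workqueue(rf);      /* hypothetical */
                if (err)
                        goto err_wq;

                return 0;

        err_wq:
                mutex_destroy(&rf->tbl_lock);   /* unwind in reverse order */
                return err;
        }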
Fixes: d5edd33364a5 ("RDMA/irdma: RDMA/irdma: Add GEN3 core driver support") Signed-off-by: Anil Samal Signed-off-by: Krzysztof Czurylo Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251125025350.180-6-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/icrdma_if.c | 4 +++- drivers/infiniband/hw/irdma/ig3rdma_if.c | 4 ++++ drivers/infiniband/hw/irdma/verbs.c | 4 +++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c index 5d3fd118e4f8..b49fd9cf2476 100644 --- a/drivers/infiniband/hw/irdma/icrdma_if.c +++ b/drivers/infiniband/hw/irdma/icrdma_if.c @@ -302,7 +302,8 @@ static int icrdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary err_ctrl_init: icrdma_deinit_interrupts(rf, cdev_info); err_init_interrupts: - kfree(iwdev->rf); + mutex_destroy(&rf->ah_tbl_lock); + kfree(rf); ib_dealloc_device(&iwdev->ibdev); return err; @@ -319,6 +320,7 @@ static void icrdma_remove(struct auxiliary_device *aux_dev) ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); irdma_ib_unregister_device(iwdev); icrdma_deinit_interrupts(iwdev->rf, cdev_info); + mutex_destroy(&iwdev->rf->ah_tbl_lock); kfree(iwdev->rf); diff --git a/drivers/infiniband/hw/irdma/ig3rdma_if.c b/drivers/infiniband/hw/irdma/ig3rdma_if.c index 1bb42eb298ba..e1d6670d9396 100644 --- a/drivers/infiniband/hw/irdma/ig3rdma_if.c +++ b/drivers/infiniband/hw/irdma/ig3rdma_if.c @@ -55,6 +55,7 @@ static int ig3rdma_vchnl_init(struct irdma_pci_f *rf, ret = irdma_sc_vchnl_init(&rf->sc_dev, &virt_info); if (ret) { destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); return ret; } @@ -124,7 +125,9 @@ static void ig3rdma_decfg_rf(struct irdma_pci_f *rf) { struct irdma_hw *hw = &rf->hw; + mutex_destroy(&rf->ah_tbl_lock); destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); kfree(hw->io_regs); iounmap(hw->rdma_reg.addr); } @@ -149,6 +152,7 @@ static int ig3rdma_cfg_rf(struct irdma_pci_f *rf, err = ig3rdma_cfg_regions(&rf->hw, cdev_info); if (err) { destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); return err; } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 5fb6014ab8cb..1fbc3592e13d 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -5506,7 +5506,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev) irdma_rt_deinit_hw(iwdev); if (!iwdev->is_vport) { irdma_ctrl_deinit_hw(iwdev->rf); - if (iwdev->rf->vchnl_wq) + if (iwdev->rf->vchnl_wq) { destroy_workqueue(iwdev->rf->vchnl_wq); + mutex_destroy(&iwdev->rf->sc_dev.vchnl_mutex); + } } } From 71d3bdae5eab21cf8991a6f3cd914caa31d5a51f Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Mon, 24 Nov 2025 20:53:47 -0600 Subject: [PATCH 58/65] RDMA/irdma: Do not directly rely on IB_PD_UNSAFE_GLOBAL_RKEY The HW disables bounds checking for MRs with a length of zero, so the driver will only allow a zero length MR if the "all_memory" flag is set, and this flag is only set if IB_PD_UNSAFE_GLOBAL_RKEY is set for the PD. This means that the "get_dma_mr" method will currently fail unless the IB_PD_UNSAFE_GLOBAL_RKEY flag is set. This has not been an issue because the "get_dma_mr" method is only ever invoked if the device does not support the local DMA key or if IB_PD_UNSAFE_GLOBAL_RKEY is set, and so far, all IRDMA HW supports the local DMA lkey. 
However, some new HW does not support the local DMA lkey, so the "get_dma_mr" method needs to work without IB_PD_UNSAFE_GLOBAL_RKEY being set. To support HW that does not allow the local DMA lkey, the logic has been changed to pass an explicit flag to indicate when a dma_mr is being created so that the zero length will be allowed. Also, the "all_memory" flag has been forced to false for normal MR allocation since these MRs are never supposed to provide global unsafe rkey semantics anyway; only the MR created with "get_dma_mr" should support this. Fixes: bb6d73d9add6 ("RDMA/irdma: Prevent zero-length STAG registration") Signed-off-by: Jacob Moroni Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251125025350.180-7-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/cm.c | 2 +- drivers/infiniband/hw/irdma/main.h | 2 +- drivers/infiniband/hw/irdma/verbs.c | 15 ++++++++------- drivers/infiniband/hw/irdma/verbs.h | 3 ++- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index c6a0a661d6e7..f4f4f92ba63a 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -3710,7 +3710,7 @@ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) iwpd = iwqp->iwpd; tagged_offset = (uintptr_t)iwqp->ietf_mem.va; ibmr = irdma_reg_phys_mr(&iwpd->ibpd, iwqp->ietf_mem.pa, buf_len, - IB_ACCESS_LOCAL_WRITE, &tagged_offset); + IB_ACCESS_LOCAL_WRITE, &tagged_offset, false); if (IS_ERR(ibmr)) { ret = -ENOMEM; goto error; diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index f22b1ee20fcc..baab61e424a2 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -556,7 +556,7 @@ void irdma_copy_ip_htonl(__be32 *dst, u32 *src); u16 irdma_get_vlan_ipv4(u32 *addr); void irdma_get_vlan_mac_ipv6(u32 *addr, u16 *vlan_id, u8 *mac); struct ib_mr *irdma_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, - int acc, u64 *iova_start); + int acc, u64 *iova_start, bool dma_mr); int irdma_upload_qp_context(struct irdma_qp *iwqp, bool freeze, bool raw); void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd, diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 1fbc3592e13d..bfbfde16bcb9 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3116,7 +3116,6 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; info->pd_id = iwpd->sc_pd.pd_id; info->total_len = iwmr->len; - info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; info->remote_access = true; cqp_info->cqp_cmd = IRDMA_OP_ALLOC_STAG; cqp_info->post_sq = 1; @@ -3127,7 +3126,7 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, if (status) return status; - iwmr->is_hwreg = 1; + iwmr->is_hwreg = true; return 0; } @@ -3270,7 +3269,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) stag_info->remote_atomics_en = (access & IB_ACCESS_REMOTE_ATOMIC) ? 
1 : 0; stag_info->pd_id = iwpd->sc_pd.pd_id; - stag_info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; + stag_info->all_memory = iwmr->dma_mr; if (stag_info->access_rights & IRDMA_ACCESS_FLAGS_ZERO_BASED) stag_info->addr_type = IRDMA_ADDR_TYPE_ZERO_BASED; else @@ -3297,7 +3296,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request); if (!ret) - iwmr->is_hwreg = 1; + iwmr->is_hwreg = true; return ret; } @@ -3669,7 +3668,7 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr) if (status) return status; - iwmr->is_hwreg = 0; + iwmr->is_hwreg = false; return 0; } @@ -3792,9 +3791,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags, * @size: size of memory to register * @access: Access rights * @iova_start: start of virtual address for physical buffers + * @dma_mr: Flag indicating whether this region is a PD DMA MR */ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access, - u64 *iova_start) + u64 *iova_start, bool dma_mr) { struct irdma_device *iwdev = to_iwdev(pd->device); struct irdma_pbl *iwpbl; @@ -3811,6 +3811,7 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access iwpbl = &iwmr->iwpbl; iwpbl->iwmr = iwmr; iwmr->type = IRDMA_MEMREG_TYPE_MEM; + iwmr->dma_mr = dma_mr; iwpbl->user_base = *iova_start; stag = irdma_create_stag(iwdev); if (!stag) { @@ -3849,7 +3850,7 @@ static struct ib_mr *irdma_get_dma_mr(struct ib_pd *pd, int acc) { u64 kva = 0; - return irdma_reg_phys_mr(pd, 0, 0, acc, &kva); + return irdma_reg_phys_mr(pd, 0, 0, acc, &kva, true); } /** diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index ed21c1b56e8e..6c05179d4a31 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -111,7 +111,8 @@ struct irdma_mr { }; struct ib_umem *region; int access; - u8 is_hwreg; + bool is_hwreg:1; + bool dma_mr:1; u16 type; u32 page_cnt; u64 page_size; From eef3ad030b08c0f100cb18de7f604442a1adb8c7 Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Mon, 24 Nov 2025 20:53:48 -0600 Subject: [PATCH 59/65] RDMA/irdma: Do not set IBK_LOCAL_DMA_LKEY for GEN3+ The GEN3 hardware does not appear to support IBK_LOCAL_DMA_LKEY. Attempts to use it will result in an AE. 
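The fix is capability gating by hardware generation; a sketch of the decision (IRDMA_GEN_3, hw_rev, and IBK_LOCAL_DMA_LKEY are the real identifiers, the helper itself is hypothetical):

        static u64 example_kernel_caps(u8 hw_rev)
        {
                /*
                 * GEN3 raises an asynchronous error event if the local DMA
                 * lkey is used, so only advertise it on earlier generations.
                 */
                return hw_rev < IRDMA_GEN_3 ? IBK_LOCAL_DMA_LKEY : 0;
        }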
Fixes: eb31dfc2b41a ("RDMA/irdma: Restrict Memory Window and CQE Timestamping to GEN3") Signed-off-by: Jacob Moroni Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251125025350.180-8-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index bfbfde16bcb9..9093d3e962ae 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -27,7 +27,8 @@ static int irdma_query_device(struct ib_device *ibdev, irdma_fw_minor_ver(&rf->sc_dev); props->device_cap_flags = IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_MGT_EXTENSIONS; - props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; + if (hw_attrs->uk_attrs.hw_rev < IRDMA_GEN_3) + props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; props->vendor_id = pcidev->vendor; props->vendor_part_id = pcidev->device; From 62356fccb195f83d2ceafc787c5ba87ebbe5edfe Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Mon, 24 Nov 2025 20:53:49 -0600 Subject: [PATCH 60/65] RDMA/irdma: Remove doorbell elision logic In some cases, this logic can result in doorbell writes being skipped when they should not have been (at least on GEN3 HW), so remove it. This also means that the mb() can be safely downgraded to dma_wmb(). Fixes: 551c46edc769 ("RDMA/irdma: Add user/kernel shared libraries") Signed-off-by: Jacob Moroni Signed-off-by: Tatyana Nikolova Link: https://patch.msgid.link/20251125025350.180-9-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/puda.c | 1 - drivers/infiniband/hw/irdma/uk.c | 31 ++---------------------------- drivers/infiniband/hw/irdma/user.h | 1 - 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 3c29e2289322..cee47ddbd1b5 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -685,7 +685,6 @@ static int irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc) ukqp->rq_size = rsrc->rq_size; IRDMA_RING_INIT(ukqp->sq_ring, ukqp->sq_size); - IRDMA_RING_INIT(ukqp->initial_ring, ukqp->sq_size); IRDMA_RING_INIT(ukqp->rq_ring, ukqp->rq_size); ukqp->wqe_alloc_db = qp->pd->dev->wqe_alloc_db; diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 8a463b3d4c83..f0846b800913 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -114,33 +114,8 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx) */ void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp) { - u64 temp; - u32 hw_sq_tail; - u32 sw_sq_head; - - /* valid bit is written and loads completed before reading shadow */ - mb(); - - /* read the doorbell shadow area */ - get_64bit_val(qp->shadow_area, 0, &temp); - - hw_sq_tail = (u32)FIELD_GET(IRDMA_QP_DBSA_HW_SQ_TAIL, temp); - sw_sq_head = IRDMA_RING_CURRENT_HEAD(qp->sq_ring); - if (sw_sq_head != qp->initial_ring.head) { - if (sw_sq_head != hw_sq_tail) { - if (sw_sq_head > qp->initial_ring.head) { - if (hw_sq_tail >= qp->initial_ring.head && - hw_sq_tail < sw_sq_head) - writel(qp->qp_id, qp->wqe_alloc_db); - } else { - if (hw_sq_tail >= qp->initial_ring.head || - hw_sq_tail < sw_sq_head) - writel(qp->qp_id, qp->wqe_alloc_db); - } - } - } - - qp->initial_ring.head = qp->sq_ring.head; + dma_wmb(); + writel(qp->qp_id, qp->wqe_alloc_db); } /** @@ -1606,7 +1581,6 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp, 
qp->conn_wqes = move_cnt; IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->sq_ring, move_cnt); IRDMA_RING_MOVE_TAIL_BY_COUNT(qp->sq_ring, move_cnt); - IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->initial_ring, move_cnt); } /** @@ -1751,7 +1725,6 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info) qp->max_sq_frag_cnt = info->max_sq_frag_cnt; sq_ring_size = qp->sq_size << info->sq_shift; IRDMA_RING_INIT(qp->sq_ring, sq_ring_size); - IRDMA_RING_INIT(qp->initial_ring, sq_ring_size); if (info->first_sq_wq) { irdma_setup_connection_wqes(qp, info); qp->swqe_polarity = 1; diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 3a27cce3c3db..9eb7fd0b1cbf 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -457,7 +457,6 @@ struct irdma_srq_uk { struct irdma_uk_attrs *uk_attrs; __le64 *shadow_area; struct irdma_ring srq_ring; - struct irdma_ring initial_ring; u32 srq_id; u32 srq_size; u32 max_srq_frag_cnt; From 01dad9ca37c60d08f71e2ef639875ae895deede6 Mon Sep 17 00:00:00 2001 From: Jijun Wang Date: Mon, 24 Nov 2025 20:53:50 -0600 Subject: [PATCH 61/65] RDMA/irdma: Fix SRQ shadow area address initialization Set srq_pa before deriving shadow_area_pa from it; previously shadow_area_pa was computed before srq_pa had been assigned. Fixes: 563e1feb5f6e ("RDMA/irdma: Add SRQ support") Signed-off-by: Jijun Wang Signed-off-by: Jay Bhat Link: https://patch.msgid.link/20251125025350.180-10-tatyana.e.nikolova@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 9093d3e962ae..435da34dfaa7 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2311,8 +2311,8 @@ static int irdma_setup_kmode_srq(struct irdma_device *iwdev, ukinfo->srq_size = depth >> shift; ukinfo->shadow_area = mem->va + ring_size; - info->shadow_area_pa = info->srq_pa + ring_size; info->srq_pa = mem->pa; + info->shadow_area_pa = info->srq_pa + ring_size; return 0; } From f37e2868792335f2e8bbdcc02ebbb4830453f83c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 26 Nov 2025 10:51:47 +0800 Subject: [PATCH 62/65] RDMA/core: Reduce cond_resched() frequency in __ib_umem_release The current implementation calls cond_resched() for every SG entry in __ib_umem_release(), which adds needless overhead. This patch introduces RESCHED_LOOP_CNT_THRESHOLD (0x1000) to limit how often cond_resched() is called. The function now yields the CPU at most once every 4096 iterations and skips the yield on the very first iteration, so that releasing many small umems avoids the scheduling overhead entirely. 
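Because the threshold is a power of two, the modulo below can compile down to a mask, and the 'i &&' term is what suppresses the yield on the first iteration. A sketch of the resulting loop shape (do_unpin() stands in for the real per-entry work):

        #define RESCHED_LOOP_CNT_THRESHOLD 0x1000       /* 4096 */

        for (i = 0; i < nents; i++) {
                do_unpin(i);
                if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD))
                        cond_resched();         /* at most every 4096 entries */
        }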
Fixes: d056bc45b62b ("RDMA/core: Prevent soft lockup during large user memory region cleanup") Signed-off-by: Li RongQing Link: https://patch.msgid.link/20251126025147.2627-1-lirongqing@baidu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 8fd84aa37289..8137031c2a65 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -45,6 +45,8 @@ #include "uverbs.h" +#define RESCHED_LOOP_CNT_THRESHOLD 0x1000 + static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { bool make_dirty = umem->writable && dirty; @@ -58,7 +60,9 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) { unpin_user_page_range_dirty_lock(sg_page(sg), DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); - cond_resched(); + + if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD)) + cond_resched(); } sg_free_append_table(&umem->sgt_append); From 155c9971fa88a6655431476f024566fcd9ddcdb6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 26 Nov 2025 09:57:37 +0200 Subject: [PATCH 63/65] RDMA/bng_re: Remove prefetch instruction The prefetch instruction is meant to speed up access to memory referenced by its address argument. In the bng_re code path, it has no effect, because the pointer refers to a function that is executed immediately afterward. The issue was identified due to the following kbuild compilation error: drivers/infiniband/hw/bng_re/bng_fw.c: In function 'bng_re_creq_irq': drivers/infiniband/hw/bng_re/bng_fw.c:278:9: error: implicit declaration of function 'prefetch' [-Wimplicit-function-declaration] 278 | prefetch(bng_re_get_qe(hwq, sw_cons, NULL)); | ^~~~~~~~ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511260607.Kuxn4NnN-lkp@intel.com/ Fixes: 4f830cd8d7fe ("RDMA/bng_re: Add infrastructure for enabling Firmware channel") Link: https://patch.msgid.link/20251126-remove-prefetch-v1-1-fcac22007ea7@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/bng_fw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c index 803610fb9c58..7d9539113cf5 100644 --- a/drivers/infiniband/hw/bng_re/bng_fw.c +++ b/drivers/infiniband/hw/bng_re/bng_fw.c @@ -525,7 +525,7 @@ static irqreturn_t bng_re_creq_irq(int irq, void *dev_instance) hwq = &creq->hwq; /* Prefetch the CREQ element */ sw_cons = HWQ_CMP(hwq->cons, hwq); - prefetch(bng_re_get_qe(hwq, sw_cons, NULL)); + bng_re_get_qe(hwq, sw_cons, NULL); tasklet_schedule(&creq->creq_tasklet); return IRQ_HANDLED; From 3a2c32d357db8d8d78f59359374d95633313d9de Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 26 Nov 2025 16:08:42 +0100 Subject: [PATCH 64/65] RDMA/siw: reclassify sockets in order to avoid false positives from lockdep While developing IPPROTO_SMBDIRECT support for the code under fs/smb/common/smbdirect [1], I noticed false positives like this: [T79] ====================================================== [T79] WARNING: possible circular locking dependency detected [T79] 6.18.0-rc4-metze-kasan-lockdep.01+ #1 Tainted: G OE [T79] ------------------------------------------------------ [T79] kworker/2:0/79 is trying to acquire lock: [T79] ffff88801f968278 (sk_lock-AF_INET){+.+.}-{0:0}, at: sock_set_reuseaddr+0x14/0x70 [T79] but task is already 
holding lock: [T79] ffffffffc10f7230 (lock#9){+.+.}-{4:4}, at: rdma_listen+0x3d2/0x740 [rdma_cm] [T79] which lock already depends on the new lock. [T79] the existing dependency chain (in reverse order) is: [T79] -> #1 (lock#9){+.+.}-{4:4}: [T79] __lock_acquire+0x535/0xc30 [T79] lock_acquire.part.0+0xb3/0x240 [T79] lock_acquire+0x60/0x140 [T79] __mutex_lock+0x1af/0x1c10 [T79] mutex_lock_nested+0x1b/0x30 [T79] cma_get_port+0xba/0x7d0 [rdma_cm] [T79] rdma_bind_addr_dst+0x598/0x9a0 [rdma_cm] [T79] cma_bind_addr+0x107/0x320 [rdma_cm] [T79] rdma_resolve_addr+0xa3/0x830 [rdma_cm] [T79] destroy_lease_table+0x12b/0x420 [ksmbd] [T79] ksmbd_NTtimeToUnix+0x3e/0x80 [ksmbd] [T79] ndr_encode_posix_acl+0x6e9/0xab0 [ksmbd] [T79] ndr_encode_v4_ntacl+0x53/0x870 [ksmbd] [T79] __sys_connect_file+0x131/0x1c0 [T79] __sys_connect+0x111/0x140 [T79] __x64_sys_connect+0x72/0xc0 [T79] x64_sys_call+0xe7d/0x26a0 [T79] do_syscall_64+0x93/0xff0 [T79] entry_SYSCALL_64_after_hwframe+0x76/0x7e [T79] -> #0 (sk_lock-AF_INET){+.+.}-{0:0}: [T79] check_prev_add+0xf3/0xcd0 [T79] validate_chain+0x466/0x590 [T79] __lock_acquire+0x535/0xc30 [T79] lock_acquire.part.0+0xb3/0x240 [T79] lock_acquire+0x60/0x140 [T79] lock_sock_nested+0x3b/0xf0 [T79] sock_set_reuseaddr+0x14/0x70 [T79] siw_create_listen+0x145/0x1540 [siw] [T79] iw_cm_listen+0x313/0x5b0 [iw_cm] [T79] cma_iw_listen+0x271/0x3c0 [rdma_cm] [T79] rdma_listen+0x3b1/0x740 [rdma_cm] [T79] cma_listen_on_dev+0x46a/0x750 [rdma_cm] [T79] rdma_listen+0x4b0/0x740 [rdma_cm] [T79] ksmbd_rdma_init+0x12b/0x270 [ksmbd] [T79] ksmbd_conn_transport_init+0x26/0x70 [ksmbd] [T79] server_ctrl_handle_work+0x1e5/0x280 [ksmbd] [T79] process_one_work+0x86c/0x1930 [T79] worker_thread+0x6f0/0x11f0 [T79] kthread+0x3ec/0x8b0 [T79] ret_from_fork+0x314/0x400 [T79] ret_from_fork_asm+0x1a/0x30 [T79] other info that might help us debug this: [T79] Possible unsafe locking scenario: [T79] CPU0 CPU1 [T79] ---- ---- [T79] lock(lock#9); [T79] lock(sk_lock-AF_INET); [T79] lock(lock#9); [T79] lock(sk_lock-AF_INET); [T79] *** DEADLOCK *** [T79] 5 locks held by kworker/2:0/79: [T79] #0: ffff88800120b158 ((wq_completion)events_long){+.+.}-{0:0}, at: process_one_work+0xfca/0x1930 [T79] #1: ffffc9000474fd00 ((work_completion)(&ctrl->ctrl_work)) {+.+.}-{0:0}, at: process_one_work+0x804/0x1930 [T79] #2: ffffffffc11307d0 (ctrl_lock){+.+.}-{4:4}, at: server_ctrl_handle_work+0x21/0x280 [ksmbd] [T79] #3: ffffffffc11347b0 (init_lock){+.+.}-{4:4}, at: ksmbd_conn_transport_init+0x18/0x70 [ksmbd] [T79] #4: ffffffffc10f7230 (lock#9){+.+.}-{4:4}, at: rdma_listen+0x3d2/0x740 [rdma_cm] [T79] stack backtrace: [T79] CPU: 2 UID: 0 PID: 79 Comm: kworker/2:0 Kdump: loaded Tainted: G OE 6.18.0-rc4-metze-kasan-lockdep.01+ #1 PREEMPT(voluntary) [T79] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [T79] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [T79] Workqueue: events_long server_ctrl_handle_work [ksmbd] ... [T79] print_circular_bug+0xfd/0x130 [T79] check_noncircular+0x150/0x170 [T79] check_prev_add+0xf3/0xcd0 [T79] validate_chain+0x466/0x590 [T79] __lock_acquire+0x535/0xc30 [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] lock_acquire.part.0+0xb3/0x240 [T79] ? sock_set_reuseaddr+0x14/0x70 [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] ? __kasan_check_write+0x14/0x30 [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] ? apparmor_socket_post_create+0x180/0x700 [T79] lock_acquire+0x60/0x140 [T79] ? sock_set_reuseaddr+0x14/0x70 [T79] lock_sock_nested+0x3b/0xf0 [T79] ? 
sock_set_reuseaddr+0x14/0x70 [T79] sock_set_reuseaddr+0x14/0x70 [T79] siw_create_listen+0x145/0x1540 [siw] [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] ? local_clock_noinstr+0xe/0xd0 [T79] ? __pfx_siw_create_listen+0x10/0x10 [siw] [T79] ? trace_preempt_on+0x4c/0x130 [T79] ? __raw_spin_unlock_irqrestore+0x4a/0x90 [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] ? preempt_count_sub+0x52/0x80 [T79] iw_cm_listen+0x313/0x5b0 [iw_cm] [T79] cma_iw_listen+0x271/0x3c0 [rdma_cm] [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] rdma_listen+0x3b1/0x740 [rdma_cm] [T79] ? _raw_spin_unlock+0x2c/0x60 [T79] ? __pfx_rdma_listen+0x10/0x10 [rdma_cm] [T79] ? rdma_restrack_add+0x12c/0x630 [ib_core] [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] cma_listen_on_dev+0x46a/0x750 [rdma_cm] [T79] rdma_listen+0x4b0/0x740 [rdma_cm] [T79] ? __pfx_rdma_listen+0x10/0x10 [rdma_cm] [T79] ? cma_get_port+0x30d/0x7d0 [rdma_cm] [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] ? rdma_bind_addr_dst+0x598/0x9a0 [rdma_cm] [T79] ksmbd_rdma_init+0x12b/0x270 [ksmbd] [T79] ? __pfx_ksmbd_rdma_init+0x10/0x10 [ksmbd] [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] ? register_netdevice_notifier+0x1dc/0x240 [T79] ksmbd_conn_transport_init+0x26/0x70 [ksmbd] [T79] server_ctrl_handle_work+0x1e5/0x280 [ksmbd] [T79] process_one_work+0x86c/0x1930 [T79] ? __pfx_process_one_work+0x10/0x10 [T79] ? srso_alias_return_thunk+0x5/0xfbef5 [T79] ? assign_work+0x16f/0x280 [T79] worker_thread+0x6f0/0x11f0 I was not able to reproduce this; I was testing with various runs switching between siw, rxe, and IPPROTO_SMBDIRECT sockets, while the above stack used siw without the IPPROTO_SMBDIRECT patches [1]. Even if this patch doesn't solve the above, I think it's a good idea to reclassify the sockets used by siw. I am also sending patches for rxe to do the same, and my IPPROTO_SMBDIRECT socket patches [1] will do it as well; together this should minimize potential false positives. [1] https://git.samba.org/?p=metze/linux/wip.git;a=shortlog;h=refs/heads/master-ipproto-smbdirect Cc: Bernard Metzler Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: linux-rdma@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-cifs@vger.kernel.org Signed-off-by: Stefan Metzmacher Link: https://patch.msgid.link/20251126150842.1837072-1-metze@samba.org Acked-by: Bernard Metzler Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw_cm.c | 51 ++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 708b13993fdf..79dd44cd6bc3 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -39,6 +39,55 @@ static void siw_cm_llp_error_report(struct sock *s); static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * lockdep can detect false positive circular dependencies + * when there are user-space socket API users or in kernel + * users switching between a tcp and rdma transport. + * Maybe also switching between siw and rxe may cause + * problems as per default sockets are only classified + * by family and not by ip protocol. And there might + * be different locks used between the application + * and the low level sockets. + * + * Problems were seen with ksmbd.ko and cifs.ko, + * switching transports, use git blame to find + * more details. 
+ */ +static struct lock_class_key siw_sk_key[2]; +static struct lock_class_key siw_slock_key[2]; +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + +static inline void siw_reclassify_socket(struct socket *sock) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, + "slock-AF_INET-RDMA-SIW", + &siw_slock_key[0], + "sk_lock-AF_INET-RDMA-SIW", + &siw_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, + "slock-AF_INET6-RDMA-SIW", + &siw_slock_key[1], + "sk_lock-AF_INET6-RDMA-SIW", + &siw_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ +} + static void siw_sk_assign_cm_upcalls(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); @@ -1394,6 +1443,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) goto error; + siw_reclassify_socket(s); /* * NOTE: For simplification, connect() is called in blocking @@ -1770,6 +1820,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) return rv; + siw_reclassify_socket(s); /* * Allow binding local port when still in TIME_WAIT from last close. From 80a85a771deb113cfe2e295fb9e84467a43ebfe4 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 27 Nov 2025 11:56:14 +0100 Subject: [PATCH 65/65] RDMA/rxe: reclassify sockets in order to avoid false positives from lockdep While developing IPPROTO_SMBDIRECT support for the code under fs/smb/common/smbdirect [1], I noticed false positives like this: [+0,003927] ============================================ [+0,000532] WARNING: possible recursive locking detected [+0,000611] 6.18.0-rc5-metze-kasan-lockdep.02+ #1 Tainted: G OE [+0,000835] -------------------------------------------- [+0,000729] ksmbd:r5445/3609 is trying to acquire lock: [+0,000709] ffff88800b9570f8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: inet_shutdown+0x52/0x360 [+0,000831] but task is already holding lock: [+0,000684] ffff88800654af78 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: smbdirect_sk_close+0x122/0x790 [smbdirect] [+0,000928] other info that might help us debug this: [+0,005552] Possible unsafe locking scenario: [+0,000723] CPU0 [+0,000359] ---- [+0,000377] lock(k-sk_lock-AF_INET); [+0,000478] lock(k-sk_lock-AF_INET); [+0,000498] *** DEADLOCK *** [+0,001012] May be due to missing lock nesting notation [+0,000831] 3 locks held by ksmbd:r5445/3609: [+0,000484] #0: ffff88800654af78 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: smbdirect_sk_close+0x122/0x790 [smbdirect] [+0,001000] #1: ffff888020a40458 (&id_priv->handler_mutex){+.+.}-{4:4}, at: rdma_lock_handler+0x17/0x30 [rdma_cm] [+0,000982] #2: ffff888020a40350 (&id_priv->qp_mutex){+.+.}-{4:4}, at: rdma_destroy_qp+0x5d/0x1f0 [rdma_cm] [+0,000934] stack backtrace: [+0,000589] CPU: 0 UID: 0 PID: 3609 Comm: ksmbd:r5445 Kdump: loaded Tainted: G OE 6.18.0-rc5-metze-kasan-lockdep.02+ #1 PREEMPT(voluntary) [+0,000023] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [+0,000004] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 ... [+0,000010] print_deadlock_bug+0x245/0x330 [+0,000014] validate_chain+0x32a/0x590 [+0,000012] __lock_acquire+0x535/0xc30 [+0,000013] lock_acquire.part.0+0xb3/0x240 [+0,000017] ? inet_shutdown+0x52/0x360 [+0,000013] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000007] ? 
mark_held_locks+0x46/0x90 [+0,000012] lock_acquire+0x60/0x140 [+0,000006] ? inet_shutdown+0x52/0x360 [+0,000028] lock_sock_nested+0x3b/0xf0 [+0,000009] ? inet_shutdown+0x52/0x360 [+0,000008] inet_shutdown+0x52/0x360 [+0,000010] kernel_sock_shutdown+0x5b/0x90 [+0,000011] rxe_qp_do_cleanup+0x4ef/0x810 [rdma_rxe] [+0,000043] ? __pfx_rxe_qp_do_cleanup+0x10/0x10 [rdma_rxe] [+0,000030] execute_in_process_context+0x2b/0x170 [+0,000013] rxe_qp_cleanup+0x1c/0x30 [rdma_rxe] [+0,000021] __rxe_cleanup+0x1cf/0x2e0 [rdma_rxe] [+0,000036] ? __pfx___rxe_cleanup+0x10/0x10 [rdma_rxe] [+0,000020] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000006] ? __kasan_check_read+0x11/0x20 [+0,000012] rxe_destroy_qp+0xe1/0x230 [rdma_rxe] [+0,000035] ib_destroy_qp_user+0x217/0x450 [ib_core] [+0,000074] rdma_destroy_qp+0x83/0x1f0 [rdma_cm] [+0,000034] smbdirect_connection_destroy_qp+0x98/0x2e0 [smbdirect] [+0,000017] ? __pfx_smb_direct_logging_needed+0x10/0x10 [ksmbd] [+0,000044] smbdirect_connection_destroy+0x698/0xed0 [smbdirect] [+0,000023] ? __pfx_smbdirect_connection_destroy+0x10/0x10 [smbdirect] [+0,000033] ? __pfx_smb_direct_logging_needed+0x10/0x10 [ksmbd] [+0,000031] smbdirect_connection_destroy_sync+0x42b/0x9f0 [smbdirect] [+0,000029] ? mark_held_locks+0x46/0x90 [+0,000012] ? __pfx_smbdirect_connection_destroy_sync+0x10/0x10 [smbdirect] [+0,000019] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000007] ? trace_hardirqs_on+0x64/0x70 [+0,000029] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000010] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000006] ? __smbdirect_connection_schedule_disconnect+0x339/0x4b0 [+0,000021] smbdirect_sk_destroy+0xb0/0x680 [smbdirect] [+0,000024] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000006] ? trace_hardirqs_on+0x64/0x70 [+0,000006] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000005] ? __local_bh_enable_ip+0xba/0x150 [+0,000011] sk_common_release+0x66/0x340 [+0,000010] smbdirect_sk_close+0x12a/0x790 [smbdirect] [+0,000023] ? ip_mc_drop_socket+0x1e/0x240 [+0,000013] inet_release+0x10a/0x240 [+0,000011] smbdirect_sock_release+0x502/0xe80 [smbdirect] [+0,000015] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000024] sock_release+0x91/0x1c0 [+0,000010] smb_direct_free_transport+0x31/0x50 [ksmbd] [+0,000025] ksmbd_conn_free+0x1d0/0x240 [ksmbd] [+0,000040] smb_direct_disconnect+0xb2/0x120 [ksmbd] [+0,000023] ? srso_alias_return_thunk+0x5/0xfbef5 [+0,000018] ksmbd_conn_handler_loop+0x94e/0xf10 [ksmbd] ... I'll also add reclassification to the smbdirect socket code [1], but I think it's better to have it in both directions (below and above the RDMA layer). 
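The mechanism used in the diff below is the standard lockdep one: give kernel-created sockets their own lock classes so their dependency chains do not alias ordinary AF_INET/AF_INET6 sockets. Stripped of the driver specifics, the pattern looks like this (names are placeholders):

        static struct lock_class_key my_sk_key;
        static struct lock_class_key my_slock_key;

        static void my_reclassify(struct sock *sk)
        {
                /*
                 * Must run while reclassification is still allowed,
                 * i.e. before the socket is in use.
                 */
                sock_lock_init_class_and_name(sk,
                                              "slock-AF_INET-MYPROTO",
                                              &my_slock_key,
                                              "sk_lock-AF_INET-MYPROTO",
                                              &my_sk_key);
        }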
[1] https://git.samba.org/?p=metze/linux/wip.git;a=shortlog;h=refs/heads/master-ipproto-smbdirect Cc: Zhu Yanjun Reviewed-by: Zhu Yanjun Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: linux-rdma@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-cifs@vger.kernel.org Signed-off-by: Stefan Metzmacher Link: https://patch.msgid.link/20251127105614.2040922-1-metze@samba.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_net.c | 49 +++++++++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_qp.c | 49 +++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index ac0183a2ff7a..0195d361e5e3 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -20,6 +20,54 @@ static struct rxe_recv_sockets recv_sockets; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * lockdep can detect false positive circular dependencies + * when there are user-space socket API users or in kernel + * users switching between a tcp and rdma transport. + * Maybe also switching between siw and rxe may cause + * problems as per default sockets are only classified + * by family and not by ip protocol. And there might + * be different locks used between the application + * and the low level sockets. + * + * Problems were seen with ksmbd.ko and cifs.ko, + * switching transports, use git blame to find + * more details. + */ +static struct lock_class_key rxe_recv_sk_key[2]; +static struct lock_class_key rxe_recv_slock_key[2]; +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + +static inline void rxe_reclassify_recv_socket(struct socket *sock) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, + "slock-AF_INET-RDMA-RXE-RECV", + &rxe_recv_slock_key[0], + "sk_lock-AF_INET-RDMA-RXE-RECV", + &rxe_recv_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, + "slock-AF_INET6-RDMA-RXE-RECV", + &rxe_recv_slock_key[1], + "sk_lock-AF_INET6-RDMA-RXE-RECV", + &rxe_recv_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ +} + static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, struct net_device *ndev, struct in_addr *saddr, @@ -192,6 +240,7 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, err = udp_sock_create(net, &udp_cfg, &sock); if (err < 0) return ERR_PTR(err); + rxe_reclassify_recv_socket(sock); tnl_cfg.encap_type = 1; tnl_cfg.encap_rcv = rxe_udp_encap_recv; diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 95f1c1c2949d..845bdd03ca28 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -15,6 +15,54 @@ #include "rxe_queue.h" #include "rxe_task.h" +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * lockdep can detect false positive circular dependencies + * when there are user-space socket API users or in kernel + * users switching between a tcp and rdma transport. + * Maybe also switching between siw and rxe may cause + * problems as per default sockets are only classified + * by family and not by ip protocol. And there might + * be different locks used between the application + * and the low level sockets. + * + * Problems were seen with ksmbd.ko and cifs.ko, + * switching transports, use git blame to find + * more details. 
+ */ +static struct lock_class_key rxe_send_sk_key[2]; +static struct lock_class_key rxe_send_slock_key[2]; +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + +static inline void rxe_reclassify_send_socket(struct socket *sock) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, + "slock-AF_INET-RDMA-RXE-SEND", + &rxe_send_slock_key[0], + "sk_lock-AF_INET-RDMA-RXE-SEND", + &rxe_send_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, + "slock-AF_INET6-RDMA-RXE-SEND", + &rxe_send_slock_key[1], + "sk_lock-AF_INET6-RDMA-RXE-SEND", + &rxe_send_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ +} + static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, int has_srq) { @@ -244,6 +292,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); if (err < 0) return err; + rxe_reclassify_send_socket(qp->sk); qp->sk->sk->sk_user_data = qp; /* pick a source UDP port number for this QP based on