devlink: Make health reporter burst period configurable

Enable configuration of the burst period — a time window starting
from the first error recovery, during which the reporter allows
recovery attempts for each reported error.

This feature is helpful when a single underlying issue causes multiple
errors, as it delays the start of the grace period to allow sufficient
time for recovering all related errors. For example, if multiple TX
queues time out simultaneously, a sufficient burst period could allow
all affected TX queues to be recovered within that window. Without this
period, only the first TX queue that reports a timeout will undergo
recovery, while the remaining TX queues will be blocked once the grace
period begins.

Configuration example:
$ devlink health set pci/0000:00:09.0 reporter tx burst_period 500

Configuration example with ynl:
./tools/net/ynl/pyynl/cli.py \
 --spec Documentation/netlink/specs/devlink.yaml \
 --do health-reporter-set --json '{
  "bus-name": "auxiliary",
  "dev-name": "mlx5_core.eth.0",
  "port-index": 65535,
  "health-reporter-name": "tx",
  "health-reporter-burst-period": 500
}'

Signed-off-by: Shahar Shitrit <shshitrit@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250824084354.533182-5-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Shahar Shitrit 2025-08-24 11:43:53 +03:00 committed by Jakub Kicinski
parent 6a06d8c405
commit da0e219764
5 changed files with 39 additions and 5 deletions

View File

@ -853,6 +853,10 @@ attribute-sets:
type: nest
multi-attr: true
nested-attributes: dl-rate-tc-bws
-
name: health-reporter-burst-period
type: u64
doc: Time (in msec) for recoveries before starting the grace period.
-
name: dl-dev-stats
subset-of: devlink
@ -1216,6 +1220,8 @@ attribute-sets:
name: health-reporter-dump-ts-ns
-
name: health-reporter-auto-dump
-
name: health-reporter-burst-period
-
name: dl-attr-stats
@ -1961,6 +1967,7 @@ operations:
- health-reporter-graceful-period
- health-reporter-auto-recover
- health-reporter-auto-dump
- health-reporter-burst-period
-
name: health-reporter-recover

View File

@ -50,7 +50,7 @@ Once an error is reported, devlink health will perform the following actions:
* Auto recovery attempt is being done. Depends on:
- Auto-recovery configuration
- Grace period vs. time passed since last recover
- Grace period (and burst period) vs. time passed since last recover
Devlink formatted message
=========================

View File

@ -636,6 +636,8 @@ enum devlink_attr {
DEVLINK_ATTR_RATE_TC_BWS, /* nested */
DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, /* u64 */
/* Add new attributes above here, update the spec in
* Documentation/netlink/specs/devlink.yaml and re-generate
* net/devlink/netlink_gen.c.

View File

@ -116,6 +116,9 @@ __devlink_health_reporter_create(struct devlink *devlink,
if (WARN_ON(ops->default_graceful_period && !ops->recover))
return ERR_PTR(-EINVAL);
if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period))
return ERR_PTR(-EINVAL);
reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
if (!reporter)
return ERR_PTR(-ENOMEM);
@ -293,6 +296,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
reporter->graceful_period))
goto reporter_nest_cancel;
if (reporter->ops->recover &&
devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
reporter->burst_period))
goto reporter_nest_cancel;
if (reporter->ops->recover &&
nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
reporter->auto_recover))
@ -458,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
if (!reporter->ops->recover &&
(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] ||
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]))
return -EOPNOTSUPP;
if (!reporter->ops->dump &&
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
return -EOPNOTSUPP;
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) {
reporter->graceful_period =
nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
if (!reporter->graceful_period)
reporter->burst_period = 0;
}
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) {
u64 burst_period =
nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]);
if (!reporter->graceful_period && burst_period) {
NL_SET_ERR_MSG_MOD(info->extack,
"Cannot set burst period without a grace period.");
return -EINVAL;
}
reporter->burst_period = burst_period;
}
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
reporter->auto_recover =

View File

@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN
};
/* DEVLINK_CMD_HEALTH_REPORTER_SET - do */
static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = {
static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT
[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, },
[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, },
};
/* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */
@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_health_reporter_set_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_health_reporter_set_nl_policy,
.maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
.maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{