accel/qaic: Implement basic SSR handling

Subsystem restart (SSR) for a qaic device means that an NSP has crashed
and will be restarted.  However, the restart process loses any state
associated with activation, so the user will need to perform some recovery.

While SSR has a provision for collecting a crash dump, this patch does not
implement support for it.
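
For illustration, a hypothetical user-space recovery flow. This is not part
of the patch; it assumes the existing qaic uAPI from
include/uapi/drm/qaic_accel.h, and reactivate_workload() is a placeholder
for the application's own activation sequence:

#include <errno.h>
#include <sys/ioctl.h>
#include "qaic_accel.h"		/* include/uapi/drm/qaic_accel.h */

/* Application-specific re-activation of the workload; placeholder. */
extern int reactivate_workload(int fd);

static int submit_with_ssr_recovery(int fd, struct qaic_execute *exec)
{
	int ret = ioctl(fd, DRM_IOCTL_QAIC_EXECUTE_BO, exec);

	if (ret == -1 && errno == EPIPE) {
		/*
		 * EPIPE means the DBC is under SSR; in-process and queued
		 * inputs are lost. Redo the activation sequence and
		 * resubmit once the DBC is operational again.
		 */
		if (reactivate_workload(fd))
			return -EPIPE;
		ret = ioctl(fd, DRM_IOCTL_QAIC_EXECUTE_BO, exec);
	}

	return ret == -1 ? -errno : 0;
}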

Co-developed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Co-developed-by: Pranjal Ramajor Asha Kanojiya <quic_pkanojiy@quicinc.com>
Signed-off-by: Pranjal Ramajor Asha Kanojiya <quic_pkanojiy@quicinc.com>
Co-developed-by: Troy Hanson <quic_thanson@quicinc.com>
Signed-off-by: Troy Hanson <quic_thanson@quicinc.com>
Co-developed-by: Aswin Venkatesan <aswivenk@qti.qualcomm.com>
Signed-off-by: Aswin Venkatesan <aswivenk@qti.qualcomm.com>
Signed-off-by: Jeffrey Hugo <jhugo@codeaurora.org>
Signed-off-by: Youssef Samir <youssef.abdulrahman@oss.qualcomm.com>
Signed-off-by: Zack McKevitt <zachary.mckevitt@oss.qualcomm.com>
Reviewed-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
[jhugo: Fix minor checkpatch whitespace issues]
Signed-off-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
Link: https://patch.msgid.link/20251031174059.2814445-3-zachary.mckevitt@oss.qualcomm.com
commit 9675093ace (parent f286066ed9)
Jeffrey Hugo, 2025-10-31 10:41:02 -07:00, committed by Jeff Hugo
7 files changed, 381 insertions(+), 9 deletions(-)

Documentation/accel/qaic/aic100.rst

@@ -487,8 +487,8 @@ one user crashes, the fallout of that should be limited to that workload and not
impact other workloads. SSR accomplishes this.
If a particular workload crashes, QSM notifies the host via the QAIC_SSR MHI
-channel. This notification identifies the workload by it's assigned DBC. A
-multi-stage recovery process is then used to cleanup both sides, and get the
+channel. This notification identifies the workload by its assigned DBC. A
+multi-stage recovery process is then used to cleanup both sides, and gets the
DBC/NSPs into a working state.
When SSR occurs, any state in the workload is lost. Any inputs that were in
@@ -496,6 +496,26 @@ process, or queued but not yet serviced, are lost. The loaded artifacts will
remain in on-card DDR, but the host will need to re-activate the workload if
it desires to recover the workload.
When SSR occurs for a specific NSP, the assigned DBC goes through the
following state transitions, in order (sketched as an enum after the list):
DBC_STATE_BEFORE_SHUTDOWN
Indicates that the affected NSP was found in an unrecoverable error
condition.
DBC_STATE_AFTER_SHUTDOWN
Indicates that the NSP is under reset.
DBC_STATE_BEFORE_POWER_UP
Indicates that the NSP's debug information has been gathered and is
ready to be collected by the host (if desired). At this stage the NSP
is restarted by QSM.
DBC_STATE_AFTER_POWER_UP
Indicates that the NSP has been restarted, is fully operational, and is
in the idle state.
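
These states correspond to per-DBC state values that the host mirrors via
set_dbc_state(). As a sketch (the real enum lives in qaic.h and is not shown
in this excerpt; the exact definition and ordering are assumed):

enum dbc_states {
	DBC_STATE_IDLE,			/* NSP operational, nothing queued */
	DBC_STATE_BEFORE_SHUTDOWN,	/* unrecoverable error detected */
	DBC_STATE_AFTER_SHUTDOWN,	/* NSP under reset */
	DBC_STATE_BEFORE_POWER_UP,	/* debug info gathered, restart pending */
	DBC_STATE_AFTER_POWER_UP,	/* NSP restarted, idle again */
};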
SSR also has an optional crashdump collection feature. If enabled, the host can
collect the memory dump of the crashed NSP and expose it to user space via
the dev_coredump subsystem. The host can also decline the crashdump collection
request from the device.
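
Crashdump collection is not implemented by this patch, but the plumbing a
future handler would need is small. A minimal sketch, assuming a fully
collected, vmalloc'd dump buffer (the helper name is illustrative):

#include <linux/devcoredump.h>

/*
 * Illustrative only: publish a collected SSR dump to user space. The
 * devcoredump subsystem takes ownership of the vmalloc'd buffer and
 * frees it once the dump is read or times out.
 */
static void qaic_ssr_publish_dump(struct qaic_device *qdev, void *dump, size_t len)
{
	dev_coredumpv(&qdev->pdev->dev, dump, len, GFP_KERNEL);
}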
Reliability, Accessibility, Serviceability (RAS)
================================================

drivers/accel/qaic/Makefile

@@ -11,6 +11,7 @@ qaic-y := \
qaic_data.o \
qaic_drv.o \
qaic_ras.o \
qaic_ssr.o \
qaic_sysfs.o \
qaic_timesync.o \
sahara.o

drivers/accel/qaic/qaic.h

@@ -21,6 +21,7 @@
#define QAIC_DBC_BASE SZ_128K
#define QAIC_DBC_SIZE SZ_4K
#define QAIC_SSR_DBC_SENTINEL U32_MAX /* No ongoing SSR sentinel */
#define QAIC_NO_PARTITION -1
@@ -197,6 +198,12 @@ struct qaic_device {
unsigned int ue_count;
/* Un-correctable non-fatal error count */
unsigned int ue_nf_count;
/* MHI SSR channel device */
struct mhi_device *ssr_ch;
/* Work queue for tasks related to MHI SSR device */
struct workqueue_struct *ssr_wq;
/* DBC which is under SSR. The sentinel value U32_MAX means no SSR is in progress */
u32 ssr_dbc;
};
struct qaic_drm_device {
@@ -340,6 +347,8 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file
int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv);
int qaic_detach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv);
void irq_polling_work(struct work_struct *work);
void qaic_dbc_enter_ssr(struct qaic_device *qdev, u32 dbc_id);
void qaic_dbc_exit_ssr(struct qaic_device *qdev);
/* qaic_sysfs.c */
int qaic_sysfs_init(struct qaic_drm_device *qddev);
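
The new ssr_dbc field gates the data-path ioctls: while a DBC's ID matches
qdev->ssr_dbc, submissions fail with -EPIPE. The check the ioctls open-code
(see qaic_data.c below) is equivalent to this hypothetical helper, which is
not part of the patch:

static inline bool qaic_dbc_in_ssr(struct qaic_device *qdev, u32 dbc_id)
{
	/* ssr_dbc holds QAIC_SSR_DBC_SENTINEL (U32_MAX) when no SSR is active */
	return dbc_id == qdev->ssr_dbc;
}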

drivers/accel/qaic/qaic_data.c

@@ -1047,6 +1047,11 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi
goto unlock_ch_srcu;
}
if (dbc->id == qdev->ssr_dbc) {
ret = -EPIPE;
goto unlock_ch_srcu;
}
ret = qaic_prepare_bo(qdev, bo, &args->hdr);
if (ret)
goto unlock_ch_srcu;
@@ -1370,6 +1375,11 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr
goto release_ch_rcu;
}
if (dbc->id == qdev->ssr_dbc) {
ret = -EPIPE;
goto release_ch_rcu;
}
ret = mutex_lock_interruptible(&dbc->req_lock);
if (ret)
goto release_ch_rcu;
@@ -1722,6 +1732,11 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file
goto unlock_ch_srcu;
}
if (dbc->id == qdev->ssr_dbc) {
ret = -EPIPE;
goto unlock_ch_srcu;
}
obj = drm_gem_object_lookup(file_priv, args->handle);
if (!obj) {
ret = -ENOENT;
@@ -1742,6 +1757,9 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file
if (!dbc->usr)
ret = -EPERM;
if (dbc->id == qdev->ssr_dbc)
ret = -EPIPE;
put_obj:
drm_gem_object_put(obj);
unlock_ch_srcu:
@@ -1945,6 +1963,17 @@ static void empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *db
spin_unlock_irqrestore(&dbc->xfer_lock, flags);
}
static void sync_empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *dbc)
{
empty_xfer_list(qdev, dbc);
synchronize_srcu(&dbc->ch_lock);
/*
 * Threads still holding the channel lock may have added more elements to
 * the xfer_list. Flush those out as well.
 */
empty_xfer_list(qdev, dbc);
}
int disable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr)
{
if (!qdev->dbc[dbc_id].usr || qdev->dbc[dbc_id].usr->handle != usr->handle)
@@ -1973,13 +2002,7 @@ void wakeup_dbc(struct qaic_device *qdev, u32 dbc_id)
struct dma_bridge_chan *dbc = &qdev->dbc[dbc_id];
dbc->usr = NULL;
-empty_xfer_list(qdev, dbc);
-synchronize_srcu(&dbc->ch_lock);
-/*
- * Threads holding channel lock, may add more elements in the xfer_list.
- * Flush out these elements from xfer_list.
- */
-empty_xfer_list(qdev, dbc);
+sync_empty_xfer_list(qdev, dbc);
}
void release_dbc(struct qaic_device *qdev, u32 dbc_id)
@@ -2020,3 +2043,30 @@ void qaic_data_get_fifo_info(struct dma_bridge_chan *dbc, u32 *head, u32 *tail)
*head = readl(dbc->dbc_base + REQHP_OFF);
*tail = readl(dbc->dbc_base + REQTP_OFF);
}
/*
 * qaic_dbc_enter_ssr() - Enter subsystem reset (SSR) for the given DBC ID.
 * @qdev: qaic device handle
 * @dbc_id: ID of the DBC which will enter SSR
 *
 * The device will automatically deactivate the workload since not all
 * errors can be silently recovered. The user will be notified and will
 * need to decide what recovery action to take.
 */
void qaic_dbc_enter_ssr(struct qaic_device *qdev, u32 dbc_id)
{
qdev->ssr_dbc = dbc_id;
release_dbc(qdev, dbc_id);
}
/*
 * qaic_dbc_exit_ssr() - Exit subsystem reset (SSR).
 * @qdev: qaic device handle
 *
 * The DBC returns to an operational state and begins accepting work after exiting SSR.
 */
void qaic_dbc_exit_ssr(struct qaic_device *qdev)
{
qdev->ssr_dbc = QAIC_SSR_DBC_SENTINEL;
}

drivers/accel/qaic/qaic_drv.c

@@ -30,6 +30,7 @@
#include "qaic.h"
#include "qaic_debugfs.h"
#include "qaic_ras.h"
#include "qaic_ssr.h"
#include "qaic_timesync.h"
#include "sahara.h"
@@ -390,6 +391,7 @@ void qaic_dev_reset_clean_local_state(struct qaic_device *qdev)
qaic_notify_reset(qdev);
/* start tearing things down */
qaic_clean_up_ssr(qdev);
for (i = 0; i < qdev->num_dbc; ++i)
release_dbc(qdev, i);
}
@@ -439,11 +441,18 @@ static struct qaic_device *create_qdev(struct pci_dev *pdev,
qdev->qts_wq = qaicm_wq_init(drm, "qaic_ts");
if (IS_ERR(qdev->qts_wq))
return NULL;
qdev->ssr_wq = qaicm_wq_init(drm, "qaic_ssr");
if (IS_ERR(qdev->ssr_wq))
return NULL;
ret = qaicm_srcu_init(drm, &qdev->dev_lock);
if (ret)
return NULL;
ret = qaic_ssr_init(qdev);
if (ret)
pci_info(pdev, "QAIC SSR crashdump collection not supported.\n");
qdev->qddev = qddev;
qdev->pdev = pdev;
qddev->qdev = qdev;
@@ -799,9 +808,16 @@ static int __init qaic_init(void)
ret = qaic_ras_register();
if (ret)
pr_debug("qaic: qaic_ras_register failed %d\n", ret);
ret = qaic_ssr_register();
if (ret) {
pr_debug("qaic: qaic_ssr_register failed %d\n", ret);
goto free_bootlog;
}
return 0;
free_bootlog:
qaic_bootlog_unregister();
free_mhi:
mhi_driver_unregister(&qaic_mhi_driver);
free_pci:
@@ -827,6 +843,7 @@ static void __exit qaic_exit(void)
* reinitializing the link_up state after the cleanup is done.
*/
link_up = true;
qaic_ssr_unregister();
qaic_ras_unregister();
qaic_bootlog_unregister();
qaic_timesync_deinit();

drivers/accel/qaic/qaic_ssr.c (new file)

@@ -0,0 +1,259 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. */
/* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. */
#include <asm/byteorder.h>
#include <drm/drm_file.h>
#include <drm/drm_managed.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/mhi.h>
#include <linux/workqueue.h>
#include "qaic.h"
#include "qaic_ssr.h"
#define SSR_RESP_MSG_SZ 32
#define DEBUG_TRANSFER_INFO BIT(0)
#define DEBUG_TRANSFER_INFO_RSP BIT(1)
#define MEMORY_READ BIT(2)
#define MEMORY_READ_RSP BIT(3)
#define DEBUG_TRANSFER_DONE BIT(4)
#define DEBUG_TRANSFER_DONE_RSP BIT(5)
#define SSR_EVENT BIT(8)
#define SSR_EVENT_RSP BIT(9)
#define SSR_EVENT_NACK BIT(0)
#define BEFORE_SHUTDOWN BIT(1)
#define AFTER_SHUTDOWN BIT(2)
#define BEFORE_POWER_UP BIT(3)
#define AFTER_POWER_UP BIT(4)
struct _ssr_hdr {
__le32 cmd;
__le32 len;
__le32 dbc_id;
};
struct ssr_hdr {
u32 cmd;
u32 len;
u32 dbc_id;
};
struct ssr_debug_transfer_info_rsp {
struct _ssr_hdr hdr;
__le32 ret;
} __packed;
struct ssr_event {
struct ssr_hdr hdr;
u32 event;
} __packed;
struct ssr_event_rsp {
struct _ssr_hdr hdr;
__le32 event;
} __packed;
struct ssr_resp {
/* Work struct to schedule handling of messages received on the QAIC_SSR channel */
struct work_struct work;
/* Root struct of device, used to access device resources */
struct qaic_device *qdev;
/* Buffer used by MHI for transfer requests */
u8 data[] __aligned(8);
};
void qaic_clean_up_ssr(struct qaic_device *qdev)
{
qaic_dbc_exit_ssr(qdev);
}
static void ssr_worker(struct work_struct *work)
{
struct ssr_resp *resp = container_of(work, struct ssr_resp, work);
struct ssr_hdr *hdr = (struct ssr_hdr *)resp->data;
struct ssr_debug_transfer_info_rsp *debug_rsp;
struct qaic_device *qdev = resp->qdev;
struct ssr_event_rsp *event_rsp;
struct dma_bridge_chan *dbc;
struct ssr_event *event;
u32 ssr_event_ack;
int ret;
le32_to_cpus(&hdr->cmd);
le32_to_cpus(&hdr->len);
le32_to_cpus(&hdr->dbc_id);
if (hdr->len > SSR_RESP_MSG_SZ)
goto out;
if (hdr->dbc_id >= qdev->num_dbc)
goto out;
dbc = &qdev->dbc[hdr->dbc_id];
switch (hdr->cmd) {
case DEBUG_TRANSFER_INFO:
/* Decline crash dump request from the device */
debug_rsp = kmalloc(sizeof(*debug_rsp), GFP_KERNEL);
if (!debug_rsp)
break;
debug_rsp->hdr.cmd = cpu_to_le32(DEBUG_TRANSFER_INFO_RSP);
debug_rsp->hdr.len = cpu_to_le32(sizeof(*debug_rsp));
debug_rsp->hdr.dbc_id = cpu_to_le32(hdr->dbc_id);
debug_rsp->ret = cpu_to_le32(1);
ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE,
debug_rsp, sizeof(*debug_rsp), MHI_EOT);
if (ret) {
pci_warn(qdev->pdev, "Could not send DEBUG_TRANSFER_INFO_RSP %d\n", ret);
kfree(debug_rsp);
}
break;
case SSR_EVENT:
event = (struct ssr_event *)hdr;
le32_to_cpus(&event->event);
ssr_event_ack = event->event;
switch (event->event) {
case BEFORE_SHUTDOWN:
set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_BEFORE_SHUTDOWN);
qaic_dbc_enter_ssr(qdev, hdr->dbc_id);
break;
case AFTER_SHUTDOWN:
set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_AFTER_SHUTDOWN);
break;
case BEFORE_POWER_UP:
set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_BEFORE_POWER_UP);
break;
case AFTER_POWER_UP:
set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_AFTER_POWER_UP);
break;
default:
break;
}
event_rsp = kmalloc(sizeof(*event_rsp), GFP_KERNEL);
if (!event_rsp)
break;
event_rsp->hdr.cmd = cpu_to_le32(SSR_EVENT_RSP);
event_rsp->hdr.len = cpu_to_le32(sizeof(*event_rsp));
event_rsp->hdr.dbc_id = cpu_to_le32(hdr->dbc_id);
event_rsp->event = cpu_to_le32(ssr_event_ack);
ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, event_rsp, sizeof(*event_rsp),
MHI_EOT);
if (ret)
kfree(event_rsp);
if (event->event == AFTER_POWER_UP) {
qaic_dbc_exit_ssr(qdev);
set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_IDLE);
}
break;
default:
break;
}
out:
ret = mhi_queue_buf(qdev->ssr_ch, DMA_FROM_DEVICE, resp->data, SSR_RESP_MSG_SZ, MHI_EOT);
if (ret)
kfree(resp);
}
static int qaic_ssr_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id)
{
struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev));
struct ssr_resp *resp;
int ret;
ret = mhi_prepare_for_transfer(mhi_dev);
if (ret)
return ret;
resp = kzalloc(sizeof(*resp) + SSR_RESP_MSG_SZ, GFP_KERNEL);
if (!resp) {
mhi_unprepare_from_transfer(mhi_dev);
return -ENOMEM;
}
resp->qdev = qdev;
INIT_WORK(&resp->work, ssr_worker);
ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, resp->data, SSR_RESP_MSG_SZ, MHI_EOT);
if (ret) {
kfree(resp);
mhi_unprepare_from_transfer(mhi_dev);
return ret;
}
dev_set_drvdata(&mhi_dev->dev, qdev);
qdev->ssr_ch = mhi_dev;
return 0;
}
static void qaic_ssr_mhi_remove(struct mhi_device *mhi_dev)
{
struct qaic_device *qdev;
qdev = dev_get_drvdata(&mhi_dev->dev);
mhi_unprepare_from_transfer(qdev->ssr_ch);
qdev->ssr_ch = NULL;
}
static void qaic_ssr_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result)
{
kfree(mhi_result->buf_addr);
}
static void qaic_ssr_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result)
{
struct ssr_resp *resp = container_of(mhi_result->buf_addr, struct ssr_resp, data);
struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev);
if (mhi_result->transaction_status) {
kfree(resp);
return;
}
queue_work(qdev->ssr_wq, &resp->work);
}
static const struct mhi_device_id qaic_ssr_mhi_match_table[] = {
{ .chan = "QAIC_SSR", },
{},
};
static struct mhi_driver qaic_ssr_mhi_driver = {
.id_table = qaic_ssr_mhi_match_table,
.remove = qaic_ssr_mhi_remove,
.probe = qaic_ssr_mhi_probe,
.ul_xfer_cb = qaic_ssr_mhi_ul_xfer_cb,
.dl_xfer_cb = qaic_ssr_mhi_dl_xfer_cb,
.driver = {
.name = "qaic_ssr",
},
};
int qaic_ssr_init(struct qaic_device *qdev)
{
qdev->ssr_dbc = QAIC_SSR_DBC_SENTINEL;
return 0;
}
int qaic_ssr_register(void)
{
return mhi_driver_register(&qaic_ssr_mhi_driver);
}
void qaic_ssr_unregister(void)
{
mhi_driver_unregister(&qaic_ssr_mhi_driver);
}

drivers/accel/qaic/qaic_ssr.h (new file)

@@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0-only
*
* Copyright (c) 2020, The Linux Foundation. All rights reserved.
* Copyright (c) 2021, 2024 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#ifndef __QAIC_SSR_H__
#define __QAIC_SSR_H__
struct qaic_device;
int qaic_ssr_register(void);
void qaic_ssr_unregister(void);
void qaic_clean_up_ssr(struct qaic_device *qdev);
int qaic_ssr_init(struct qaic_device *qdev);
#endif /* __QAIC_SSR_H__ */