mirror of https://github.com/torvalds/linux.git
accel/amdxdna: Add error handling
When there is a hardware error, the NPU firmware notifies the host through a mailbox message. The message includes details of the error, such as the tile and column indexes where the error occurred. The driver starts a thread to handle the NPU error message. The thread stops the clients that are using the column where the error occurred. Then the driver resets that column. Co-developed-by: Min Ma <min.ma@amd.com> Signed-off-by: Min Ma <min.ma@amd.com> Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com> Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241118172942.2014541-10-lizhi.hou@amd.com
This commit is contained in:
parent
bed4c73e59
commit
4fd4ca984b
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
amdxdna-y := \
|
||||
aie2_ctx.o \
|
||||
aie2_error.o \
|
||||
aie2_message.o \
|
||||
aie2_pci.o \
|
||||
aie2_psp.o \
|
||||
|
|
|
|||
|
|
@ -0,0 +1,360 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
|
||||
*/
|
||||
|
||||
#include <drm/drm_cache.h>
|
||||
#include <drm/drm_device.h>
|
||||
#include <drm/drm_print.h>
|
||||
#include <drm/gpu_scheduler.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include "aie2_msg_priv.h"
|
||||
#include "aie2_pci.h"
|
||||
#include "amdxdna_mailbox.h"
|
||||
#include "amdxdna_pci_drv.h"
|
||||
|
||||
struct async_event {
|
||||
struct amdxdna_dev_hdl *ndev;
|
||||
struct async_event_msg_resp resp;
|
||||
struct workqueue_struct *wq;
|
||||
struct work_struct work;
|
||||
u8 *buf;
|
||||
dma_addr_t addr;
|
||||
u32 size;
|
||||
};
|
||||
|
||||
struct async_events {
|
||||
struct workqueue_struct *wq;
|
||||
u8 *buf;
|
||||
dma_addr_t addr;
|
||||
u32 size;
|
||||
u32 event_cnt;
|
||||
struct async_event event[] __counted_by(event_cnt);
|
||||
};
|
||||
|
||||
/*
 * The enums, structs and lookup tables below are ported from the XAIE util
 * header file.
 *
 * The data is defined by the AIE device and is used to decode error messages
 * from the device.
 */
|
||||
|
||||
/* Module type reported in an error record; selects the event lookup table. */
enum aie_module_type {
	AIE_MEM_MOD	= 0,
	AIE_CORE_MOD	= 1,
	AIE_PL_MOD	= 2,
};
|
||||
|
||||
/* Error category that an event ID is translated into. */
enum aie_error_category {
	AIE_ERROR_SATURATION	= 0,
	AIE_ERROR_FP		= 1,
	AIE_ERROR_STREAM	= 2,
	AIE_ERROR_ACCESS	= 3,
	AIE_ERROR_BUS		= 4,
	AIE_ERROR_INSTRUCTION	= 5,
	AIE_ERROR_ECC		= 6,
	AIE_ERROR_LOCK		= 7,
	AIE_ERROR_DMA		= 8,
	AIE_ERROR_MEM_PARITY	= 9,
	/* Not part of XAIE; added so unmatched events have a category */
	AIE_ERROR_UNKNOWN	= 10,
};
|
||||
|
||||
/* Don't pack, unless XAIE side changed */
|
||||
struct aie_error {
|
||||
__u8 row;
|
||||
__u8 col;
|
||||
__u32 mod_type;
|
||||
__u8 event_id;
|
||||
};
|
||||
|
||||
struct aie_err_info {
|
||||
u32 err_cnt;
|
||||
u32 ret_code;
|
||||
u32 rsvd;
|
||||
struct aie_error payload[] __counted_by(err_cnt);
|
||||
};
|
||||
|
||||
struct aie_event_category {
|
||||
u8 event_id;
|
||||
enum aie_error_category category;
|
||||
};
|
||||
|
||||
/* Initializer for one struct aie_event_category table entry. */
#define EVENT_CATEGORY(id, cat) { .event_id = (id), .category = (cat) }
|
||||
static const struct aie_event_category aie_ml_mem_event_cat[] = {
|
||||
EVENT_CATEGORY(88U, AIE_ERROR_ECC),
|
||||
EVENT_CATEGORY(90U, AIE_ERROR_ECC),
|
||||
EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
|
||||
EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
|
||||
EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
|
||||
EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
|
||||
EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
|
||||
EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
|
||||
EVENT_CATEGORY(97U, AIE_ERROR_DMA),
|
||||
EVENT_CATEGORY(98U, AIE_ERROR_DMA),
|
||||
EVENT_CATEGORY(99U, AIE_ERROR_DMA),
|
||||
EVENT_CATEGORY(100U, AIE_ERROR_DMA),
|
||||
EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
|
||||
};
|
||||
|
||||
static const struct aie_event_category aie_ml_core_event_cat[] = {
|
||||
EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
|
||||
EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
|
||||
EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
|
||||
EVENT_CATEGORY(58U, AIE_ERROR_BUS),
|
||||
EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
|
||||
EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
|
||||
EVENT_CATEGORY(62U, AIE_ERROR_ECC),
|
||||
EVENT_CATEGORY(64U, AIE_ERROR_ECC),
|
||||
EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
|
||||
EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
|
||||
EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
|
||||
EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
|
||||
EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
|
||||
EVENT_CATEGORY(72U, AIE_ERROR_BUS),
|
||||
};
|
||||
|
||||
static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
|
||||
EVENT_CATEGORY(130U, AIE_ERROR_ECC),
|
||||
EVENT_CATEGORY(132U, AIE_ERROR_ECC),
|
||||
EVENT_CATEGORY(133U, AIE_ERROR_DMA),
|
||||
EVENT_CATEGORY(134U, AIE_ERROR_DMA),
|
||||
EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
|
||||
EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
|
||||
EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
|
||||
EVENT_CATEGORY(138U, AIE_ERROR_BUS),
|
||||
EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
|
||||
};
|
||||
|
||||
static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
|
||||
EVENT_CATEGORY(64U, AIE_ERROR_BUS),
|
||||
EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
|
||||
EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
|
||||
EVENT_CATEGORY(67U, AIE_ERROR_BUS),
|
||||
EVENT_CATEGORY(68U, AIE_ERROR_BUS),
|
||||
EVENT_CATEGORY(69U, AIE_ERROR_BUS),
|
||||
EVENT_CATEGORY(70U, AIE_ERROR_BUS),
|
||||
EVENT_CATEGORY(71U, AIE_ERROR_BUS),
|
||||
EVENT_CATEGORY(72U, AIE_ERROR_DMA),
|
||||
EVENT_CATEGORY(73U, AIE_ERROR_DMA),
|
||||
EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
|
||||
};
|
||||
|
||||
static enum aie_error_category
|
||||
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
|
||||
{
|
||||
const struct aie_event_category *lut;
|
||||
int num_entry;
|
||||
int i;
|
||||
|
||||
switch (mod_type) {
|
||||
case AIE_PL_MOD:
|
||||
lut = aie_ml_shim_tile_event_cat;
|
||||
num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
|
||||
break;
|
||||
case AIE_CORE_MOD:
|
||||
lut = aie_ml_core_event_cat;
|
||||
num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
|
||||
break;
|
||||
case AIE_MEM_MOD:
|
||||
if (row == 1) {
|
||||
lut = aie_ml_mem_tile_event_cat;
|
||||
num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
|
||||
} else {
|
||||
lut = aie_ml_mem_event_cat;
|
||||
num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return AIE_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
for (i = 0; i < num_entry; i++) {
|
||||
if (event_id != lut[i].event_id)
|
||||
continue;
|
||||
|
||||
return lut[i].category;
|
||||
}
|
||||
|
||||
return AIE_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
/*
 * Log an array of error records and collect the columns they refer to.
 *
 * @ndev:     device handle, used for logging
 * @err_info: start of an array of struct aie_error records
 * @num_err:  number of records in @err_info
 *
 * Every visited record is logged together with its decoded category.
 * Processing stops at the first record whose column does not fit into the
 * 32-bit bitmap.
 *
 * Return: bitmap with bit N set for every column N that was collected; 0 if
 * no column was collected.
 */
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	u32 err_col = 0; /* assume that AIE has less than 32 columns */
	u32 i;

	/* Get err column bitmap */
	for (i = 0; i < num_err; i++) {
		struct aie_error *err = &errs[i];
		enum aie_error_category cat;

		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
		XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
			 err->row, err->col, err->mod_type,
			 err->event_id, cat);

		if (err->col >= 32) {
			XDNA_WARN(ndev->xdna, "Invalid column number");
			break;
		}

		/*
		 * Shift an unsigned one: column 31 passes the check above and
		 * a signed "1 << 31" is undefined behavior.
		 */
		err_col |= 1U << err->col;
	}

	return err_col;
}
|
||||
|
||||
/*
 * Notify callback registered with each async event message.
 *
 * @handle: the struct async_event the message was registered for
 * @data:   response payload, read as struct async_event_msg_resp; may be NULL
 * @size:   unused
 *
 * When a payload is present its type and status are copied into the event,
 * status last, so the worker can detect a fresh response without a lock. The
 * event's work item is queued in either case.
 *
 * Return: always 0.
 */
static int aie2_error_async_cb(void *handle, const u32 *data, size_t size)
{
	struct async_event *event = handle;

	if (data) {
		const struct async_event_msg_resp *fw_resp =
			(const struct async_event_msg_resp *)data;

		event->resp.type = fw_resp->type;
		wmb(); /* Update status in the end, so that no lock for here */
		event->resp.status = fw_resp->status;
	}

	queue_work(event->wq, &event->work);

	return 0;
}
|
||||
|
||||
static int aie2_error_event_send(struct async_event *e)
|
||||
{
|
||||
drm_clflush_virt_range(e->buf, e->size); /* device can access */
|
||||
return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
|
||||
aie2_error_async_cb);
|
||||
}
|
||||
|
||||
static void aie2_error_worker(struct work_struct *err_work)
|
||||
{
|
||||
struct aie_err_info *info;
|
||||
struct amdxdna_dev *xdna;
|
||||
struct async_event *e;
|
||||
u32 max_err;
|
||||
u32 err_col;
|
||||
|
||||
e = container_of(err_work, struct async_event, work);
|
||||
|
||||
xdna = e->ndev->xdna;
|
||||
|
||||
if (e->resp.status == MAX_AIE2_STATUS_CODE)
|
||||
return;
|
||||
|
||||
e->resp.status = MAX_AIE2_STATUS_CODE;
|
||||
|
||||
print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
|
||||
e->buf, 0x100, false);
|
||||
|
||||
info = (struct aie_err_info *)e->buf;
|
||||
XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);
|
||||
|
||||
max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
|
||||
if (unlikely(info->err_cnt > max_err)) {
|
||||
WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
|
||||
return;
|
||||
}
|
||||
err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
|
||||
if (!err_col) {
|
||||
XDNA_WARN(xdna, "Did not get error column");
|
||||
return;
|
||||
}
|
||||
|
||||
mutex_lock(&xdna->dev_lock);
|
||||
/* Re-sent this event to firmware */
|
||||
if (aie2_error_event_send(e))
|
||||
XDNA_WARN(xdna, "Unable to register async event");
|
||||
mutex_unlock(&xdna->dev_lock);
|
||||
}
|
||||
|
||||
int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
|
||||
{
|
||||
struct amdxdna_dev *xdna = ndev->xdna;
|
||||
struct async_event *e;
|
||||
int i, ret;
|
||||
|
||||
drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
|
||||
for (i = 0; i < ndev->async_events->event_cnt; i++) {
|
||||
e = &ndev->async_events->event[i];
|
||||
ret = aie2_error_event_send(e);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
|
||||
{
|
||||
struct amdxdna_dev *xdna = ndev->xdna;
|
||||
struct async_events *events;
|
||||
|
||||
events = ndev->async_events;
|
||||
|
||||
mutex_unlock(&xdna->dev_lock);
|
||||
destroy_workqueue(events->wq);
|
||||
mutex_lock(&xdna->dev_lock);
|
||||
|
||||
dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
|
||||
events->addr, DMA_FROM_DEVICE);
|
||||
kfree(events);
|
||||
}
|
||||
|
||||
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
|
||||
{
|
||||
struct amdxdna_dev *xdna = ndev->xdna;
|
||||
u32 total_col = ndev->total_col;
|
||||
u32 total_size = ASYNC_BUF_SIZE * total_col;
|
||||
struct async_events *events;
|
||||
int i, ret;
|
||||
|
||||
events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
|
||||
if (!events)
|
||||
return -ENOMEM;
|
||||
|
||||
events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
|
||||
DMA_FROM_DEVICE, GFP_KERNEL);
|
||||
if (!events->buf) {
|
||||
ret = -ENOMEM;
|
||||
goto free_events;
|
||||
}
|
||||
events->size = total_size;
|
||||
events->event_cnt = total_col;
|
||||
|
||||
events->wq = alloc_ordered_workqueue("async_wq", 0);
|
||||
if (!events->wq) {
|
||||
ret = -ENOMEM;
|
||||
goto free_buf;
|
||||
}
|
||||
|
||||
for (i = 0; i < events->event_cnt; i++) {
|
||||
struct async_event *e = &events->event[i];
|
||||
u32 offset = i * ASYNC_BUF_SIZE;
|
||||
|
||||
e->ndev = ndev;
|
||||
e->wq = events->wq;
|
||||
e->buf = &events->buf[offset];
|
||||
e->addr = events->addr + offset;
|
||||
e->size = ASYNC_BUF_SIZE;
|
||||
e->resp.status = MAX_AIE2_STATUS_CODE;
|
||||
INIT_WORK(&e->work, aie2_error_worker);
|
||||
}
|
||||
|
||||
ndev->async_events = events;
|
||||
|
||||
XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
|
||||
events->event_cnt, events->size);
|
||||
return 0;
|
||||
|
||||
free_buf:
|
||||
dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
|
||||
events->addr, DMA_FROM_DEVICE);
|
||||
free_events:
|
||||
kfree(events);
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -308,6 +308,25 @@ int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u6
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Send a MSG_OP_REGISTER_ASYNC_EVENT_MSG request on the management channel.
 *
 * @ndev:   device handle
 * @addr:   DMA address of the buffer passed to the firmware
 * @size:   size of that buffer in bytes
 * @handle: opaque pointer stored in the message for @cb
 * @cb:     notify callback stored in the message
 *
 * Return: result of xdna_mailbox_send_msg().
 */
int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
				 void *handle, int (*cb)(void*, const u32 *, size_t))
{
	struct async_event_msg_req req = { 0 };
	struct xdna_mailbox_msg msg = {
		.send_data = (u8 *)&req,
		.send_size = sizeof(req),
		.handle = handle,
		.opcode = MSG_OP_REGISTER_ASYNC_EVENT_MSG,
		.notify_cb = cb,
	};

	req.buf_addr = addr;
	req.buf_size = size;

	/*
	 * dma_addr_t varies in width with the build configuration, so print it
	 * with %pad (passed by reference) rather than %llx.
	 */
	XDNA_DBG(ndev->xdna, "Register addr %pad size 0x%x", &addr, size);
	return xdna_mailbox_send_msg(ndev->mgmt_chann, &msg, TX_TIMEOUT);
}
|
||||
|
||||
int aie2_config_cu(struct amdxdna_hwctx *hwctx)
|
||||
{
|
||||
struct mailbox_channel *chann = hwctx->priv->mbox_chann;
|
||||
|
|
|
|||
|
|
@ -180,6 +180,15 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev)
|
|||
return ret;
|
||||
}
|
||||
|
||||
if (!ndev->async_events)
|
||||
return 0;
|
||||
|
||||
ret = aie2_error_async_events_send(ndev);
|
||||
if (ret) {
|
||||
XDNA_ERR(ndev->xdna, "Send async events failed");
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -469,9 +478,30 @@ static int aie2_init(struct amdxdna_dev *xdna)
|
|||
goto stop_hw;
|
||||
}
|
||||
|
||||
ret = aie2_error_async_events_alloc(ndev);
|
||||
if (ret) {
|
||||
XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret);
|
||||
goto stop_hw;
|
||||
}
|
||||
|
||||
ret = aie2_error_async_events_send(ndev);
|
||||
if (ret) {
|
||||
XDNA_ERR(xdna, "Send async events failed, ret %d", ret);
|
||||
goto async_event_free;
|
||||
}
|
||||
|
||||
/* Issue a command to make sure firmware handled async events */
|
||||
ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver);
|
||||
if (ret) {
|
||||
XDNA_ERR(xdna, "Re-query firmware version failed");
|
||||
goto async_event_free;
|
||||
}
|
||||
|
||||
release_firmware(fw);
|
||||
return 0;
|
||||
|
||||
async_event_free:
|
||||
aie2_error_async_events_free(ndev);
|
||||
stop_hw:
|
||||
aie2_hw_stop(xdna);
|
||||
disable_sva:
|
||||
|
|
@ -487,8 +517,10 @@ static int aie2_init(struct amdxdna_dev *xdna)
|
|||
static void aie2_fini(struct amdxdna_dev *xdna)
|
||||
{
|
||||
struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev);
|
||||
struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
|
||||
|
||||
aie2_hw_stop(xdna);
|
||||
aie2_error_async_events_free(ndev);
|
||||
iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
|
||||
pci_free_irq_vectors(pdev);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -170,6 +170,7 @@ struct amdxdna_dev_hdl {
|
|||
/* Mailbox and the management channel */
|
||||
struct mailbox *mbox;
|
||||
struct mailbox_channel *mgmt_chann;
|
||||
struct async_events *async_events;
|
||||
};
|
||||
|
||||
#define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \
|
||||
|
|
@ -210,6 +211,12 @@ struct psp_device *aie2m_psp_create(struct drm_device *ddev, struct psp_config *
|
|||
int aie2_psp_start(struct psp_device *psp);
|
||||
void aie2_psp_stop(struct psp_device *psp);
|
||||
|
||||
/* aie2_error.c */
|
||||
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev);
|
||||
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev);
|
||||
int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev);
|
||||
int aie2_error_async_msg_thread(void *data);
|
||||
|
||||
/* aie2_message.c */
|
||||
int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
|
||||
int aie2_resume_fw(struct amdxdna_dev_hdl *ndev);
|
||||
|
|
@ -224,6 +231,8 @@ int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
|
|||
int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
|
||||
int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
|
||||
int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
|
||||
int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
|
||||
void *handle, int (*cb)(void*, const u32 *, size_t));
|
||||
int aie2_config_cu(struct amdxdna_hwctx *hwctx);
|
||||
int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
|
||||
int (*notify_cb)(void *, const u32 *, size_t));
|
||||
|
|
|
|||
Loading…
Reference in New Issue