From a52b1a71120bac2cac691f11b8fd2f64254570f0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Mon, 22 Sep 2025 22:48:57 +0000 Subject: [PATCH 01/65] vfio: selftests: Store libvfio build outputs in $(OUTPUT)/libvfio Store the tools/testing/selftests/vfio/lib outputs (e.g. object files) in $(OUTPUT)/libvfio rather than in $(OUTPUT)/lib. This is in preparation for building the VFIO selftests library into the KVM selftests (see Link below). Specifically this will avoid name conflicts between tools/testing/selftests/{vfio,kvm/lib and also avoid leaving behind empty directories under tools/testing/selftests/kvm after a make clean. Link: https://lore.kernel.org/kvm/20250912222525.2515416-2-dmatlack@google.com/ Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250922224857.2528737-1-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/libvfio.mk | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 5d11c3a89a28..3c0cdac30cb6 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -1,24 +1,26 @@ include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) -VFIO_DIR := $(selfdir)/vfio +LIBVFIO_SRCDIR := $(selfdir)/vfio/lib -LIBVFIO_C := lib/vfio_pci_device.c -LIBVFIO_C += lib/vfio_pci_driver.c +LIBVFIO_C := vfio_pci_device.c +LIBVFIO_C += vfio_pci_driver.c ifeq ($(ARCH:x86_64=x86),x86) -LIBVFIO_C += lib/drivers/ioat/ioat.c -LIBVFIO_C += lib/drivers/dsa/dsa.c +LIBVFIO_C += drivers/ioat/ioat.c +LIBVFIO_C += drivers/dsa/dsa.c endif -LIBVFIO_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBVFIO_C)) +LIBVFIO_OUTPUT := $(OUTPUT)/libvfio + +LIBVFIO_O := $(patsubst %.c, $(LIBVFIO_OUTPUT)/%.o, $(LIBVFIO_C)) LIBVFIO_O_DIRS := $(shell dirname $(LIBVFIO_O) | uniq) $(shell mkdir -p $(LIBVFIO_O_DIRS)) -CFLAGS += -I$(VFIO_DIR)/lib/include +CFLAGS += -I$(LIBVFIO_SRCDIR)/include -$(LIBVFIO_O): $(OUTPUT)/%.o : $(VFIO_DIR)/%.c +$(LIBVFIO_O): $(LIBVFIO_OUTPUT)/%.o : $(LIBVFIO_SRCDIR)/%.c $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -EXTRA_CLEAN += $(LIBVFIO_O) +EXTRA_CLEAN += $(LIBVFIO_OUTPUT) From abe7d636403b7cbdb8d2d5bab253cfe8087faa7a Mon Sep 17 00:00:00 2001 From: Chu Guangqing Date: Wed, 15 Oct 2025 09:59:54 +0800 Subject: [PATCH 02/65] vfio/mtty: Fix spelling typo in samples/vfio-mdev The comment incorrectly used "atleast" instead of "at least". Signed-off-by: Chu Guangqing Link: https://lore.kernel.org/r/20251015015954.2363-2-chuguangqing@inspur.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 59eefe2fed10..6cb3e5974990 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -624,7 +624,7 @@ static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, u8 lsr = 0; mutex_lock(&mdev_state->rxtx_lock); - /* atleast one char in FIFO */ + /* at least one char in FIFO */ if (mdev_state->s[index].rxtx.head != mdev_state->s[index].rxtx.tail) lsr |= UART_LSR_DR; From 4868d2d52df6f724b01531843805a3b1322e2dd9 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:43 +0800 Subject: [PATCH 03/65] crypto: hisilicon - qm updates BAR configuration On new platforms greater than QM_HW_V3, the configuration region for the live migration function of the accelerator device is no longer placed in the VF, but is instead placed in the PF. Therefore, the configuration region of the live migration function needs to be opened when the QM driver is loaded. When the QM driver is uninstalled, the driver needs to clear this configuration. Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20251030015744.131771-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/crypto/hisilicon/qm.c | 27 +++++++++++++++++++++++++++ include/linux/hisi_acc_qm.h | 3 +++ 2 files changed, 30 insertions(+) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index a5b96adf2d1e..e88085c2cbb3 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3019,11 +3019,36 @@ static void qm_put_pci_res(struct hisi_qm *qm) pci_release_mem_regions(pdev); } +static void hisi_mig_region_clear(struct hisi_qm *qm) +{ + u32 val; + + /* Clear migration region set of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val &= ~QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + +static void hisi_mig_region_enable(struct hisi_qm *qm) +{ + u32 val; + + /* Select migration region of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val |= QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + static void hisi_qm_pci_uninit(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; pci_free_irq_vectors(pdev); + hisi_mig_region_clear(qm); qm_put_pci_res(qm); pci_disable_device(pdev); } @@ -5725,6 +5750,7 @@ int hisi_qm_init(struct hisi_qm *qm) goto err_free_qm_memory; qm_cmd_init(qm); + hisi_mig_region_enable(qm); return 0; @@ -5863,6 +5889,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm) } qm_cmd_init(qm); + hisi_mig_region_enable(qm); hisi_qm_dev_err_init(qm); /* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */ writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG); diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c4690e365ade..ca1ec437a3ca 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -99,6 +99,9 @@ #define QM_DEV_ALG_MAX_LEN 256 +#define QM_MIG_REGION_SEL 0x100198 +#define QM_MIG_REGION_EN BIT(0) + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ From 2131c1517f3004da208b7f1a3b06b8119172e194 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:44 +0800 Subject: [PATCH 04/65] hisi_acc_vfio_pci: adapt to new migration configuration On new platforms greater than QM_HW_V3, the migration region has been relocated from the VF to the PF. The VF's own configuration space is restored to the complete 64KB, and there is no need to divide the size of the BAR configuration space equally. The driver should be modified accordingly to adapt to the new hardware device. On the older hardware platform QM_HW_V3, the live migration configuration region is placed in the latter 32K portion of the VF's BAR2 configuration space. On the new hardware platform QM_HW_V4, the live migration configuration region also exists in the same 32K area immediately following the VF's BAR2, just like on QM_HW_V3. However, access to this region is now controlled by hardware. Additionally, a copy of the live migration configuration region is present in the PF's BAR2 configuration space. On the new hardware platform QM_HW_V4, when an older version of the driver is loaded, it behaves like QM_HW_V3 and uses the configuration region in the VF, ensuring that the live migration function continues to work normally. When the new version of the driver is loaded, it directly uses the configuration region in the PF. Meanwhile, hardware configuration disables the live migration configuration region in the VF's BAR2: reads return all 0xF values, and writes are silently ignored. Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Link: https://lore.kernel.org/r/20251030015744.131771-3-liulongfang@huawei.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 130 +++++++++++++----- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 23 +++- 2 files changed, 114 insertions(+), 39 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fde33f54e99e..498cb7d1c9e5 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) return 0; } +static void qm_xqc_reg_offsets(struct hisi_qm *qm, + u32 *eqc_addr, u32 *aeqc_addr) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + container_of(qm, struct hisi_acc_vf_core_device, vf_qm); + + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) { + *eqc_addr = QM_EQC_VF_DW0; + *aeqc_addr = QM_AEQC_VF_DW0; + } else { + *eqc_addr = QM_EQC_PF_DW0; + *aeqc_addr = QM_AEQC_PF_DW0; + } +} + static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_AEQC_DW\n"); return ret; @@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; /* Check VF state */ @@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_AEQC_DW\n"); return ret; @@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device; struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct pci_dev *vf_dev = vdev->pdev; + u32 val; - /* - * ACC VF dev BAR2 region consists of both functional register space - * and migration control register space. For migration to work, we - * need access to both. Hence, we map the entire BAR2 region here. - * But unnecessarily exposing the migration BAR region to the Guest - * has the potential to prevent/corrupt the Guest migration. Hence, - * we restrict access to the migration control space from - * Guest(Please see mmap/ioctl/read/write override functions). - * - * Please note that it is OK to expose the entire VF BAR if migration - * is not supported or required as this cannot affect the ACC PF - * configurations. - * - * Also the HiSilicon ACC VF devices supported by this driver on - * HiSilicon hardware platforms are integrated end point devices - * and the platform lacks the capability to perform any PCIe P2P - * between these devices. - */ + val = readl(pf_qm->io_base + QM_MIG_REGION_SEL); + if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN)) + hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL; + else + hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL; - vf_qm->io_base = - ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), - pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); - if (!vf_qm->io_base) - return -EIO; + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) { + /* + * On hardware platforms greater than QM_HW_V3, the migration function + * register is placed in the BAR2 configuration region of the PF, + * and each VF device occupies 8KB of configuration space. + */ + vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET + + hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE; + } else { + /* + * ACC VF dev BAR2 region consists of both functional register space + * and migration control register space. For migration to work, we + * need access to both. Hence, we map the entire BAR2 region here. + * But unnecessarily exposing the migration BAR region to the Guest + * has the potential to prevent/corrupt the Guest migration. Hence, + * we restrict access to the migration control space from + * Guest(Please see mmap/ioctl/read/write override functions). + * + * Please note that it is OK to expose the entire VF BAR if migration + * is not supported or required as this cannot affect the ACC PF + * configurations. + * + * Also the HiSilicon ACC VF devices supported by this driver on + * HiSilicon hardware platforms are integrated end point devices + * and the platform lacks the capability to perform any PCIe P2P + * between these devices. + */ + vf_qm->io_base = + ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), + pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); + if (!vf_qm->io_base) + return -EIO; + } vf_qm->fun_type = QM_HW_VF; + vf_qm->ver = pf_qm->ver; vf_qm->pdev = vf_dev; mutex_init(&vf_qm->mailbox_lock); @@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev) return !IS_ERR(pf_qm) ? pf_qm : NULL; } +static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev, + unsigned int index) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + hisi_acc_drvdata(vdev->pdev); + + /* + * On the old HW_ACC_MIG_VF_CTRL mode device, the ACC VF device + * BAR2 region encompasses both functional register space + * and migration control register space. + * only the functional region should be report to Guest. + */ + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + return (pci_resource_len(vdev->pdev, index) >> 1); + /* + * On the new HW device, the migration control register + * has been moved to the PF device BAR2 region. + * The VF device BAR2 is entirely functional register space. + */ + return pci_resource_len(vdev->pdev, index); +} + static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, size_t count, loff_t *ppos, size_t *new_count) @@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, if (index == VFIO_PCI_BAR2_REGION_INDEX) { loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); /* Check if access is for migration control region */ if (pos >= end) return -EINVAL; @@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev, index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); if (index == VFIO_PCI_BAR2_REGION_INDEX) { u64 req_len, pgoff, req_start; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); req_len = vma->vm_end - vma->vm_start; pgoff = vma->vm_pgoff & ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); @@ -1330,7 +1391,6 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int if (cmd == VFIO_DEVICE_GET_REGION_INFO) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; unsigned long minsz; @@ -1345,12 +1405,7 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - /* - * ACC VF dev BAR2 region consists of both functional - * register space and migration control register space. - * Report only the functional region to Guest. - */ - info.size = pci_resource_len(pdev, info.index) / 2; + info.size = hisi_acc_get_resource_len(vdev, info.index); info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | @@ -1521,7 +1576,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) hisi_acc_vf_disable_fds(hisi_acc_vdev); mutex_lock(&hisi_acc_vdev->open_mutex); hisi_acc_vdev->dev_opened = false; - iounmap(vf_qm->io_base); + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + iounmap(vf_qm->io_base); mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 91002ceeebc1..cd55eba64dfb 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -50,8 +50,10 @@ #define QM_QUE_ISO_CFG_V 0x0030 #define QM_PAGE_SIZE 0x0034 -#define QM_EQC_DW0 0X8000 -#define QM_AEQC_DW0 0X8020 +#define QM_EQC_VF_DW0 0X8000 +#define QM_AEQC_VF_DW0 0X8020 +#define QM_EQC_PF_DW0 0x1c00 +#define QM_AEQC_PF_DW0 0x1c20 #define ACC_DRV_MAJOR_VER 1 #define ACC_DRV_MINOR_VER 0 @@ -59,6 +61,22 @@ #define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC #define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE +#define QM_MIG_REGION_OFFSET 0x180000 +#define QM_MIG_REGION_SIZE 0x2000 + +/** + * On HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the latter 32KB of the VF's BAR2. + * The Guest is only provided with the first 32KB of the VF's BAR2. + * On HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the PF's BAR2, and the entire 64KB + * of the VF's BAR2 is allocated to the Guest. + */ +enum hw_drv_mode { + HW_ACC_MIG_VF_CTRL = 0, + HW_ACC_MIG_PF_CTRL, +}; + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ @@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + enum hw_drv_mode drv_mode; /* * vf_qm_state represents the QM_VF_STATE register value. * It is set by Guest driver for the ACC VF dev indicating From 2f03f21fe7516902283b135de272d3c7b10672de Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Fri, 31 Oct 2025 17:06:02 +0000 Subject: [PATCH 05/65] vfio: Fix ksize arg while copying user struct in vfio_df_ioctl_bind_iommufd() For the cases where user includes a non-zero value in 'token_uuid_ptr' field of 'struct vfio_device_bind_iommufd', the copy_struct_from_user() in vfio_df_ioctl_bind_iommufd() fails with -E2BIG. For the 'minsz' passed, copy_struct_from_user() expects the newly introduced field to be zero-ed, which would be incorrect in this case. Fix this by passing the actual size of the kernel struct. If working with a newer userspace, copy_struct_from_user() would copy the 'token_uuid_ptr' field, and if working with an old userspace, it would zero out this field, thus still retaining backward compatibility. Fixes: 86624ba3b522 ("vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD") Cc: stable@vger.kernel.org Signed-off-by: Raghavendra Rao Ananta Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20251031170603.2260022-2-rananta@google.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 480cac3a0c27..8ceca24ac136 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -99,7 +99,7 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, return ret; if (user_size < minsz) return -EINVAL; - ret = copy_struct_from_user(&bind, minsz, arg, user_size); + ret = copy_struct_from_user(&bind, sizeof(bind), arg, user_size); if (ret) return ret; From 0ed3a30fd996cb0cac872432cf25185fda7e5316 Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Fri, 31 Oct 2025 17:06:03 +0000 Subject: [PATCH 06/65] hisi_acc_vfio_pci: Add .match_token_uuid callback in hisi_acc_vfio_pci_migrn_ops The commit, <86624ba3b522> ("vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD") accidentally ignored including the .match_token_uuid callback in the hisi_acc_vfio_pci_migrn_ops struct. Introduce the missed callback here. Fixes: 86624ba3b522 ("vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD") Cc: stable@vger.kernel.org Suggested-by: Longfang Liu Signed-off-by: Raghavendra Rao Ananta Reviewed-by: Longfang Liu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20251031170603.2260022-3-rananta@google.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 498cb7d1c9e5..fe2ffcd00d6e 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1620,6 +1620,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, From 113557b0406818a8a5df3479b0a89125d2b2a04c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:17 -0400 Subject: [PATCH 07/65] vfio: Provide a get_region_info op Instead of hooking the general ioctl op, have the core code directly decode VFIO_DEVICE_GET_REGION_INFO and call an op just for it. This is intended to allow mechanical changes to the drivers to pull their VFIO_DEVICE_GET_REGION_INFO int oa function. Later patches will improve the function signature to consolidate more code. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 9 ++++++--- drivers/vfio/vfio_main.c | 7 +++++++ include/linux/vfio.h | 2 ++ include/linux/vfio_pci_core.h | 2 ++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..1dc350003f07 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -996,9 +996,11 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - struct vfio_region_info __user *arg) +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; @@ -1132,6 +1134,7 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, struct vfio_irq_info __user *arg) @@ -1458,7 +1461,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(vdev, uarg); + return vfio_pci_ioctl_get_region_info(core_vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60..a390163ce706 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1296,7 +1296,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_device_feature(device, uptr); break; + case VFIO_DEVICE_GET_REGION_INFO: + if (!device->ops->get_region_info) + goto ioctl_fallback; + ret = device->ops->get_region_info(device, uptr); + break; + default: +ioctl_fallback: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; else diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee..be5fcf8432e8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -132,6 +132,8 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); + int (*get_region_info)(struct vfio_device *vdev, + struct vfio_region_info __user *arg); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..160bc2e31ece 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -115,6 +115,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, From e238f147d517681e5dcb3dc9ab149211d66b00a1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:18 -0400 Subject: [PATCH 08/65] vfio/hisi: Convert to the get_region_info op Change the function signature of hisi_acc_vfio_pci_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 45 +++++++++---------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fe2ffcd00d6e..30c438595eda 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1385,37 +1385,33 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, return vfio_pci_core_read(core_vdev, buf, new_count, ppos); } -static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) { - if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct vfio_region_info info; - unsigned long minsz; + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct vfio_region_info info; + unsigned long minsz; - minsz = offsetofend(struct vfio_region_info, offset); + minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + if (info.index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, arg); - info.size = hisi_acc_get_resource_len(vdev, info.index); + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP; + info.size = hisi_acc_get_resource_len(vdev, info.index); - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } - } - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1613,7 +1609,8 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, - .ioctl = hisi_acc_vfio_pci_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, From c044eefa47864fb436254cb330e8d90cb6a3a870 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:19 -0400 Subject: [PATCH 09/65] vfio/virtio: Convert to the get_region_info op Remove virtiovf_vfio_pci_core_ioctl() and change the signature of virtiovf_pci_ioctl_get_region_info(). Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/virtio/common.h | 4 +--- drivers/vfio/pci/virtio/legacy_io.c | 20 ++++---------------- drivers/vfio/pci/virtio/main.c | 3 ++- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index c7d7e27af386..a10f2d92cb62 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -109,10 +109,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); + struct vfio_region_info __user *arg); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index 832af5ba267c..d735d5c4bd77 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,15 +281,14 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) + struct vfio_region_info __user *arg) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); - void __user *uarg = (void __user *)arg; struct vfio_region_info info = {}; - if (copy_from_user(&info, uarg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -301,20 +300,9 @@ int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, info.size = virtvdev->bar0_virtual_buf_size; info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, arg); } } diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 8084f3e36a9f..92b525e52abe 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -108,7 +108,8 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .release = virtiovf_pci_core_release_dev, .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, - .ioctl = virtiovf_vfio_pci_core_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, From 5ac7206474777dff56f79f1d6bc9973e988f7587 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:20 -0400 Subject: [PATCH 10/65] vfio/nvgrace: Convert to the get_region_info op Change the signature of nvgrace_gpu_ioctl_get_region_info() Reviewed-by: Kevin Tian Reviewed-by: Ankit Agrawal Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e346392b72f6..d3a5253473e0 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -205,9 +205,9 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static long +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned long arg) + struct vfio_region_info __user *arg) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, @@ -220,7 +220,7 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, u32 size; int ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -232,8 +232,7 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, */ memregion = nvgrace_gpu_memregion(info.index, nvdev); if (!memregion) - return vfio_pci_core_ioctl(core_vdev, - VFIO_DEVICE_GET_REGION_INFO, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, arg); size = struct_size(sparse, areas, 1); @@ -285,16 +284,13 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, } kfree(caps.buf); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); case VFIO_DEVICE_IOEVENTFD: return -ENOTTY; case VFIO_DEVICE_RESET: @@ -690,6 +686,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, + .get_region_info = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, From f3fddb71dd50ed31ae474238551e8623a1bc16db Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:21 -0400 Subject: [PATCH 11/65] vfio/pci: Fill in the missing get_region_info ops Now that every variant driver provides a get_region_info op remove the ioctl based dispatch from vfio_pci_core_ioctl(). Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + drivers/vfio/pci/mlx5/main.c | 1 + drivers/vfio/pci/nvgrace-gpu/main.c | 1 + drivers/vfio/pci/pds/vfio_dev.c | 1 + drivers/vfio/pci/qat/main.c | 1 + drivers/vfio/pci/vfio_pci.c | 1 + drivers/vfio/pci/vfio_pci_core.c | 2 -- drivers/vfio/pci/virtio/main.c | 2 ++ 8 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 30c438595eda..1fd3a2075ee4 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1631,6 +1631,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ec47e736a8e..b7f941f8047e 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index d3a5253473e0..cab743a30dc3 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -707,6 +707,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f3ccb0008f67..1946bc75d99b 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,6 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index a19b68043eb2..8452d9c1d11d 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,6 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2..2d9122efc10b 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,6 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1dc350003f07..f21d9026068c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1460,8 +1460,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(core_vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 92b525e52abe..d68096bc5252 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,6 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -131,6 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, From 0787755271096e6c48019f44aea6ccc33f93bf41 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:22 -0400 Subject: [PATCH 12/65] vfio/mtty: Provide a get_region_info op Move it out of mtty_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 53 ++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 6cb3e5974990..05a65e627679 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1785,6 +1785,34 @@ static int mtty_get_device_info(struct vfio_device_info *dev_info) return 0; } +static int mtty_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info __user *arg) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct vfio_region_info info; + void *cap_type = NULL; + u16 cap_type_id = 0; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); + if (ret) + return ret; + + if (copy_to_user(arg, &info, minsz)) + return -EFAULT; + return 0; +} + static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -1817,30 +1845,6 @@ static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, return 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - u16 cap_type_id = 0; - void *cap_type = NULL; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, - &cap_type); - if (ret) - return ret; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { @@ -1949,6 +1953,7 @@ static const struct vfio_device_ops mtty_dev_ops = { .read = mtty_read, .write = mtty_write, .ioctl = mtty_ioctl, + .get_region_info = mtty_ioctl_get_region_info, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, From cf16acc0af09a12d46e972f42186d94931325ea8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:23 -0400 Subject: [PATCH 13/65] vfio/mdpy: Provide a get_region_info op Move it out of mdpy_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mdpy.c | 53 ++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 8104831ae125..0c65ed221738 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -512,6 +512,34 @@ static int mdpy_query_gfx_plane(struct mdev_state *mdev_state, return 0; } +static int mdpy_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info __user *arg) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct vfio_region_info info; + void *cap_type = NULL; + u16 cap_type_id = 0; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); + if (ret) + return ret; + + if (copy_to_user(arg, &info, minsz)) + return -EFAULT; + return 0; +} + static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -544,30 +572,6 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, return 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - u16 cap_type_id = 0; - void *cap_type = NULL; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, - &cap_type); - if (ret) - return ret; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { @@ -665,6 +669,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { .read = mdpy_read, .write = mdpy_write, .ioctl = mdpy_ioctl, + .get_region_info = mdpy_ioctl_get_region_info, .mmap = mdpy_mmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From 8339fccda83772ff1b035f848d1f992410c61c57 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:24 -0400 Subject: [PATCH 14/65] vfio/mbochs: Provide a get_region_info op Move it out of mbochs_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mbochs.c | 56 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 18623ba666e3..7f889b31fa2c 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1185,13 +1185,42 @@ static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id) return dma_buf_fd(dmabuf->buf, 0); } +static int mbochs_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info __user *arg) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct vfio_region_info_ext info; + unsigned long minsz, outsz; + int ret; + + minsz = offsetofend(typeof(info), base.offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + outsz = info.base.argsz; + if (outsz < minsz) + return -EINVAL; + if (outsz > sizeof(info)) + return -EINVAL; + + ret = mbochs_get_region_info(mdev_state, &info); + if (ret) + return ret; + + if (copy_to_user(arg, &info, outsz)) + return -EFAULT; + return 0; +} + static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); int ret = 0; - unsigned long minsz, outsz; + unsigned long minsz; switch (cmd) { case VFIO_DEVICE_GET_INFO: @@ -1215,30 +1244,6 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, return 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info_ext info; - - minsz = offsetofend(typeof(info), base.offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - outsz = info.base.argsz; - if (outsz < minsz) - return -EINVAL; - if (outsz > sizeof(info)) - return -EINVAL; - - ret = mbochs_get_region_info(mdev_state, &info); - if (ret) - return ret; - - if (copy_to_user((void __user *)arg, &info, outsz)) - return -EFAULT; - - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { @@ -1376,6 +1381,7 @@ static const struct vfio_device_ops mbochs_dev_ops = { .read = mbochs_read, .write = mbochs_write, .ioctl = mbochs_ioctl, + .get_region_info = mbochs_ioctl_get_region_info, .mmap = mbochs_mmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From d4635df279f57de1dd301b864724a930551028b2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:25 -0400 Subject: [PATCH 15/65] vfio/platform: Provide a get_region_info op Move it out of vfio_platform_ioctl() and re-indent it. Add it to all platform drivers. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Mostafa Saleh Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_amba.c | 1 + drivers/vfio/platform/vfio_platform.c | 1 + drivers/vfio/platform/vfio_platform_common.c | 50 +++++++++++-------- drivers/vfio/platform/vfio_platform_private.h | 2 + 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 9f5c527baa8a..d600deaf23b6 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -115,6 +115,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 512533501eb7..0e85c914b651 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,6 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 3bf1043cd795..3ebd50fb78fb 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -272,6 +272,34 @@ int vfio_platform_open_device(struct vfio_device *core_vdev) } EXPORT_SYMBOL_GPL(vfio_platform_open_device); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + struct vfio_region_info info; + unsigned long minsz; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= vdev->num_regions) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); + long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -300,28 +328,6 @@ long vfio_platform_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 8d8fab516849..a6008320e77b 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -85,6 +85,8 @@ int vfio_platform_open_device(struct vfio_device *core_vdev); void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); From 6cdae5d0c326f0ec8f0f50571df0374f2f0b8815 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:26 -0400 Subject: [PATCH 16/65] vfio/fsl: Provide a get_region_info op Move it out of vfio_fsl_mc_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/10-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 56 ++++++++++++++++++------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 76ccbab0e3d6..d38e51a57f07 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,6 +117,37 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } +static int +vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct fsl_mc_device *mc_dev = vdev->mc_dev; + struct vfio_region_info info; + unsigned long minsz; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= mc_dev->obj_desc.region_count) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + if (copy_to_user(arg, &info, minsz)) + return -EFAULT; + return 0; +} + static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -149,30 +180,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -589,6 +596,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, + .get_region_info = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, From b9827eff6b4aab8203a1aa720ae3a6d3731f1a4d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:27 -0400 Subject: [PATCH 17/65] vfio/cdx: Provide a get_region_info op Change the signature of vfio_cdx_ioctl_get_region_info() and hook it to the op. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 5dd5f5ad7686..506d849139d3 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -129,9 +129,11 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, +static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, struct vfio_region_info __user *arg) { + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); struct vfio_region_info info; @@ -219,8 +221,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, switch (cmd) { case VFIO_DEVICE_GET_INFO: return vfio_cdx_ioctl_get_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_cdx_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_GET_IRQ_INFO: return vfio_cdx_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_SET_IRQS: @@ -284,6 +284,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .get_region_info = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, From 61b3f7b5a72905e45f5d7769472edea8d80b56ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:28 -0400 Subject: [PATCH 18/65] vfio/ccw: Provide a get_region_info op Move it out of vfio_ccw_mdev_ioctl() and re-indent it. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/12-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_ops.c | 44 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index ea532a8a4a0c..6d46e0bc76df 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -504,6 +504,31 @@ void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private) private->region = NULL; } +static int +vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info __user *arg) +{ + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); + struct vfio_region_info info; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = vfio_ccw_mdev_get_region_info(private, &info, arg); + if (ret) + return ret; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} + static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) @@ -532,24 +557,6 @@ static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = vfio_ccw_mdev_get_region_info(private, &info, arg); - if (ret) - return ret; - - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -627,6 +634,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .read = vfio_ccw_mdev_read, .write = vfio_ccw_mdev_write, .ioctl = vfio_ccw_mdev_ioctl, + .get_region_info = vfio_ccw_mdev_ioctl_get_region_info, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, From e664067b603570164794facf06686434a12cb74d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:29 -0400 Subject: [PATCH 19/65] vfio/gvt: Provide a get_region_info op Move it out of intel_vgpu_ioctl() and re-indent it. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/13-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 301 +++++++++++++++---------------- 1 file changed, 150 insertions(+), 151 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 69830a5c49d3..1feb2a28ca5f 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1140,6 +1140,155 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags, return func(vgpu, index, start, count, flags, data); } +static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, + struct vfio_region_info __user *arg) +{ + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + struct vfio_region_info info; + unsigned long minsz; + int nr_areas = 1; + int cap_type_id; + unsigned int i; + int ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vgpu->gvt->device_info.cfg_space_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vgpu->cfg_space.bar[info.index].size; + if (!info.size) { + info.flags = 0; + break; + } + + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + break; + case VFIO_PCI_BAR2_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = VFIO_REGION_INFO_FLAG_CAPS | + VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + info.size = gvt_aperture_sz(vgpu->gvt); + + sparse = kzalloc(struct_size(sparse, areas, nr_areas), + GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->areas[0].offset = + PAGE_ALIGN(vgpu_aperture_offset(vgpu)); + sparse->areas[0].size = vgpu_aperture_sz(vgpu); + break; + + case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + + gvt_dbg_core("get region info bar:%d\n", info.index); + break; + + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + + gvt_dbg_core("get region info index:%d\n", info.index); + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 + }; + + if (info.index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) + return -EINVAL; + info.index = array_index_nospec( + info.index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); + + i = info.index - VFIO_PCI_NUM_REGIONS; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vgpu->region[i].size; + info.flags = vgpu->region[i].flags; + + cap_type.type = vgpu->region[i].type; + cap_type.subtype = vgpu->region[i].subtype; + + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + } + } + + if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + switch (cap_type_id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + ret = vfio_info_add_capability( + &caps, &sparse->header, + struct_size(sparse, areas, sparse->nr_areas)); + if (ret) { + kfree(sparse); + return ret; + } + break; + default: + kfree(sparse); + return -EINVAL; + } + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + kfree(sparse); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + + kfree(sparse); + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} + static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, unsigned long arg) { @@ -1168,157 +1317,6 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned int i; - int ret; - struct vfio_region_info_cap_sparse_mmap *sparse = NULL; - int nr_areas = 1; - int cap_type_id; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->gvt->device_info.cfg_space_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->cfg_space.bar[info.index].size; - if (!info.size) { - info.flags = 0; - break; - } - - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR1_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - break; - case VFIO_PCI_BAR2_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_CAPS | - VFIO_REGION_INFO_FLAG_MMAP | - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - info.size = gvt_aperture_sz(vgpu->gvt); - - sparse = kzalloc(struct_size(sparse, areas, nr_areas), - GFP_KERNEL); - if (!sparse) - return -ENOMEM; - - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->nr_areas = nr_areas; - cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->areas[0].offset = - PAGE_ALIGN(vgpu_aperture_offset(vgpu)); - sparse->areas[0].size = vgpu_aperture_sz(vgpu); - break; - - case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - - gvt_dbg_core("get region info bar:%d\n", info.index); - break; - - case VFIO_PCI_ROM_REGION_INDEX: - case VFIO_PCI_VGA_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - - gvt_dbg_core("get region info index:%d\n", info.index); - break; - default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; - - if (info.index >= VFIO_PCI_NUM_REGIONS + - vgpu->num_regions) - return -EINVAL; - info.index = - array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vgpu->num_regions); - - i = info.index - VFIO_PCI_NUM_REGIONS; - - info.offset = - VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->region[i].size; - info.flags = vgpu->region[i].flags; - - cap_type.type = vgpu->region[i].type; - cap_type.subtype = vgpu->region[i].subtype; - - ret = vfio_info_add_capability(&caps, - &cap_type.header, - sizeof(cap_type)); - if (ret) - return ret; - } - } - - if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { - switch (cap_type_id) { - case VFIO_REGION_INFO_CAP_SPARSE_MMAP: - ret = vfio_info_add_capability(&caps, - &sparse->header, - struct_size(sparse, areas, - sparse->nr_areas)); - if (ret) { - kfree(sparse); - return ret; - } - break; - default: - kfree(sparse); - return -EINVAL; - } - } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - kfree(sparse); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - - kfree(caps.buf); - } - - kfree(sparse); - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -1475,6 +1473,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .write = intel_vgpu_write, .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, + .get_region_info = intel_vgpu_ioctl_get_region_info, .dma_unmap = intel_vgpu_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From f97859503859fd217d27fc4496f294955e197eaf Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:30 -0400 Subject: [PATCH 20/65] vfio: Require drivers to implement get_region_info Remove the fallback through the ioctl callback, no drivers use this now. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/14-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index a390163ce706..f056e82ba350 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1297,13 +1297,13 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, break; case VFIO_DEVICE_GET_REGION_INFO: - if (!device->ops->get_region_info) - goto ioctl_fallback; - ret = device->ops->get_region_info(device, uptr); + if (unlikely(!device->ops->get_region_info)) + ret = -EINVAL; + else + ret = device->ops->get_region_info(device, uptr); break; default: -ioctl_fallback: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; else From 775f726a742a60d8d0ed2b4733a5b6a796d9d1dd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:31 -0400 Subject: [PATCH 21/65] vfio: Add get_region_info_caps op This op does the copy to/from user for the info and can return back a cap chain through a vfio_info_cap * result. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 56 +++++++++++++++++++++++++++++++++++++--- include/linux/vfio.h | 4 +++ 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f056e82ba350..48d034aede46 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1259,6 +1259,57 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, } } +static long vfio_get_region_info(struct vfio_device *device, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info info = {}; + struct vfio_info_cap caps = {}; + int ret; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + if (device->ops->get_region_info_caps) { + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, + caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } + + if (copy_to_user(arg, &info, minsz)) { + ret = -EFAULT; + goto out_free; + } + } else if (device->ops->get_region_info) { + ret = device->ops->get_region_info(device, arg); + if (ret) + return ret; + } else { + return -EINVAL; + } + +out_free: + kfree(caps.buf); + return ret; +} + static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -1297,10 +1348,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, break; case VFIO_DEVICE_GET_REGION_INFO: - if (unlikely(!device->ops->get_region_info)) - ret = -EINVAL; - else - ret = device->ops->get_region_info(device, uptr); + ret = vfio_get_region_info(device, uptr); break; default: diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be5fcf8432e8..6311ddc83770 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -21,6 +21,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; struct iommufd_access; +struct vfio_info_cap; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -134,6 +135,9 @@ struct vfio_device_ops { unsigned long arg); int (*get_region_info)(struct vfio_device *vdev, struct vfio_region_info __user *arg); + int (*get_region_info_caps)(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); From 45f9fa18109d9dece3604bbbc810d526e424b536 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:32 -0400 Subject: [PATCH 22/65] vfio/mbochs: Convert mbochs to use vfio_info_add_capability() This driver open codes the cap chain manipulations. Instead use vfio_info_add_capability() and the get_region_info_caps() op. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/16-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mbochs.c | 75 ++++++++++++-------------------------- 1 file changed, 23 insertions(+), 52 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 7f889b31fa2c..64ea19253ee3 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -143,11 +143,6 @@ static struct mdev_parent mbochs_parent; static atomic_t mbochs_avail_mbytes; static const struct vfio_device_ops mbochs_dev_ops; -struct vfio_region_info_ext { - struct vfio_region_info base; - struct vfio_region_info_cap_type type; -}; - struct mbochs_mode { u32 drm_format; u32 bytepp; @@ -1033,10 +1028,12 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) return 0; } -static int mbochs_get_region_info(struct mdev_state *mdev_state, - struct vfio_region_info_ext *ext) +static int mbochs_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) { - struct vfio_region_info *region_info = &ext->base; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (region_info->index >= MBOCHS_NUM_REGIONS) return -EINVAL; @@ -1061,20 +1058,23 @@ static int mbochs_get_region_info(struct mdev_state *mdev_state, region_info->flags = (VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE); break; - case MBOCHS_EDID_REGION_INDEX: - ext->base.argsz = sizeof(*ext); - ext->base.offset = MBOCHS_EDID_OFFSET; - ext->base.size = MBOCHS_EDID_SIZE; - ext->base.flags = (VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_CAPS); - ext->base.cap_offset = offsetof(typeof(*ext), type); - ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE; - ext->type.header.version = 1; - ext->type.header.next = 0; - ext->type.type = VFIO_REGION_TYPE_GFX; - ext->type.subtype = VFIO_REGION_SUBTYPE_GFX_EDID; - break; + case MBOCHS_EDID_REGION_INDEX: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1, + .type = VFIO_REGION_TYPE_GFX, + .subtype = VFIO_REGION_SUBTYPE_GFX_EDID, + }; + + region_info->offset = MBOCHS_EDID_OFFSET; + region_info->size = MBOCHS_EDID_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_CAPS); + + return vfio_info_add_capability(caps, &cap_type.header, + sizeof(cap_type)); + } default: region_info->size = 0; region_info->offset = 0; @@ -1185,35 +1185,6 @@ static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id) return dma_buf_fd(dmabuf->buf, 0); } -static int mbochs_ioctl_get_region_info(struct vfio_device *vdev, - struct vfio_region_info __user *arg) -{ - struct mdev_state *mdev_state = - container_of(vdev, struct mdev_state, vdev); - struct vfio_region_info_ext info; - unsigned long minsz, outsz; - int ret; - - minsz = offsetofend(typeof(info), base.offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - outsz = info.base.argsz; - if (outsz < minsz) - return -EINVAL; - if (outsz > sizeof(info)) - return -EINVAL; - - ret = mbochs_get_region_info(mdev_state, &info); - if (ret) - return ret; - - if (copy_to_user(arg, &info, outsz)) - return -EFAULT; - return 0; -} - static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -1381,7 +1352,7 @@ static const struct vfio_device_ops mbochs_dev_ops = { .read = mbochs_read, .write = mbochs_write, .ioctl = mbochs_ioctl, - .get_region_info = mbochs_ioctl_get_region_info, + .get_region_info_caps = mbochs_ioctl_get_region_info, .mmap = mbochs_mmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From 93165757c023940dc5e32a3b64a750fb6767bd8a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:33 -0400 Subject: [PATCH 23/65] vfio/gvt: Convert to get_region_info_caps Remove the duplicate code and change info to a pointer. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/17-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 113 ++++++++++++------------------- 1 file changed, 42 insertions(+), 71 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 1feb2a28ca5f..96d23717684f 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1141,56 +1141,46 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags, } static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); - struct vfio_region_info info; - unsigned long minsz; int nr_areas = 1; int cap_type_id; unsigned int i; int ret; - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->gvt->device_info.cfg_space_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->gvt->device_info.cfg_space_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->cfg_space.bar[info.index].size; - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->cfg_space.bar[info->index].size; + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR1_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; break; case VFIO_PCI_BAR2_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_CAPS | - VFIO_REGION_INFO_FLAG_MMAP | - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - info.size = gvt_aperture_sz(vgpu->gvt); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = VFIO_REGION_INFO_FLAG_CAPS | + VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + info->size = gvt_aperture_sz(vgpu->gvt); sparse = kzalloc(struct_size(sparse, areas, nr_areas), GFP_KERNEL); @@ -1207,20 +1197,20 @@ static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, break; case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; - gvt_dbg_core("get region info bar:%d\n", info.index); + gvt_dbg_core("get region info bar:%d\n", info->index); break; case VFIO_PCI_ROM_REGION_INDEX: case VFIO_PCI_VGA_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; - gvt_dbg_core("get region info index:%d\n", info.index); + gvt_dbg_core("get region info index:%d\n", info->index); break; default: { struct vfio_region_info_cap_type cap_type = { @@ -1228,32 +1218,32 @@ static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->region[i].size; - info.flags = vgpu->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->region[i].size; + info->flags = vgpu->region[i].flags; cap_type.type = vgpu->region[i].type; cap_type.subtype = vgpu->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; } } - if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { switch (cap_type_id) { case VFIO_REGION_INFO_CAP_SPARSE_MMAP: ret = vfio_info_add_capability( - &caps, &sparse->header, + caps, &sparse->header, struct_size(sparse, areas, sparse->nr_areas)); if (ret) { kfree(sparse); @@ -1266,27 +1256,8 @@ static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, } } - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + sizeof(info), - caps.buf, caps.size)) { - kfree(caps.buf); - kfree(sparse); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - - kfree(caps.buf); - } - kfree(sparse); - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, @@ -1473,7 +1444,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .write = intel_vgpu_write, .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, - .get_region_info = intel_vgpu_ioctl_get_region_info, + .get_region_info_caps = intel_vgpu_ioctl_get_region_info, .dma_unmap = intel_vgpu_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From 973af0c40eaf5cf201d22f1cf6b50587d0138703 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:34 -0400 Subject: [PATCH 24/65] vfio/ccw: Convert to get_region_info_caps Remove the duplicate code and flatten the call chain. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/18-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_ops.c | 55 +++++---------------------------- 1 file changed, 7 insertions(+), 48 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 6d46e0bc76df..a596f6013019 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -313,10 +313,12 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_ccw_private *private, return 0; } -static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, - struct vfio_region_info *info, - unsigned long arg) +static int vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); int i; switch (info->index) { @@ -328,7 +330,6 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, return 0; default: /* all other regions are handled via capability chain */ { - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_type cap_type = { .header.id = VFIO_REGION_INFO_CAP_TYPE, .header.version = 1 }; @@ -351,27 +352,10 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, cap_type.type = private->region[i].type; cap_type.subtype = private->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; - - info->flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info->argsz < sizeof(*info) + caps.size) { - info->argsz = sizeof(*info) + caps.size; - info->cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(*info)); - if (copy_to_user((void __user *)arg + sizeof(*info), - caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info->cap_offset = sizeof(*info); - } - - kfree(caps.buf); - } } return 0; @@ -504,31 +488,6 @@ void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private) private->region = NULL; } -static int -vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, - struct vfio_region_info __user *arg) -{ - struct vfio_ccw_private *private = - container_of(vdev, struct vfio_ccw_private, vdev); - struct vfio_region_info info; - unsigned long minsz; - int ret; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = vfio_ccw_mdev_get_region_info(private, &info, arg); - if (ret) - return ret; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; -} - static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) @@ -634,7 +593,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .read = vfio_ccw_mdev_read, .write = vfio_ccw_mdev_write, .ioctl = vfio_ccw_mdev_ioctl, - .get_region_info = vfio_ccw_mdev_ioctl_get_region_info, + .get_region_info_caps = vfio_ccw_mdev_ioctl_get_region_info, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, From 1b0ecb5baf4af3baa8627144bbcf9848806aa5f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:35 -0400 Subject: [PATCH 25/65] vfio/pci: Convert all PCI drivers to get_region_info_caps Since the core function signature changes it has to flow up to all drivers. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Brett Creeley Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/19-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 30 ++--- drivers/vfio/pci/mlx5/main.c | 2 +- drivers/vfio/pci/nvgrace-gpu/main.c | 51 ++------- drivers/vfio/pci/pds/vfio_dev.c | 2 +- drivers/vfio/pci/qat/main.c | 2 +- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 103 +++++++----------- drivers/vfio/pci/virtio/common.h | 3 +- drivers/vfio/pci/virtio/legacy_io.c | 26 ++--- drivers/vfio/pci/virtio/main.c | 6 +- include/linux/vfio_pci_core.h | 3 +- 11 files changed, 80 insertions(+), 150 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 1fd3a2075ee4..cf45f6370c36 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1386,32 +1386,22 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, } static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct vfio_region_info info; - unsigned long minsz; - minsz = offsetofend(struct vfio_region_info, offset); + if (info->index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); - if (info.argsz < minsz) - return -EINVAL; + info->size = hisi_acc_get_resource_len(vdev, info->index); - if (info.index != VFIO_PCI_BAR2_REGION_INDEX) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); - - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - - info.size = hisi_acc_get_resource_len(vdev, info.index); - - info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1610,7 +1600,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = hisi_acc_vfio_ioctl_get_region, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, @@ -1631,7 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index b7f941f8047e..9c5970411d07 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,7 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index cab743a30dc3..5a6f77d5c81e 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -205,34 +205,25 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static int -nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse; - struct vfio_region_info info; struct mem_region *memregion; u32 size; int ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - /* * Request to determine the BAR region information. Send the * GPU memory information. */ - memregion = nvgrace_gpu_memregion(info.index, nvdev); + memregion = nvgrace_gpu_memregion(info->index, nvdev); if (!memregion) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); size = struct_size(sparse, areas, 1); @@ -251,40 +242,22 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; sparse->header.version = 1; - ret = vfio_info_add_capability(&caps, &sparse->header, size); + ret = vfio_info_add_capability(caps, &sparse->header, size); kfree(sparse); if (ret) return ret; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); /* * The region memory size may not be power-of-2 aligned. * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. */ - info.size = memregion->bar_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | + info->size = memregion->bar_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - kfree(caps.buf); - } - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, @@ -686,7 +659,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, - .get_region_info = nvgrace_gpu_ioctl_get_region_info, + .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, @@ -707,7 +680,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index 1946bc75d99b..be103c74e969 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,7 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 8452d9c1d11d..8fbdf7c6d666 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,7 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d9122efc10b..a3e49d42c771 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,7 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f21d9026068c..57c0766fb9f8 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -997,43 +997,35 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, } int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info->index]) { + info->flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info->index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, caps); if (ret) return ret; } @@ -1045,9 +1037,9 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, size_t size; u16 cmd; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - info.size = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = 0; + info->size = 0; if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { /* @@ -1057,16 +1049,17 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, cmd = vfio_pci_memory_lock_and_enable(vdev); io = pci_map_rom(pdev, &size); if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report the BAR size, not the ROM size. */ - info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + info->size = pci_resource_len(pdev, + PCI_ROM_RESOURCE); pci_unmap_rom(pdev, io); } vfio_pci_memory_unlock_and_restore(vdev, cmd); } else if (pdev->rom && pdev->romlen) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report BAR size as power of two. */ - info.size = roundup_pow_of_two(pdev->romlen); + info->size = roundup_pow_of_two(pdev->romlen); } break; @@ -1075,10 +1068,10 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, if (!vdev->has_vga) return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0xc0000; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; default: { @@ -1087,52 +1080,34 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vdev->region[i].size; + info->flags = vdev->region[i].flags; cap_type.type = vdev->region[i].type; cap_type.subtype = vdev->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; if (vdev->region[i].ops->add_capability) { ret = vdev->region[i].ops->add_capability( - vdev, &vdev->region[i], &caps); + vdev, &vdev->region[i], caps); if (ret) return ret; } } } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(*arg); - } - - kfree(caps.buf); - } - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index a10f2d92cb62..cb3d5e57d3a3 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -110,7 +110,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index d735d5c4bd77..1ed349a55629 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,29 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_region_info info = {}; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR0_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; - default: - return vfio_pci_ioctl_get_region_info(core_vdev, arg); - } + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = virtvdev->bar0_virtual_buf_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + return 0; } static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index d68096bc5252..d2e5cbca13c8 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,7 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -110,7 +110,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = virtiovf_pci_ioctl_get_region_info, + .get_region_info_caps = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, @@ -132,7 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 160bc2e31ece..e74f94c17fbe 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -116,7 +116,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, From 182c62861ba5a2301c7178484d2f424aaae4283b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:36 -0400 Subject: [PATCH 26/65] vfio/platform: Convert to get_region_info_caps Remove the duplicate code and change info to a pointer. caps are not used. Reviewed-by: Kevin Tian Reviewed-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_amba.c | 2 +- drivers/vfio/platform/vfio_platform.c | 2 +- drivers/vfio/platform/vfio_platform_common.c | 24 ++++++------------- drivers/vfio/platform/vfio_platform_private.h | 3 ++- 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index d600deaf23b6..fa754f203b2d 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -115,7 +115,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, - .get_region_info = vfio_platform_ioctl_get_region_info, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 0e85c914b651..a4d3ace3e02d 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,7 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, - .get_region_info = vfio_platform_ioctl_get_region_info, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 3ebd50fb78fb..c2990b7e900f 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -273,30 +273,20 @@ int vfio_platform_open_device(struct vfio_device *core_vdev) EXPORT_SYMBOL_GPL(vfio_platform_open_device); int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); - struct vfio_region_info info; - unsigned long minsz; - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) + if (info->index >= vdev->num_regions) return -EINVAL; /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + info->offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index a6008320e77b..05084212a76e 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -86,7 +86,8 @@ void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); From dc10734610e29bb6464498e933fc0d622227153d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:37 -0400 Subject: [PATCH 27/65] vfio: Move the remaining drivers to get_region_info_caps Remove the duplicate code and change info to a pointer. caps are not used. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/21-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/main.c | 24 +++++++------------ drivers/vfio/fsl-mc/vfio_fsl_mc.c | 29 +++++++---------------- samples/vfio-mdev/mdpy.c | 39 ++++++------------------------- samples/vfio-mdev/mtty.c | 38 +++++------------------------- 4 files changed, 29 insertions(+), 101 deletions(-) diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 506d849139d3..253031b86b60 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -130,29 +130,21 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, } static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_cdx_device *vdev = container_of(core_vdev, struct vfio_cdx_device, vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); - struct vfio_region_info info; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= cdx_dev->res_count) + if (info->index >= cdx_dev->res_count) return -EINVAL; /* map offset to the physical address */ - info.offset = vfio_cdx_index_to_offset(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + info->offset = vfio_cdx_index_to_offset(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, @@ -284,7 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, - .get_region_info = vfio_cdx_ioctl_get_region_info, + .get_region_info_caps = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index d38e51a57f07..ba47100f28c1 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,34 +117,21 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } -static int -vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) +static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_fsl_mc_device *vdev = container_of(core_vdev, struct vfio_fsl_mc_device, vdev); struct fsl_mc_device *mc_dev = vdev->mc_dev; - struct vfio_region_info info; - unsigned long minsz; - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) + if (info->index >= mc_dev->obj_desc.region_count) return -EINVAL; /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user(arg, &info, minsz)) - return -EFAULT; + info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; return 0; } @@ -596,7 +583,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, - .get_region_info = vfio_fsl_mc_ioctl_get_region_info, + .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 0c65ed221738..0759bd68edca 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -435,10 +435,13 @@ static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) return remap_vmalloc_range(vma, mdev_state->memblk, 0); } -static int mdpy_get_region_info(struct mdev_state *mdev_state, - struct vfio_region_info *region_info, - u16 *cap_type_id, void **cap_type) +static int mdpy_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + if (region_info->index >= VFIO_PCI_NUM_REGIONS && region_info->index != MDPY_DISPLAY_REGION) return -EINVAL; @@ -512,34 +515,6 @@ static int mdpy_query_gfx_plane(struct mdev_state *mdev_state, return 0; } -static int mdpy_ioctl_get_region_info(struct vfio_device *vdev, - struct vfio_region_info __user *arg) -{ - struct mdev_state *mdev_state = - container_of(vdev, struct mdev_state, vdev); - struct vfio_region_info info; - void *cap_type = NULL; - u16 cap_type_id = 0; - unsigned long minsz; - int ret; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); - if (ret) - return ret; - - if (copy_to_user(arg, &info, minsz)) - return -EFAULT; - return 0; -} - static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -669,7 +644,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { .read = mdpy_read, .write = mdpy_write, .ioctl = mdpy_ioctl, - .get_region_info = mdpy_ioctl_get_region_info, + .get_region_info_caps = mdpy_ioctl_get_region_info, .mmap = mdpy_mmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 05a65e627679..bd92c38379b8 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1717,10 +1717,12 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, return ret; } -static int mtty_get_region_info(struct mdev_state *mdev_state, - struct vfio_region_info *region_info, - u16 *cap_type_id, void **cap_type) +static int mtty_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int size = 0; u32 bar_index; @@ -1785,34 +1787,6 @@ static int mtty_get_device_info(struct vfio_device_info *dev_info) return 0; } -static int mtty_ioctl_get_region_info(struct vfio_device *vdev, - struct vfio_region_info __user *arg) -{ - struct mdev_state *mdev_state = - container_of(vdev, struct mdev_state, vdev); - struct vfio_region_info info; - void *cap_type = NULL; - u16 cap_type_id = 0; - unsigned long minsz; - int ret; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); - if (ret) - return ret; - - if (copy_to_user(arg, &info, minsz)) - return -EFAULT; - return 0; -} - static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -1953,7 +1927,7 @@ static const struct vfio_device_ops mtty_dev_ops = { .read = mtty_read, .write = mtty_write, .ioctl = mtty_ioctl, - .get_region_info = mtty_ioctl_get_region_info, + .get_region_info_caps = mtty_ioctl_get_region_info, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, From 56c069307dfd0a5e39b685e0aeee6c40d1d7ddfc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:38 -0400 Subject: [PATCH 28/65] vfio: Remove the get_region_info op No driver uses it now, all are using get_region_info_caps(). Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/22-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 48 ++++++++++++++++++---------------------- include/linux/vfio.h | 2 -- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 48d034aede46..b8fe1a75e48a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1267,42 +1267,36 @@ static long vfio_get_region_info(struct vfio_device *device, struct vfio_info_cap caps = {}; int ret; + if (unlikely(!device->ops->get_region_info_caps)) + return -EINVAL; + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (device->ops->get_region_info_caps) { - ret = device->ops->get_region_info_caps(device, &info, &caps); - if (ret) - goto out_free; + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, - caps.size)) { - ret = -EFAULT; - goto out_free; - } - info.cap_offset = sizeof(info); + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + ret = -EFAULT; + goto out_free; } + info.cap_offset = sizeof(info); } + } - if (copy_to_user(arg, &info, minsz)) { - ret = -EFAULT; - goto out_free; - } - } else if (device->ops->get_region_info) { - ret = device->ops->get_region_info(device, arg); - if (ret) - return ret; - } else { - return -EINVAL; + if (copy_to_user(arg, &info, minsz)){ + ret = -EFAULT; + goto out_free; } out_free: diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6311ddc83770..8e1ddb48b9b5 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -133,8 +133,6 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*get_region_info)(struct vfio_device *vdev, - struct vfio_region_info __user *arg); int (*get_region_info_caps)(struct vfio_device *vdev, struct vfio_region_info *info, struct vfio_info_cap *caps); From f58ef9d1d1355b15443719df95081f193067ab88 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:20 +0200 Subject: [PATCH 29/65] PCI/P2PDMA: Separate the mmap() support from the core logic Currently the P2PDMA code requires a pgmap and a struct page to function. The was serving three important purposes: - DMA API compatibility, where scatterlist required a struct page as input - Life cycle management, the percpu_ref is used to prevent UAF during device hot unplug - A way to get the P2P provider data through the pci_p2pdma_pagemap The DMA API now has a new flow, and has gained phys_addr_t support, so it no longer needs struct pages to perform P2P mapping. Lifecycle management can be delegated to the user, DMABUF for instance has a suitable invalidation protocol that does not require struct page. Finding the P2P provider data can also be managed by the caller without need to look it up from the phys_addr. Split the P2PDMA code into two layers. The optional upper layer, effectively, provides a way to mmap() P2P memory into a VMA by providing struct page, pgmap, a genalloc and sysfs. The lower layer provides the actual P2P infrastructure and is wrapped up in a new struct p2pdma_provider. Rework the mmap layer to use new p2pdma_provider based APIs. Drivers that do not want to put P2P memory into VMA's can allocate a struct p2pdma_provider after probe() starts and free it before remove() completes. When DMA mapping the driver must convey the struct p2pdma_provider to the DMA mapping code along with a phys_addr of the MMIO BAR slice to map. The driver must ensure that no DMA mapping outlives the lifetime of the struct p2pdma_provider. The intended target of this new API layer is DMABUF. There is usually only a single p2pdma_provider for a DMABUF exporter. Most drivers can establish the p2pdma_provider during probe, access the single instance during DMABUF attach and use that to drive the DMA mapping. DMABUF provides an invalidation mechanism that can guarantee all DMA is halted and the DMA mappings are undone prior to destroying the struct p2pdma_provider. This ensures there is no UAF through DMABUFs that are lingering past driver removal. The new p2pdma_provider layer cannot be used to create P2P memory that can be mapped into VMA's, be used with pin_user_pages(), O_DIRECT, and so on. These use cases must still use the mmap() layer. The p2pdma_provider layer is principally for DMABUF-like use cases where DMABUF natively manages the life cycle and access instead of vmas/pin_user_pages()/struct page. In addition, remove the bus_off field from pci_p2pdma_map_state since it duplicates information already available in the pgmap structure. The bus_offset is only used in one location (pci_p2pdma_bus_addr_map) and is always identical to pgmap->bus_offset. Signed-off-by: Jason Gunthorpe Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-1-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/pci/p2pdma.c | 43 ++++++++++++++++++++------------------ include/linux/pci-p2pdma.h | 19 ++++++++++++----- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 78e108e47254..59cd6fb40e83 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -28,9 +28,8 @@ struct pci_p2pdma { }; struct pci_p2pdma_pagemap { - struct pci_dev *provider; - u64 bus_offset; struct dev_pagemap pgmap; + struct p2pdma_provider mem; }; static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) @@ -204,8 +203,8 @@ static void p2pdma_page_free(struct page *page) { struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ - struct pci_p2pdma *p2pdma = - rcu_dereference_protected(pgmap->provider->p2pdma, 1); + struct pci_p2pdma *p2pdma = rcu_dereference_protected( + to_pci_dev(pgmap->mem.owner)->p2pdma, 1); struct percpu_ref *ref; gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page), @@ -270,14 +269,15 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) static void pci_p2pdma_unmap_mappings(void *data) { - struct pci_dev *pdev = data; + struct pci_p2pdma_pagemap *p2p_pgmap = data; /* * Removing the alloc attribute from sysfs will call * unmap_mapping_range() on the inode, teardown any existing userspace * mappings and prevent new ones from being created. */ - sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr, + sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj, + &p2pmem_alloc_attr.attr, p2pmem_group.name); } @@ -328,10 +328,9 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->ops = &p2pdma_pgmap_ops; - - p2p_pgmap->provider = pdev; - p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - - pci_resource_start(pdev, bar); + p2p_pgmap->mem.owner = &pdev->dev; + p2p_pgmap->mem.bus_offset = + pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { @@ -340,7 +339,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, } error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, - pdev); + p2p_pgmap); if (error) goto pages_free; @@ -972,16 +971,16 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); -static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, - struct device *dev) +static enum pci_p2pdma_map_type +pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) { enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED; - struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider; + struct pci_dev *pdev = to_pci_dev(provider->owner); struct pci_dev *client; struct pci_p2pdma *p2pdma; int dist; - if (!provider->p2pdma) + if (!pdev->p2pdma) return PCI_P2PDMA_MAP_NOT_SUPPORTED; if (!dev_is_pci(dev)) @@ -990,7 +989,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, client = to_pci_dev(dev); rcu_read_lock(); - p2pdma = rcu_dereference(provider->p2pdma); + p2pdma = rcu_dereference(pdev->p2pdma); if (p2pdma) type = xa_to_value(xa_load(&p2pdma->map_types, @@ -998,7 +997,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, rcu_read_unlock(); if (type == PCI_P2PDMA_MAP_UNKNOWN) - return calc_map_type_and_dist(provider, client, &dist, true); + return calc_map_type_and_dist(pdev, client, &dist, true); return type; } @@ -1006,9 +1005,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { - state->pgmap = page_pgmap(page); - state->map = pci_p2pdma_map_type(state->pgmap, dev); - state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset; + struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page)); + + if (state->mem == &p2p_pgmap->mem) + return; + + state->mem = &p2p_pgmap->mem; + state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev); } /** diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 951f81a38f3a..1400f3ad4299 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -16,6 +16,16 @@ struct block_device; struct scatterlist; +/** + * struct p2pdma_provider + * + * A p2pdma provider is a range of MMIO address space available to the CPU. + */ +struct p2pdma_provider { + struct device *owner; + u64 bus_offset; +}; + #ifdef CONFIG_PCI_P2PDMA int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); @@ -139,11 +149,11 @@ enum pci_p2pdma_map_type { }; struct pci_p2pdma_map_state { - struct dev_pagemap *pgmap; + struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; - u64 bus_off; }; + /* helper for pci_p2pdma_state(), do not use directly */ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page); @@ -162,8 +172,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - if (state->pgmap != page_pgmap(page)) - __pci_p2pdma_update_state(state, dev, page); + __pci_p2pdma_update_state(state, dev, page); return state->map; } return PCI_P2PDMA_MAP_NONE; @@ -181,7 +190,7 @@ static inline dma_addr_t pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) { WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->bus_off; + return paddr + state->mem->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ From d4504262f745e48c1739c8b864f779b4b0f9de80 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:21 +0200 Subject: [PATCH 30/65] PCI/P2PDMA: Simplify bus address mapping API Update the pci_p2pdma_bus_addr_map() function to take a direct pointer to the p2pdma_provider structure instead of the pci_p2pdma_map_state. This simplifies the API by removing the need for callers to extract the provider from the state structure. The change updates all callers across the kernel (block layer, IOMMU, DMA direct, and HMM) to pass the provider pointer directly, making the code more explicit and reducing unnecessary indirection. This also removes the runtime warning check since callers now have direct control over which provider they use. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-2-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- block/blk-mq-dma.c | 2 +- drivers/iommu/dma-iommu.c | 4 ++-- include/linux/pci-p2pdma.h | 7 +++---- kernel/dma/direct.c | 4 ++-- mm/hmm.c | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 449950029872..a1b623744b2f 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -85,7 +85,7 @@ static inline bool blk_can_dma_map_iova(struct request *req, static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) { - iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr); + iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr); iter->len = vec->len; return true; } diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 7944a3af4545..e52d19d2e833 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, * as a bus address, __finalise_sg() will copy the dma * address into the output segment. */ - s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(s)); + s->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(s)); sg_dma_len(s) = sg->length; sg_dma_mark_bus_address(s); continue; diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 1400f3ad4299..9516ef97b17a 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -181,16 +181,15 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, /** * pci_p2pdma_bus_addr_map - Translate a physical address to a bus address * for a PCI_P2PDMA_MAP_BUS_ADDR transfer. - * @state: P2P state structure + * @provider: P2P provider structure * @paddr: physical address to map * * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. */ static inline dma_addr_t -pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) +pci_p2pdma_bus_addr_map(struct p2pdma_provider *provider, phys_addr_t paddr) { - WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->mem->bus_offset; + return paddr + provider->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1f9ee9759426..d8b3dfc598b2 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -479,8 +479,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, } break; case PCI_P2PDMA_MAP_BUS_ADDR: - sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(sg)); + sg->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(sg)); sg_dma_mark_bus_address(sg); continue; default: diff --git a/mm/hmm.c b/mm/hmm.c index 87562914670a..9bf0b831a029 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -811,7 +811,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, break; case PCI_P2PDMA_MAP_BUS_ADDR: pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; - return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + return pci_p2pdma_bus_addr_map(p2pdma_state->mem, paddr); default: return DMA_MAPPING_ERROR; } From 372d6d1b8ae3cdfe6b0638a0a848c6865ec94567 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:22 +0200 Subject: [PATCH 31/65] PCI/P2PDMA: Refactor to separate core P2P functionality from memory allocation Refactor the PCI P2PDMA subsystem to separate the core peer-to-peer DMA functionality from the optional memory allocation layer. This creates a two-tier architecture: The core layer provides P2P mapping functionality for physical addresses based on PCI device MMIO BARs and integrates with the DMA API for mapping operations. This layer is required for all P2PDMA users. The optional upper layer provides memory allocation capabilities including gen_pool allocator, struct page support, and sysfs interface for user space access. This separation allows subsystems like DMABUF to use only the core P2P mapping functionality without the overhead of memory allocation features they don't need. The core functionality is now available through the new pcim_p2pdma_provider() function that returns a p2pdma_provider structure. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-3-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/pci/p2pdma.c | 153 +++++++++++++++++++++++++++++-------- include/linux/pci-p2pdma.h | 11 +++ 2 files changed, 132 insertions(+), 32 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 59cd6fb40e83..855d3493634c 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -25,11 +25,12 @@ struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; struct xarray map_types; + struct p2pdma_provider mem[PCI_STD_NUM_BARS]; }; struct pci_p2pdma_pagemap { struct dev_pagemap pgmap; - struct p2pdma_provider mem; + struct p2pdma_provider *mem; }; static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) @@ -204,7 +205,7 @@ static void p2pdma_page_free(struct page *page) struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ struct pci_p2pdma *p2pdma = rcu_dereference_protected( - to_pci_dev(pgmap->mem.owner)->p2pdma, 1); + to_pci_dev(pgmap->mem->owner)->p2pdma, 1); struct percpu_ref *ref; gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page), @@ -227,44 +228,123 @@ static void pci_p2pdma_release(void *data) /* Flush and disable pci_alloc_p2p_mem() */ pdev->p2pdma = NULL; - synchronize_rcu(); + if (p2pdma->pool) + synchronize_rcu(); + xa_destroy(&p2pdma->map_types); + + if (!p2pdma->pool) + return; gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); - xa_destroy(&p2pdma->map_types); } -static int pci_p2pdma_setup(struct pci_dev *pdev) +/** + * pcim_p2pdma_init - Initialise peer-to-peer DMA providers + * @pdev: The PCI device to enable P2PDMA for + * + * This function initializes the peer-to-peer DMA infrastructure + * for a PCI device. It allocates and sets up the necessary data + * structures to support P2PDMA operations, including mapping type + * tracking. + */ +int pcim_p2pdma_init(struct pci_dev *pdev) { - int error = -ENOMEM; struct pci_p2pdma *p2p; + int i, ret; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2p) + return 0; p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL); if (!p2p) return -ENOMEM; xa_init(&p2p->map_types); + /* + * Iterate over all standard PCI BARs and record only those that + * correspond to MMIO regions. Skip non-memory resources (e.g. I/O + * port BARs) since they cannot be used for peer-to-peer (P2P) + * transactions. + */ + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; - p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); - if (!p2p->pool) - goto out; + p2p->mem[i].owner = &pdev->dev; + p2p->mem[i].bus_offset = + pci_bus_address(pdev, i) - pci_resource_start(pdev, i); + } - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); - if (error) - goto out_pool_destroy; - - error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); - if (error) - goto out_pool_destroy; + ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); + if (ret) + goto out_p2p; rcu_assign_pointer(pdev->p2pdma, p2p); return 0; -out_pool_destroy: - gen_pool_destroy(p2p->pool); -out: +out_p2p: devm_kfree(&pdev->dev, p2p); - return error; + return ret; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_init); + +/** + * pcim_p2pdma_provider - Get peer-to-peer DMA provider + * @pdev: The PCI device to enable P2PDMA for + * @bar: BAR index to get provider + * + * This function gets peer-to-peer DMA provider for a PCI device. The lifetime + * of the provider (and of course the MMIO) is bound to the lifetime of the + * driver. A driver calling this function must ensure that all references to the + * provider, and any DMA mappings created for any MMIO, are all cleaned up + * before the driver remove() completes. + * + * Since P2P is almost always shared with a second driver this means some system + * to notify, invalidate and revoke the MMIO's DMA must be in place to use this + * function. For example a revoke can be built using DMABUF. + */ +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar) +{ + struct pci_p2pdma *p2p; + + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) + return NULL; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (WARN_ON(!p2p)) + /* Someone forgot to call to pcim_p2pdma_init() before */ + return NULL; + + return &p2p->mem[bar]; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_provider); + +static int pci_p2pdma_setup_pool(struct pci_dev *pdev) +{ + struct pci_p2pdma *p2pdma; + int ret; + + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2pdma->pool) + /* We already setup pools, do nothing, */ + return 0; + + p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); + if (!p2pdma->pool) + return -ENOMEM; + + ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); + if (ret) + goto out_pool_destroy; + + return 0; + +out_pool_destroy: + gen_pool_destroy(p2pdma->pool); + p2pdma->pool = NULL; + return ret; } static void pci_p2pdma_unmap_mappings(void *data) @@ -276,7 +356,7 @@ static void pci_p2pdma_unmap_mappings(void *data) * unmap_mapping_range() on the inode, teardown any existing userspace * mappings and prevent new ones from being created. */ - sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj, + sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj, &p2pmem_alloc_attr.attr, p2pmem_group.name); } @@ -295,6 +375,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { struct pci_p2pdma_pagemap *p2p_pgmap; + struct p2pdma_provider *mem; struct dev_pagemap *pgmap; struct pci_p2pdma *p2pdma; void *addr; @@ -312,11 +393,21 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, if (size + offset > pci_resource_len(pdev, bar)) return -EINVAL; - if (!pdev->p2pdma) { - error = pci_p2pdma_setup(pdev); - if (error) - return error; - } + error = pcim_p2pdma_init(pdev); + if (error) + return error; + + error = pci_p2pdma_setup_pool(pdev); + if (error) + return error; + + mem = pcim_p2pdma_provider(pdev, bar); + /* + * We checked validity of BAR prior to call + * to pcim_p2pdma_provider. It should never return NULL. + */ + if (WARN_ON(!mem)) + return -EINVAL; p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL); if (!p2p_pgmap) @@ -328,9 +419,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->ops = &p2pdma_pgmap_ops; - p2p_pgmap->mem.owner = &pdev->dev; - p2p_pgmap->mem.bus_offset = - pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); + p2p_pgmap->mem = mem; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { @@ -1007,11 +1096,11 @@ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, { struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page)); - if (state->mem == &p2p_pgmap->mem) + if (state->mem == p2p_pgmap->mem) return; - state->mem = &p2p_pgmap->mem; - state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev); + state->mem = p2p_pgmap->mem; + state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev); } /** diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 9516ef97b17a..15471252817b 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -27,6 +27,8 @@ struct p2pdma_provider { }; #ifdef CONFIG_PCI_P2PDMA +int pcim_p2pdma_init(struct pci_dev *pdev); +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, @@ -44,6 +46,15 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); #else /* CONFIG_PCI_P2PDMA */ +static inline int pcim_p2pdma_init(struct pci_dev *pdev) +{ + return -EOPNOTSUPP; +} +static inline struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, + int bar) +{ + return NULL; +} static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { From 395698bd2cd7639b85784a4a8f5ddb7a581e353c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:23 +0200 Subject: [PATCH 32/65] PCI/P2PDMA: Provide an access to pci_p2pdma_map_type() function Provide an access to pci_p2pdma_map_type() function to allow subsystems to determine the appropriate mapping type for P2PDMA transfers between a provider and target device. The pci_p2pdma_map_type() function is the core P2P layer version of the existing public, but struct page focused, pci_p2pdma_state() function. It returns the same result. It is required to use the p2p subsystem from drivers that don't use the struct page layer. Like __pci_p2pdma_update_state() it is not an exported function. The idea is that only subsystem code will implement mapping helpers for taking in phys_addr_t lists, this is deliberately not made accessible to every driver to prevent abuse. Following patches will use this function to implement a shared DMA mapping helper for DMABUF. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-4-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/pci/p2pdma.c | 14 ++++++- include/linux/pci-p2pdma.h | 85 +++++++++++++++++++++----------------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 855d3493634c..981a76b6b7c0 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -1060,8 +1060,18 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); -static enum pci_p2pdma_map_type -pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) +/** + * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers + * @provider: P2PDMA provider structure + * @dev: Target device for the transfer + * + * Determines how peer-to-peer DMA transfers should be mapped between + * the provider and the target device. The mapping type indicates whether + * the transfer can be done directly through PCI switches or must go + * through the host bridge. + */ +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev) { enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED; struct pci_dev *pdev = to_pci_dev(provider->owner); diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 15471252817b..517e121d2598 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -26,6 +26,45 @@ struct p2pdma_provider { u64 bus_offset; }; +enum pci_p2pdma_map_type { + /* + * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before + * the mapping type has been calculated. Exported routines for the API + * will never return this value. + */ + PCI_P2PDMA_MAP_UNKNOWN = 0, + + /* + * Not a PCI P2PDMA transfer. + */ + PCI_P2PDMA_MAP_NONE, + + /* + * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will + * traverse the host bridge and the host bridge is not in the + * allowlist. DMA Mapping routines should return an error when + * this is returned. + */ + PCI_P2PDMA_MAP_NOT_SUPPORTED, + + /* + * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to + * each other directly through a PCI switch and the transaction will + * not traverse the host bridge. Such a mapping should program + * the DMA engine with PCI bus addresses. + */ + PCI_P2PDMA_MAP_BUS_ADDR, + + /* + * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk + * to each other, but the transaction traverses a host bridge on the + * allowlist. In this case, a normal mapping either with CPU physical + * addresses (in the case of dma-direct) or IOVA addresses (in the + * case of IOMMUs) should be used to program the DMA engine. + */ + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + #ifdef CONFIG_PCI_P2PDMA int pcim_p2pdma_init(struct pci_dev *pdev); struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); @@ -45,6 +84,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev); #else /* CONFIG_PCI_P2PDMA */ static inline int pcim_p2pdma_init(struct pci_dev *pdev) { @@ -106,6 +147,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, { return sprintf(page, "none\n"); } +static inline enum pci_p2pdma_map_type +pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) +{ + return PCI_P2PDMA_MAP_NOT_SUPPORTED; +} #endif /* CONFIG_PCI_P2PDMA */ @@ -120,45 +166,6 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } -enum pci_p2pdma_map_type { - /* - * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before - * the mapping type has been calculated. Exported routines for the API - * will never return this value. - */ - PCI_P2PDMA_MAP_UNKNOWN = 0, - - /* - * Not a PCI P2PDMA transfer. - */ - PCI_P2PDMA_MAP_NONE, - - /* - * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will - * traverse the host bridge and the host bridge is not in the - * allowlist. DMA Mapping routines should return an error when - * this is returned. - */ - PCI_P2PDMA_MAP_NOT_SUPPORTED, - - /* - * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to - * each other directly through a PCI switch and the transaction will - * not traverse the host bridge. Such a mapping should program - * the DMA engine with PCI bus addresses. - */ - PCI_P2PDMA_MAP_BUS_ADDR, - - /* - * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk - * to each other, but the transaction traverses a host bridge on the - * allowlist. In this case, a normal mapping either with CPU physical - * addresses (in the case of dma-direct) or IOVA addresses (in the - * case of IOMMUs) should be used to program the DMA engine. - */ - PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, -}; - struct pci_p2pdma_map_state { struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; From 50d44fce53b6474147fcfa0a27a6e5fd290dd8ed Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Nov 2025 11:28:24 +0200 Subject: [PATCH 33/65] PCI/P2PDMA: Document DMABUF model Reflect latest changes in p2p implementation to support DMABUF lifecycle. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-5-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- Documentation/driver-api/pci/p2pdma.rst | 95 +++++++++++++++++++------ 1 file changed, 73 insertions(+), 22 deletions(-) diff --git a/Documentation/driver-api/pci/p2pdma.rst b/Documentation/driver-api/pci/p2pdma.rst index d0b241628cf1..280673b50350 100644 --- a/Documentation/driver-api/pci/p2pdma.rst +++ b/Documentation/driver-api/pci/p2pdma.rst @@ -9,22 +9,48 @@ between two devices on the bus. This type of transaction is henceforth called Peer-to-Peer (or P2P). However, there are a number of issues that make P2P transactions tricky to do in a perfectly safe way. -One of the biggest issues is that PCI doesn't require forwarding -transactions between hierarchy domains, and in PCIe, each Root Port -defines a separate hierarchy domain. To make things worse, there is no -simple way to determine if a given Root Complex supports this or not. -(See PCIe r4.0, sec 1.3.1). Therefore, as of this writing, the kernel -only supports doing P2P when the endpoints involved are all behind the -same PCI bridge, as such devices are all in the same PCI hierarchy -domain, and the spec guarantees that all transactions within the -hierarchy will be routable, but it does not require routing -between hierarchies. +For PCIe the routing of Transaction Layer Packets (TLPs) is well-defined up +until they reach a host bridge or root port. If the path includes PCIe switches +then based on the ACS settings the transaction can route entirely within +the PCIe hierarchy and never reach the root port. The kernel will evaluate +the PCIe topology and always permit P2P in these well-defined cases. -The second issue is that to make use of existing interfaces in Linux, -memory that is used for P2P transactions needs to be backed by struct -pages. However, PCI BARs are not typically cache coherent so there are -a few corner case gotchas with these pages so developers need to -be careful about what they do with them. +However, if the P2P transaction reaches the host bridge then it might have to +hairpin back out the same root port, be routed inside the CPU SOC to another +PCIe root port, or routed internally to the SOC. + +The PCIe specification doesn't define the forwarding of transactions between +hierarchy domains and kernel defaults to blocking such routing. There is an +allow list to allow detecting known-good HW, in which case P2P between any +two PCIe devices will be permitted. + +Since P2P inherently is doing transactions between two devices it requires two +drivers to be co-operating inside the kernel. The providing driver has to convey +its MMIO to the consuming driver. To meet the driver model lifecycle rules the +MMIO must have all DMA mapping removed, all CPU accesses prevented, all page +table mappings undone before the providing driver completes remove(). + +This requires the providing and consuming driver to actively work together to +guarantee that the consuming driver has stopped using the MMIO during a removal +cycle. This is done by either a synchronous invalidation shutdown or waiting +for all usage refcounts to reach zero. + +At the lowest level the P2P subsystem offers a naked struct p2p_provider that +delegates lifecycle management to the providing driver. It is expected that +drivers using this option will wrap their MMIO memory in DMABUF and use DMABUF +to provide an invalidation shutdown. These MMIO addresess have no struct page, and +if used with mmap() must create special PTEs. As such there are very few +kernel uAPIs that can accept pointers to them; in particular they cannot be used +with read()/write(), including O_DIRECT. + +Building on this, the subsystem offers a layer to wrap the MMIO in a ZONE_DEVICE +pgmap of MEMORY_DEVICE_PCI_P2PDMA to create struct pages. The lifecycle of +pgmap ensures that when the pgmap is destroyed all other drivers have stopped +using the MMIO. This option works with O_DIRECT flows, in some cases, if the +underlying subsystem supports handling MEMORY_DEVICE_PCI_P2PDMA through +FOLL_PCI_P2PDMA. The use of FOLL_LONGTERM is prevented. As this relies on pgmap +it also relies on architecture support along with alignment and minimum size +limitations. Driver Writer's Guide @@ -114,14 +140,39 @@ allocating scatter-gather lists with P2P memory. Struct Page Caveats ------------------- -Driver writers should be very careful about not passing these special -struct pages to code that isn't prepared for it. At this time, the kernel -interfaces do not have any checks for ensuring this. This obviously -precludes passing these pages to userspace. +While the MEMORY_DEVICE_PCI_P2PDMA pages can be installed in VMAs, +pin_user_pages() and related will not return them unless FOLL_PCI_P2PDMA is set. -P2P memory is also technically IO memory but should never have any side -effects behind it. Thus, the order of loads and stores should not be important -and ioreadX(), iowriteX() and friends should not be necessary. +The MEMORY_DEVICE_PCI_P2PDMA pages require care to support in the kernel. The +KVA is still MMIO and must still be accessed through the normal +readX()/writeX()/etc helpers. Direct CPU access (e.g. memcpy) is forbidden, just +like any other MMIO mapping. While this will actually work on some +architectures, others will experience corruption or just crash in the kernel. +Supporting FOLL_PCI_P2PDMA in a subsystem requires scrubbing it to ensure no CPU +access happens. + + +Usage With DMABUF +================= + +DMABUF provides an alternative to the above struct page-based +client/provider/orchestrator system and should be used when struct page +doesn't exist. In this mode the exporting driver will wrap +some of its MMIO in a DMABUF and give the DMABUF FD to userspace. + +Userspace can then pass the FD to an importing driver which will ask the +exporting driver to map it to the importer. + +In this case the initiator and target pci_devices are known and the P2P subsystem +is used to determine the mapping type. The phys_addr_t-based DMA API is used to +establish the dma_addr_t. + +Lifecycle is controlled by DMABUF move_notify(). When the exporting driver wants +to remove() it must deliver an invalidation shutdown to all DMABUF importing +drivers through move_notify() and synchronously DMA unmap all the MMIO. + +No importing driver can continue to have a DMA map to the MMIO after the +exporting driver has destroyed its p2p_provider. P2P DMA Support Library From 3aa31a8bb11e47c0ff2b306988d1756b810c1c3c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:25 +0200 Subject: [PATCH 34/65] dma-buf: provide phys_vec to scatter-gather mapping routine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dma_buf_phys_vec_to_sgt() and dma_buf_free_sgt() helpers to convert an array of MMIO physical address ranges into scatter-gather tables with proper DMA mapping. These common functions are a starting point and support any PCI drivers creating mappings from their BAR's MMIO addresses. VFIO is one case, as shortly will be RDMA. We can review existing DRM drivers to refactor them separately. We hope this will evolve into routines to help common DRM that include mixed CPU and MMIO mappings. Compared to the dma_map_resource() abuse this implementation handles the complicated PCI P2P scenarios properly, especially when an IOMMU is enabled: - Direct bus address mapping without IOVA allocation for PCI_P2PDMA_MAP_BUS_ADDR, using pci_p2pdma_bus_addr_map(). This happens if the IOMMU is enabled but the PCIe switch ACS flags allow transactions to avoid the host bridge. Further, this handles the slightly obscure, case of MMIO with a phys_addr_t that is different from the physical BAR programming (bus offset). The phys_addr_t is converted to a dma_addr_t and accommodates this effect. This enables certain real systems to work, especially on ARM platforms. - Mapping through host bridge with IOVA allocation and DMA_ATTR_MMIO attribute for MMIO memory regions (PCI_P2PDMA_MAP_THRU_HOST_BRIDGE). This happens when the IOMMU is enabled and the ACS flags are forcing all traffic to the IOMMU - ie for virtualization systems. - Cases where P2P is not supported through the host bridge/CPU. The P2P subsystem is the proper place to detect this and block it. Helper functions fill_sg_entry() and calc_sg_nents() handle the scatter-gather table construction, splitting large regions into UINT_MAX-sized chunks to fit within sg->length field limits. Since the physical address based DMA API forbids use of the CPU list of the scatterlist this will produce a mangled scatterlist that has a fully zero-length and NULL'd CPU list. The list is 0 length, all the struct page pointers are NULL and zero sized. This is stronger and more robust than the existing mangle_sg_table() technique. It is a future project to migrate DMABUF as a subsystem away from using scatterlist for this data structure. Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Christian König Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-6-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/dma-buf/Makefile | 2 +- drivers/dma-buf/dma-buf-mapping.c | 248 ++++++++++++++++++++++++++++++ include/linux/dma-buf-mapping.h | 17 ++ include/linux/dma-buf.h | 11 ++ 4 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 drivers/dma-buf/dma-buf-mapping.c create mode 100644 include/linux/dma-buf-mapping.h diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile index 70ec901edf2c..2008fb7481b3 100644 --- a/drivers/dma-buf/Makefile +++ b/drivers/dma-buf/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ - dma-fence-unwrap.o dma-resv.o + dma-fence-unwrap.o dma-resv.o dma-buf-mapping.o obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o obj-$(CONFIG_DMABUF_HEAPS) += heaps/ obj-$(CONFIG_SYNC_FILE) += sync_file.o diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c new file mode 100644 index 000000000000..b4819811a64a --- /dev/null +++ b/drivers/dma-buf/dma-buf-mapping.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DMA BUF Mapping Helpers + * + */ +#include +#include + +static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length, + dma_addr_t addr) +{ + unsigned int len, nents; + int i; + + nents = DIV_ROUND_UP(length, UINT_MAX); + for (i = 0; i < nents; i++) { + len = min_t(size_t, length, UINT_MAX); + length -= len; + /* + * DMABUF abuses scatterlist to create a scatterlist + * that does not have any CPU list, only the DMA list. + * Always set the page related values to NULL to ensure + * importers can't use it. The phys_addr based DMA API + * does not require the CPU list for mapping or unmapping. + */ + sg_set_page(sgl, NULL, 0, 0); + sg_dma_address(sgl) = addr + i * UINT_MAX; + sg_dma_len(sgl) = len; + sgl = sg_next(sgl); + } + + return sgl; +} + +static unsigned int calc_sg_nents(struct dma_iova_state *state, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size) +{ + unsigned int nents = 0; + size_t i; + + if (!state || !dma_use_iova(state)) { + for (i = 0; i < nr_ranges; i++) + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); + } else { + /* + * In IOVA case, there is only one SG entry which spans + * for whole IOVA address space, but we need to make sure + * that it fits sg->length, maybe we need more. + */ + nents = DIV_ROUND_UP(size, UINT_MAX); + } + + return nents; +} + +/** + * struct dma_buf_dma - holds DMA mapping information + * @sgt: Scatter-gather table + * @state: DMA IOVA state relevant in IOMMU-based DMA + * @size: Total size of DMA transfer + */ +struct dma_buf_dma { + struct sg_table sgt; + struct dma_iova_state *state; + size_t size; +}; + +/** + * dma_buf_phys_vec_to_sgt - Returns the scatterlist table of the attachment + * from arrays of physical vectors. This funciton is intended for MMIO memory + * only. + * @attach: [in] attachment whose scatterlist is to be returned + * @provider: [in] p2pdma provider + * @phys_vec: [in] array of physical vectors + * @nr_ranges: [in] number of entries in phys_vec array + * @size: [in] total size of phys_vec + * @dir: [in] direction of DMA transfer + * + * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR + * on error. May return -EINTR if it is interrupted by a signal. + * + * On success, the DMA addresses and lengths in the returned scatterlist are + * PAGE_SIZE aligned. + * + * A mapping must be unmapped by using dma_buf_free_sgt(). + * + * NOTE: This function is intended for exporters. If direct traffic routing is + * mandatory exporter should call routing pci_p2pdma_map_type() before calling + * this function. + */ +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir) +{ + unsigned int nents, mapped_len = 0; + struct dma_buf_dma *dma; + struct scatterlist *sgl; + dma_addr_t addr; + size_t i; + int ret; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (WARN_ON(!attach || !attach->dmabuf || !provider)) + /* This function is supposed to work on MMIO memory only */ + return ERR_PTR(-EINVAL); + + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) + return ERR_PTR(-ENOMEM); + + switch (pci_p2pdma_map_type(provider, attach->dev)) { + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * There is no need in IOVA at all for this flow. + */ + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + dma->state = kzalloc(sizeof(*dma->state), GFP_KERNEL); + if (!dma->state) { + ret = -ENOMEM; + goto err_free_dma; + } + + dma_iova_try_alloc(attach->dev, dma->state, 0, size); + break; + default: + ret = -EINVAL; + goto err_free_dma; + } + + nents = calc_sg_nents(dma->state, phys_vec, nr_ranges, size); + ret = sg_alloc_table(&dma->sgt, nents, GFP_KERNEL | __GFP_ZERO); + if (ret) + goto err_free_state; + + sgl = dma->sgt.sgl; + + for (i = 0; i < nr_ranges; i++) { + if (!dma->state) { + addr = pci_p2pdma_bus_addr_map(provider, + phys_vec[i].paddr); + } else if (dma_use_iova(dma->state)) { + ret = dma_iova_link(attach->dev, dma->state, + phys_vec[i].paddr, 0, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + if (ret) + goto err_unmap_dma; + + mapped_len += phys_vec[i].len; + } else { + addr = dma_map_phys(attach->dev, phys_vec[i].paddr, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + ret = dma_mapping_error(attach->dev, addr); + if (ret) + goto err_unmap_dma; + } + + if (!dma->state || !dma_use_iova(dma->state)) + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); + } + + if (dma->state && dma_use_iova(dma->state)) { + WARN_ON_ONCE(mapped_len != size); + ret = dma_iova_sync(attach->dev, dma->state, 0, mapped_len); + if (ret) + goto err_unmap_dma; + + sgl = fill_sg_entry(sgl, mapped_len, dma->state->addr); + } + + dma->size = size; + + /* + * No CPU list included — set orig_nents = 0 so others can detect + * this via SG table (use nents only). + */ + dma->sgt.orig_nents = 0; + + + /* + * SGL must be NULL to indicate that SGL is the last one + * and we allocated correct number of entries in sg_alloc_table() + */ + WARN_ON_ONCE(sgl); + return &dma->sgt; + +err_unmap_dma: + if (!i || !dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, mapped_len, dir, + DMA_ATTR_MMIO); + } else { + for_each_sgtable_dma_sg(&dma->sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + sg_free_table(&dma->sgt); +err_free_state: + kfree(dma->state); +err_free_dma: + kfree(dma); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_NS_GPL(dma_buf_phys_vec_to_sgt, "DMA_BUF"); + +/** + * dma_buf_free_sgt- unmaps the buffer + * @attach: [in] attachment to unmap buffer from + * @sgt: [in] scatterlist info of the buffer to unmap + * @dir: [in] direction of DMA transfer + * + * This unmaps a DMA mapping for @attached obtained + * by dma_buf_phys_vec_to_sgt(). + */ +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct dma_buf_dma *dma = container_of(sgt, struct dma_buf_dma, sgt); + int i; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (!dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, dma->size, dir, + DMA_ATTR_MMIO); + } else { + struct scatterlist *sgl; + + for_each_sgtable_dma_sg(sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + + sg_free_table(sgt); + kfree(dma->state); + kfree(dma); + +} +EXPORT_SYMBOL_NS_GPL(dma_buf_free_sgt, "DMA_BUF"); diff --git a/include/linux/dma-buf-mapping.h b/include/linux/dma-buf-mapping.h new file mode 100644 index 000000000000..a3c0ce2d3a42 --- /dev/null +++ b/include/linux/dma-buf-mapping.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * DMA BUF Mapping Helpers + * + */ +#ifndef __DMA_BUF_MAPPING_H__ +#define __DMA_BUF_MAPPING_H__ +#include + +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir); +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir); +#endif diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index d58e329ac0e7..0bc492090237 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -22,6 +22,7 @@ #include #include #include +#include struct device; struct dma_buf; @@ -530,6 +531,16 @@ struct dma_buf_export_info { void *priv; }; +/** + * struct dma_buf_phys_vec - describe continuous chunk of memory + * @paddr: physical address of that chunk + * @len: Length of this chunk + */ +struct dma_buf_phys_vec { + phys_addr_t paddr; + size_t len; +}; + /** * DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters * @name: export-info name From 64a5dedcff801072154a806102d731ecdf0e7552 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 20 Nov 2025 11:28:26 +0200 Subject: [PATCH 35/65] vfio: Export vfio device get and put registration helpers These helpers are useful for managing additional references taken on the device from other associated VFIO modules. Original-patch-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-7-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 2 ++ include/linux/vfio.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60..9aa4a5d081e8 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device) if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } +EXPORT_SYMBOL_GPL(vfio_device_put_registration); bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } +EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); /* * VFIO driver API diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee..217ba4ef1752 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -297,6 +297,8 @@ static inline void vfio_put_device(struct vfio_device *device) int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); +bool vfio_device_try_get_registration(struct vfio_device *device); +void vfio_device_put_registration(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set); From 47d13c939d89d348966857b24bb15092398ed8bb Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 20 Nov 2025 11:28:27 +0200 Subject: [PATCH 36/65] vfio/pci: Share the core device pointer while invoking feature functions There is no need to share the main device pointer (struct vfio_device *) with all the feature functions as they only need the core device pointer. Therefore, extract the core device pointer once in the caller (vfio_pci_core_ioctl_feature) and share it instead. Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-8-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..ca9a95716a85 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -299,11 +299,9 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_entry(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -320,12 +318,10 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, } static int vfio_pci_core_pm_entry_with_wakeup( - struct vfio_device *device, u32 flags, + struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_low_power_entry_with_wakeup __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); struct vfio_device_low_power_entry_with_wakeup entry; struct eventfd_ctx *efdctx; int ret; @@ -376,11 +372,9 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) up_write(&vdev->memory_lock); } -static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_exit(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -1473,11 +1467,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); -static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - uuid_t __user *arg, size_t argsz) +static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev, + u32 flags, uuid_t __user *arg, + size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); uuid_t uuid; int ret; @@ -1504,16 +1497,19 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + switch (flags & VFIO_DEVICE_FEATURE_MASK) { case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: - return vfio_pci_core_pm_entry(device, flags, arg, argsz); + return vfio_pci_core_pm_entry(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: - return vfio_pci_core_pm_entry_with_wakeup(device, flags, + return vfio_pci_core_pm_entry_with_wakeup(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: - return vfio_pci_core_pm_exit(device, flags, arg, argsz); + return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: - return vfio_pci_core_feature_token(device, flags, arg, argsz); + return vfio_pci_core_feature_token(vdev, flags, arg, argsz); default: return -ENOTTY; } From 35c3503908d320989d76b735580fd93aa497185f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:28 +0200 Subject: [PATCH 37/65] vfio/pci: Enable peer-to-peer DMA transactions by default Make sure that all VFIO PCI devices have peer-to-peer capabilities enables, so we would be able to export their MMIO memory through DMABUF, VFIO has always supported P2P mappings with itself. VFIO type 1 insecurely reads PFNs directly out of a VMA's PTEs and programs them into the IOMMU allowing any two VFIO devices to perform P2P to each other. All existing VMMs use this capability to export P2P into a VM where the VM could setup any kind of DMA it likes. Projects like DPDK/SPDK are also known to make use of this, though less frequently. As a first step to more properly integrating VFIO with the P2P subsystem unconditionally enable P2P support for VFIO PCI devices. The struct p2pdma_provider will act has a handle to the P2P subsystem to do things like DMA mapping. While real PCI devices have to support P2P (they can't even tell if an IOVA is P2P or not) there may be fake PCI devices that may trigger some kind of catastrophic system failure. To date VFIO has never tripped up on such a case, but if one is discovered the plan is to add a PCI quirk and have pcim_p2pdma_init() fail. This will fully block the broken device throughout any users of the P2P subsystem in the kernel. Thus P2P through DMABUF will follow the historical VFIO model and be unconditionally enabled by vfio-pci. Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-9-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ca9a95716a85..142b84b3f225 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,6 +28,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_EEH) #include #endif @@ -2081,6 +2082,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); + int ret; vdev->pdev = to_pci_dev(core_vdev->dev); vdev->irq_type = VFIO_PCI_NUM_IRQS; @@ -2090,6 +2092,9 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); + ret = pcim_p2pdma_init(vdev->pdev); + if (ret && ret != -EOPNOTSUPP) + return ret; init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); From 5d74781ebc86c5fa9e9d6934024c505412de9b52 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:29 +0200 Subject: [PATCH 38/65] vfio/pci: Add dma-buf export support for MMIO regions Add support for exporting PCI device MMIO regions through dma-buf, enabling safe sharing of non-struct page memory with controlled lifetime management. This allows RDMA and other subsystems to import dma-buf FDs and build them into memory regions for PCI P2P operations. The implementation provides a revocable attachment mechanism using dma-buf move operations. MMIO regions are normally pinned as BARs don't change physical addresses, but access is revoked when the VFIO device is closed or a PCI reset is issued. This ensures kernel self-defense against potentially hostile userspace. Currently VFIO can take MMIO regions from the device's BAR and map them into a PFNMAP VMA with special PTEs. This mapping type ensures the memory cannot be used with things like pin_user_pages(), hmm, and so on. In practice only the user process CPU and KVM can safely make use of these VMA. When VFIO shuts down these VMAs are cleaned by unmap_mapping_range() to prevent any UAF of the MMIO beyond driver unbind. However, VFIO type 1 has an insecure behavior where it uses follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back into the IOMMU. This has a long history of enabling P2P DMA inside VMs, but has serious lifetime problems by allowing a UAF of the MMIO after the VFIO driver has been unbound. Introduce DMABUF as a new safe way to export a FD based handle for the MMIO regions. This can be consumed by existing DMABUF importers like RDMA or DRM without opening an UAF. A following series will add an importer to iommufd to obsolete the type 1 code and allow safe UAF-free MMIO P2P in VM cases. DMABUF has a built in synchronous invalidation mechanism called move_notify. VFIO keeps track of all drivers importing its MMIO and can invoke a synchronous invalidation callback to tell the importing drivers to DMA unmap and forget about the MMIO pfns. This process is being called revoke. This synchronous invalidation fully prevents any lifecycle problems. VFIO will do this before unbinding its driver ensuring there is no UAF of the MMIO beyond the driver lifecycle. Further, VFIO has additional behavior to block access to the MMIO during things like Function Level Reset. This is because some poor platforms may experience a MCE type crash when touching MMIO of a PCI device that is undergoing a reset. Today this is done by using unmap_mapping_range() on the VMAs. Extend that into the DMABUF world and temporarily revoke the MMIO from the DMABUF importers during FLR as well. This will more robustly prevent an errant P2P from possibly upsetting the platform. A DMABUF FD is a preferred handle for MMIO compared to using something like a pgmap because: - VFIO is supported, including its P2P feature, on archs that don't support pgmap - PCI devices have all sorts of BAR sizes, including ones smaller than a section so a pgmap cannot always be created - It is undesirable to waste a lot of memory for struct pages, especially for a case like a GPU with ~100GB of BAR size - We want a synchronous revoke semantic to support FLR with light hardware requirements Use the P2P subsystem to help generate the DMA mapping. This is a significant upgrade over the abuse of dma_map_resource() that has historically been used by DMABUF exporters. Experience with an OOT version of this patch shows that real systems do need this. This approach deals with all the P2P scenarios: - Non-zero PCI bus_offset - ACS flags routing traffic to the IOMMU - ACS flags that bypass the IOMMU - though vfio noiommu is required to hit this. There will be further work to formalize the revoke semantic in DMABUF. For now this acts like a move_notify dynamic exporter where importer fault handling will get a failure when they attempt to map. This means that only fully restartable fault capable importers can import the VFIO DMABUFs. A future revoke semantic should open this up to more HW as the HW only needs to invalidate, not handle restartable faults. Signed-off-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-10-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/Kconfig | 3 + drivers/vfio/pci/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 5 + drivers/vfio/pci/vfio_pci_config.c | 22 +- drivers/vfio/pci/vfio_pci_core.c | 18 +- drivers/vfio/pci/vfio_pci_dmabuf.c | 316 +++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_priv.h | 23 +++ include/linux/vfio_pci_core.h | 42 ++++ include/uapi/linux/vfio.h | 28 +++ 9 files changed, 453 insertions(+), 5 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f54665..2b9fca00e9e8 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. +config VFIO_PCI_DMABUF + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c..53f59226ae01 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,6 +2,7 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2..6d41cf26b539 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -147,6 +147,10 @@ static const struct vfio_device_ops vfio_pci_ops = { .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; +static const struct vfio_pci_device_ops vfio_pci_dev_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_core_device *vdev; @@ -161,6 +165,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); + vdev->pci_ops = &vfio_pci_dev_ops; ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8f02f236b5b4..1f6008eabf23 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); - if (!new_mem) + if (!new_mem) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } /* * If the user is writing mem/io enable (new_mem/io) and we @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, *virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) { - if (state >= PCI_D3hot) + if (state >= PCI_D3hot) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } vfio_pci_set_power_state(vdev, state); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 142b84b3f225..9449cf44c18a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -287,6 +287,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, * semaphore. */ vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); + if (vdev->pm_runtime_engaged) { up_write(&vdev->memory_lock); return -EINVAL; @@ -370,6 +372,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) */ down_write(&vdev->memory_lock); __vfio_pci_runtime_pm_exit(vdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -690,6 +694,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #endif vfio_pci_core_disable(vdev); + vfio_pci_dma_buf_cleanup(vdev); + mutex_lock(&vdev->igate); if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger); @@ -1222,7 +1228,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, */ vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_dma_buf_move(vdev, true); ret = pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); return ret; @@ -1511,6 +1520,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF: + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -2095,6 +2106,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) ret = pcim_p2pdma_init(vdev->pdev); if (ret && ret != -EOPNOTSUPP) return ret; + INIT_LIST_HEAD(&vdev->dmabufs); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2459,6 +2471,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, break; } + vfio_pci_dma_buf_move(vdev, true); vfio_pci_zap_bars(vdev); } @@ -2487,8 +2500,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, err_undo: list_for_each_entry_from_reverse(vdev, &dev_set->device_list, - vdev.dev_set_list) + vdev.dev_set_list) { + if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); + } list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) pm_runtime_put(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c new file mode 100644 index 000000000000..6698f540bdac --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + */ +#include +#include +#include + +#include "vfio_pci_priv.h" + +MODULE_IMPORT_NS("DMA_BUF"); + +struct vfio_pci_dma_buf { + struct dma_buf *dmabuf; + struct vfio_pci_core_device *vdev; + struct list_head dmabufs_elm; + size_t size; + struct dma_buf_phys_vec *phys_vec; + struct p2pdma_provider *provider; + u32 nr_ranges; + u8 revoked : 1; +}; + +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + return 0; +} + +static struct sg_table * +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + return dma_buf_phys_vec_to_sgt(attachment, priv->provider, + priv->phys_vec, priv->nr_ranges, + priv->size, dir); +} + +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + dma_buf_free_sgt(attachment, sgt, dir); +} + +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + /* + * Either this or vfio_pci_dma_buf_cleanup() will remove from the list. + * The refcount prevents both. + */ + if (priv->vdev) { + down_write(&priv->vdev->memory_lock); + list_del_init(&priv->dmabufs_elm); + up_write(&priv->vdev->memory_lock); + vfio_device_put_registration(&priv->vdev->vdev); + } + kfree(priv->phys_vec); + kfree(priv); +} + +static const struct dma_buf_ops vfio_pci_dmabuf_ops = { + .attach = vfio_pci_dma_buf_attach, + .map_dma_buf = vfio_pci_dma_buf_map, + .unmap_dma_buf = vfio_pci_dma_buf_unmap, + .release = vfio_pci_dma_buf_release, +}; + +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + phys_addr_t max_addr; + unsigned int i; + + max_addr = start + len; + for (i = 0; i < nr_ranges; i++) { + phys_addr_t end; + + if (!dma_ranges[i].length) + return -EINVAL; + + if (check_add_overflow(start, dma_ranges[i].offset, + &phys_vec[i].paddr) || + check_add_overflow(phys_vec[i].paddr, + dma_ranges[i].length, &end)) + return -EOVERFLOW; + if (end > max_addr) + return -EINVAL; + + phys_vec[i].len = dma_ranges[i].length; + } + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec); + +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct pci_dev *pdev = vdev->pdev; + + *provider = pcim_p2pdma_provider(pdev, region_index); + if (!*provider) + return -EINVAL; + + return vfio_pci_core_fill_phys_vec( + phys_vec, dma_ranges, nr_ranges, + pci_resource_start(pdev, region_index), + pci_resource_len(pdev, region_index)); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys); + +static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf, + struct vfio_region_dma_range *dma_ranges, + size_t *lengthp) +{ + size_t length = 0; + u32 i; + + for (i = 0; i < dma_buf->nr_ranges; i++) { + u64 offset = dma_ranges[i].offset; + u64 len = dma_ranges[i].length; + + if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (check_add_overflow(length, len, &length)) + return -EINVAL; + } + + /* + * dma_iova_try_alloc() will WARN on if userspace proposes a size that + * is too big, eg with lots of ranges. + */ + if ((u64)(length) & DMA_IOVA_USE_SWIOTLB) + return -EINVAL; + + *lengthp = length; + return 0; +} + +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf get_dma_buf = {}; + struct vfio_region_dma_range *dma_ranges; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct vfio_pci_dma_buf *priv; + size_t length; + int ret; + + if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(get_dma_buf)); + if (ret != 1) + return ret; + + if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf))) + return -EFAULT; + + if (!get_dma_buf.nr_ranges || get_dma_buf.flags) + return -EINVAL; + + /* + * For PCI the region_index is the BAR number like everything else. + */ + if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX) + return -ENODEV; + + dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, + sizeof(*dma_ranges)); + if (IS_ERR(dma_ranges)) + return PTR_ERR(dma_ranges); + + ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length); + if (ret) + goto err_free_ranges; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_free_ranges; + } + priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec), + GFP_KERNEL); + if (!priv->phys_vec) { + ret = -ENOMEM; + goto err_free_priv; + } + + priv->vdev = vdev; + priv->nr_ranges = get_dma_buf.nr_ranges; + priv->size = length; + ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider, + get_dma_buf.region_index, + priv->phys_vec, dma_ranges, + priv->nr_ranges); + if (ret) + goto err_free_phys; + + kfree(dma_ranges); + dma_ranges = NULL; + + if (!vfio_device_try_get_registration(&vdev->vdev)) { + ret = -ENODEV; + goto err_free_phys; + } + + exp_info.ops = &vfio_pci_dmabuf_ops; + exp_info.size = priv->size; + exp_info.flags = get_dma_buf.open_flags; + exp_info.priv = priv; + + priv->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(priv->dmabuf)) { + ret = PTR_ERR(priv->dmabuf); + goto err_dev_put; + } + + /* dma_buf_put() now frees priv */ + INIT_LIST_HEAD(&priv->dmabufs_elm); + down_write(&vdev->memory_lock); + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = !__vfio_pci_memory_enabled(vdev); + list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); + dma_resv_unlock(priv->dmabuf->resv); + up_write(&vdev->memory_lock); + + /* + * dma_buf_fd() consumes the reference, when the file closes the dmabuf + * will be released. + */ + ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); + if (ret < 0) + goto err_dma_buf; + return ret; + +err_dma_buf: + dma_buf_put(priv->dmabuf); +err_dev_put: + vfio_device_put_registration(&vdev->vdev); +err_free_phys: + kfree(priv->phys_vec); +err_free_priv: + kfree(priv); +err_free_ranges: + kfree(dma_ranges); + return ret; +} + +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + lockdep_assert_held_write(&vdev->memory_lock); + + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + if (priv->revoked != revoked) { + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + } + fput(priv->dmabuf->file); + } +} + +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + down_write(&vdev->memory_lock); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + dma_resv_lock(priv->dmabuf->resv, NULL); + list_del_init(&priv->dmabufs_elm); + priv->vdev = NULL; + priv->revoked = true; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + vfio_device_put_registration(&vdev->vdev); + fput(priv->dmabuf->file); + } + up_write(&vdev->memory_lock); +} diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index a9972eacb293..28a405f8b97c 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz); +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); +#else +static inline int +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + return -ENOTTY; +} +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ +} +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, + bool revoked) +{ +} +#endif + #endif diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..c9466ba323fa 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -26,6 +26,8 @@ struct vfio_pci_core_device; struct vfio_pci_region; +struct p2pdma_provider; +struct dma_buf_phys_vec; struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, @@ -49,9 +51,48 @@ struct vfio_pci_region { u32 flags; }; +struct vfio_pci_device_ops { + int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +}; + +#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF) +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len); +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +#else +static inline int +vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + return -EINVAL; +} +static inline int vfio_pci_core_get_dmabuf_phys( + struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider, + unsigned int region_index, struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, size_t nr_ranges) +{ + return -EOPNOTSUPP; +} +#endif + struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; + const struct vfio_pci_device_ops *pci_ops; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; u8 *pci_config_map; @@ -94,6 +135,7 @@ struct vfio_pci_core_device { struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; struct rw_semaphore memory_lock; + struct list_head dmabufs; }; /* Will be exported for vfio pci drivers usage */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 75100bf009ba..ac2329f24141 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -14,6 +14,7 @@ #include #include +#include #define VFIO_API_VERSION 0 @@ -1478,6 +1479,33 @@ struct vfio_device_feature_bus_master { }; #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 +/** + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the + * regions selected. + * + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, + * etc. offset/length specify a slice of the region to create the dmabuf from. + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. + * + * flags should be 0. + * + * Return: The fd number on success, -1 and errno is set on failure. + */ +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 + +struct vfio_region_dma_range { + __u64 offset; + __u64 length; +}; + +struct vfio_device_feature_dma_buf { + __u32 region_index; + __u32 open_flags; + __u32 flags; + __u32 nr_ranges; + struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges); +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /** From 5415d887db0e059920cb5673a32cc4d66daa280f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Nov 2025 11:28:30 +0200 Subject: [PATCH 39/65] vfio/nvgrace: Support get_dmabuf_phys Call vfio_pci_core_fill_phys_vec() with the proper physical ranges for the synthetic BAR 2 and BAR 4 regions. Otherwise use the normal flow based on the PCI bar. This demonstrates a DMABUF that follows the region info report to only allow mapping parts of the region that are mmapable. Since the BAR is power of two sized and the "CXL" region is just page aligned the there can be a padding region at the end that is not mmaped or passed into the DMABUF. The "CXL" ranges that are remapped into BAR 2 and BAR 4 areas are not PCI MMIO, they actually run over the CXL-like coherent interconnect and for the purposes of DMA behave identically to DRAM. We don't try to model this distinction between true PCI BAR memory that takes a real PCI path and the "CXL" memory that takes a different path in the p2p framework for now. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Reviewed-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-11-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 52 +++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e346392b72f6..f19a679b5bf2 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,7 @@ #include #include #include +#include /* * The device memory usable to the workloads running in the VM is cached @@ -683,6 +684,50 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, return vfio_pci_core_write(core_vdev, buf, count, ppos); } +static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct nvgrace_gpu_pci_core_device *nvdev = container_of( + core_vdev, struct nvgrace_gpu_pci_core_device, core_device); + struct pci_dev *pdev = core_vdev->pdev; + struct mem_region *mem_region; + + /* + * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) { + * The P2P properties of the non-BAR memory is the same as the + * BAR memory, so just use the provider for index 0. Someday + * when CXL gets P2P support we could create CXLish providers + * for the non-BAR memory. + * } else if (region_index == USEMEM_REGION_INDEX) { + * This is actually cachable memory and isn't treated as P2P in + * the chip. For now we have no way to push cachable memory + * through everything and the Grace HW doesn't care what caching + * attribute is programmed into the SMMU. So use BAR 0. + * } + */ + mem_region = nvgrace_gpu_memregion(region_index, nvdev); + if (mem_region) { + *provider = pcim_p2pdma_provider(pdev, 0); + if (!*provider) + return -EINVAL; + return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges, + nr_ranges, + mem_region->memphys, + mem_region->memlength); + } + + return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index, + phys_vec, dma_ranges, nr_ranges); +} + +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = { + .get_dmabuf_phys = nvgrace_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .name = "nvgrace-gpu-vfio-pci", .init = vfio_pci_core_init_dev, @@ -703,6 +748,10 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .name = "nvgrace-gpu-vfio-pci-core", .init = vfio_pci_core_init_dev, @@ -965,6 +1014,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, memphys, memlength); if (ret) goto out_put_vdev; + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops; + } else { + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops; } ret = vfio_pci_core_register_device(&nvdev->core_device); From 98693e0897f754e3f51ce6626ed5f785f625ba2b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 24 Nov 2025 15:36:22 -0700 Subject: [PATCH 40/65] vfio/pci: Use RCU for error/request triggers to avoid circular locking Thanks to a device generating an ACS violation during bus reset, lockdep reported the following circular locking issue: CPU0: SET_IRQS (MSI/X): holds igate, acquires memory_lock CPU1: HOT_RESET: holds memory_lock, acquires pci_bus_sem CPU2: AER: holds pci_bus_sem, acquires igate This results in a potential 3-way deadlock. Remove the pci_bus_sem->igate leg of the triangle by using RCU to peek at the eventfd rather than locking it with igate. Fixes: 3be3a074cf5b ("vfio-pci: Don't use device_lock around AER interrupt setup") Signed-off-by: Alex Williamson Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20251124223623.2770706-1-alex@shazbot.org Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 68 ++++++++++++++++++++++--------- drivers/vfio/pci/vfio_pci_intrs.c | 52 ++++++++++++++--------- drivers/vfio/pci/vfio_pci_priv.h | 4 ++ include/linux/vfio_pci_core.h | 10 ++++- 4 files changed, 93 insertions(+), 41 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 79a1a50a4ef7..2b01bfbce3ea 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -42,6 +42,40 @@ static bool nointxmask; static bool disable_vga; static bool disable_idle_d3; +static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu) +{ + struct vfio_pci_eventfd *eventfd = + container_of(rcu, struct vfio_pci_eventfd, rcu); + + eventfd_ctx_put(eventfd->ctx); + kfree(eventfd); +} + +int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, + struct eventfd_ctx *ctx) +{ + struct vfio_pci_eventfd *new = NULL; + struct vfio_pci_eventfd *old; + + lockdep_assert_held(&vdev->igate); + + if (ctx) { + new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->ctx = ctx; + } + + old = rcu_replace_pointer(*peventfd, new, + lockdep_is_held(&vdev->igate)); + if (old) + call_rcu(&old->rcu, vfio_pci_eventfd_rcu_free); + + return 0; +} + /* List of PF's that vfio_pci_core_sriov_configure() has been called on */ static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex); static LIST_HEAD(vfio_pci_sriov_pfs); @@ -697,14 +731,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vfio_pci_dma_buf_cleanup(vdev); mutex_lock(&vdev->igate); - if (vdev->err_trigger) { - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = NULL; - } - if (vdev->req_trigger) { - eventfd_ctx_put(vdev->req_trigger); - vdev->req_trigger = NULL; - } + vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL); + vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL); mutex_unlock(&vdev->igate); } EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); @@ -1784,21 +1812,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_eventfd *eventfd; - mutex_lock(&vdev->igate); - - if (vdev->req_trigger) { + rcu_read_lock(); + eventfd = rcu_dereference(vdev->req_trigger); + if (eventfd) { if (!(count % 10)) pci_notice_ratelimited(pdev, "Relaying device request to user (#%u)\n", count); - eventfd_signal(vdev->req_trigger); + eventfd_signal(eventfd->ctx); } else if (count == 0) { pci_warn(pdev, "No device request channel registered, blocked until released by user\n"); } - - mutex_unlock(&vdev->igate); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(vfio_pci_core_request); @@ -2216,13 +2244,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); + struct vfio_pci_eventfd *eventfd; - mutex_lock(&vdev->igate); - - if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger); - - mutex_unlock(&vdev->igate); + rcu_read_lock(); + eventfd = rcu_dereference(vdev->err_trigger); + if (eventfd) + eventfd_signal(eventfd->ctx); + rcu_read_unlock(); return PCI_ERS_RESULT_CAN_RECOVER; } diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 30d3e921cb0d..c76e753b3cec 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, +static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, unsigned int count, uint32_t flags, void *data) { /* DATA_NONE/DATA_BOOL enables loopback testing */ if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (*ctx) { - if (count) { - eventfd_signal(*ctx); - } else { - eventfd_ctx_put(*ctx); - *ctx = NULL; - } + struct vfio_pci_eventfd *eventfd; + + eventfd = rcu_dereference_protected(*peventfd, + lockdep_is_held(&vdev->igate)); + + if (!eventfd) + return -EINVAL; + + if (count) { + eventfd_signal(eventfd->ctx); return 0; } + + return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t trigger; @@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, return -EINVAL; trigger = *(uint8_t *)data; - if (trigger && *ctx) - eventfd_signal(*ctx); + + if (trigger) { + struct vfio_pci_eventfd *eventfd = + rcu_dereference_protected(*peventfd, + lockdep_is_held(&vdev->igate)); + + if (eventfd) + eventfd_signal(eventfd->ctx); + } return 0; } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { @@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, fd = *(int32_t *)data; if (fd == -1) { - if (*ctx) - eventfd_ctx_put(*ctx); - *ctx = NULL; + return vfio_pci_eventfd_replace_locked(vdev, + peventfd, NULL); } else if (fd >= 0) { struct eventfd_ctx *efdctx; + int ret; efdctx = eventfd_ctx_fdget(fd); if (IS_ERR(efdctx)) return PTR_ERR(efdctx); - if (*ctx) - eventfd_ctx_put(*ctx); + ret = vfio_pci_eventfd_replace_locked(vdev, + peventfd, efdctx); + if (ret) + eventfd_ctx_put(efdctx); - *ctx = efdctx; + return ret; } - return 0; } return -EINVAL; @@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev, if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, + return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger, count, flags, data); } @@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, + return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger, count, flags, data); } diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 28a405f8b97c..6681389518a7 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd { bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); +int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, + struct eventfd_ctx *ctx); + int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 88fd2fd895d0..a1eddd55dab8 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,11 @@ struct vfio_pci_region; struct p2pdma_provider; struct dma_buf_phys_vec; +struct vfio_pci_eventfd { + struct eventfd_ctx *ctx; + struct rcu_head rcu; +}; + struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); @@ -124,8 +130,8 @@ struct vfio_pci_core_device { struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; - struct eventfd_ctx *err_trigger; - struct eventfd_ctx *req_trigger; + struct vfio_pci_eventfd __rcu *err_trigger; + struct vfio_pci_eventfd __rcu *req_trigger; struct eventfd_ctx *pm_wake_eventfd_ctx; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; From 590d745680309f8d956c3f0a97270fe65013b272 Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Tue, 25 Nov 2025 17:11:18 -0800 Subject: [PATCH 41/65] dma-buf: fix integer overflow in fill_sg_entry() for buffers >= 8GiB fill_sg_entry() splits large DMA buffers into multiple scatter-gather entries, each holding up to UINT_MAX bytes. When calculating the DMA address for entries beyond the second one, the expression (i * UINT_MAX) causes integer overflow due to 32-bit arithmetic. This manifests when the input arg length >= 8 GiB results in looping for i >= 2. Fix by casting i to dma_addr_t before multiplication. Fixes: 3aa31a8bb11e ("dma-buf: provide phys_vec to scatter-gather mapping routine") Signed-off-by: Alex Mastro Reviewed-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20251125-dma-buf-overflow-v1-1-b70ea1e6c4ba@fb.com Signed-off-by: Alex Williamson --- drivers/dma-buf/dma-buf-mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c index b4819811a64a..b7352e609fbd 100644 --- a/drivers/dma-buf/dma-buf-mapping.c +++ b/drivers/dma-buf/dma-buf-mapping.c @@ -24,7 +24,7 @@ static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length, * does not require the CPU list for mapping or unmapping. */ sg_set_page(sgl, NULL, 0, 0); - sg_dma_address(sgl) = addr + i * UINT_MAX; + sg_dma_address(sgl) = addr + (dma_addr_t)i * UINT_MAX; sg_dma_len(sgl) = len; sgl = sg_next(sgl); } From 9b92bc7554b543dc00a0a0b62904a9ef2ad5c4b0 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:27 +0000 Subject: [PATCH 42/65] vfio: refactor vfio_pci_mmap_huge_fault function Refactor vfio_pci_mmap_huge_fault to take out the implementation to map the VMA to the PTE/PMD/PUD as a separate function. Export the new function to be used by nvgrace-gpu module. Move the alignment check code to verify that pfn and VMA VA is aligned to the page order to the header file and make it inline. No functional change is intended. Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Reviewed-by: Shameer Kolothum Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-2-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 64 ++++++++++++++++---------------- include/linux/vfio_pci_core.h | 13 +++++++ 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 2b01bfbce3ea..9bb700d25ccb 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1652,6 +1652,34 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma) return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, + unsigned long pfn, + unsigned int order) +{ + lockdep_assert_held_read(&vdev->memory_lock); + + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) + return VM_FAULT_SIGBUS; + + switch (order) { + case 0: + return vmf_insert_pfn(vmf->vma, vmf->address, pfn); +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP + case PMD_ORDER: + return vmf_insert_pfn_pmd(vmf, pfn, false); +#endif +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP + case PUD_ORDER: + return vmf_insert_pfn_pud(vmf, pfn, false); + break; +#endif + default: + return VM_FAULT_FALLBACK; + } +} +EXPORT_SYMBOL_GPL(vfio_pci_vmf_insert_pfn); + static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, unsigned int order) { @@ -1660,41 +1688,13 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; unsigned long pfn = vma_to_pfn(vma) + pgoff; - vm_fault_t ret = VM_FAULT_SIGBUS; + vm_fault_t ret = VM_FAULT_FALLBACK; - if (order && (addr < vma->vm_start || - addr + (PAGE_SIZE << order) > vma->vm_end || - pfn & ((1 << order) - 1))) { - ret = VM_FAULT_FALLBACK; - goto out; + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); } - down_read(&vdev->memory_lock); - - if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) - goto out_unlock; - - switch (order) { - case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn); - break; -#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP - case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, pfn, false); - break; -#endif -#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP - case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, pfn, false); - break; -#endif - default: - ret = VM_FAULT_FALLBACK; - } - -out_unlock: - up_read(&vdev->memory_lock); -out: dev_dbg_ratelimited(&vdev->pdev->dev, "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", __func__, order, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a1eddd55dab8..5569488ec4dc 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -170,6 +170,9 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, unsigned long pfn, + unsigned int order); int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); @@ -212,4 +215,14 @@ VFIO_IOREAD_DECLARATION(32) VFIO_IOREAD_DECLARATION(64) #endif +static inline bool is_aligned_for_order(struct vm_area_struct *vma, + unsigned long addr, + unsigned long pfn, + unsigned int order) +{ + return !(order && (addr < vma->vm_start || + addr + (PAGE_SIZE << order) > vma->vm_end || + !IS_ALIGNED(pfn, 1 << order))); +} + #endif /* VFIO_PCI_CORE_H */ From 9db65489b87298f20baef683e70143c108959793 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:28 +0000 Subject: [PATCH 43/65] vfio/nvgrace-gpu: Add support for huge pfnmap NVIDIA's Grace based systems have large device memory. The device memory is mapped as VM_PFNMAP in the VMM VMA. The nvgrace-gpu module could make use of the huge PFNMAP support added in mm [1]. To make use of the huge pfnmap support, fault/huge_fault ops based mapping mechanism needs to be implemented. Currently nvgrace-gpu module relies on remap_pfn_range to do the mapping during VM bootup. Replace it to instead rely on fault and use vfio_pci_vmf_insert_pfn to setup the mapping. Moreover to enable huge pfnmap, nvgrace-gpu module is updated by adding huge_fault ops implementation. The implementation establishes mapping according to the order request. Note that if the PFN or the VMA address is unaligned to the order, the mapping fallbacks to the PTE level. Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1] Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Cc: Vikram Sethi Reviewed-by: Zhi Wang Reviewed-by: Shameer Kolothum Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-3-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 81 +++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e33f24fbb0a4..3034d6adf576 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -131,6 +131,59 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static unsigned long addr_to_pgoff(struct vm_area_struct *vma, + unsigned long addr) +{ + u64 pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = &nvdev->core_device; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + vm_fault_t ret = VM_FAULT_FALLBACK; + struct mem_region *memregion; + unsigned long pfn, addr; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return VM_FAULT_SIGBUS; + + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } + + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s order = %d pfn 0x%lx: 0x%x\n", + __func__, order, pfn, + (unsigned int)ret); + + return ret; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, +#endif +}; + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { @@ -138,10 +191,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); struct mem_region *memregion; - unsigned long start_pfn; u64 req_len, pgoff, end; unsigned int index; - int ret = 0; index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); @@ -158,17 +209,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || - check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) return -EOVERFLOW; /* - * Check that the mapping request does not go beyond available device - * memory size + * Check that the mapping request does not go beyond the exposed + * device memory size. */ if (end > memregion->memlength) return -EINVAL; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + /* * The carved out region of the device memory needs the NORMAL_NC * property. Communicate as such to the hypervisor. @@ -185,23 +237,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - /* - * Perform a PFN map to the memory and back the device BAR by the - * GPU memory. - * - * The available GPU memory size may not be power-of-2 aligned. The - * remainder is only backed by vfio_device_ops read/write handlers. - * - * During device reset, the GPU is safely disconnected to the CPU - * and access to the BAR will be immediately returned preventing - * machine check. - */ - ret = remap_pfn_range(vma, vma->vm_start, start_pfn, - req_len, vma->vm_page_prot); - if (ret) - return ret; - - vma->vm_pgoff = start_pfn; + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + vma->vm_private_data = nvdev; return 0; } From 7f5764e179c66f4d3776b4fcb78b383ebcd99fb3 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:29 +0000 Subject: [PATCH 44/65] vfio: use vfio_pci_core_setup_barmap to map bar in mmap Remove code duplication in vfio_pci_core_mmap by calling vfio_pci_core_setup_barmap to perform the bar mapping. No functional change is intended. Cc: Donald Dutile Reviewed-by: Shameer Kolothum Reviewed-by: Zhi Wang Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-4-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9bb700d25ccb..3a11e6f450f7 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1761,18 +1761,9 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. */ - if (!vdev->barmap[index]) { - ret = pci_request_selected_regions(pdev, - 1 << index, "vfio-pci"); - if (ret) - return ret; - - vdev->barmap[index] = pci_iomap(pdev, index, 0); - if (!vdev->barmap[index]) { - pci_release_selected_regions(pdev, 1 << index); - return -ENOMEM; - } - } + ret = vfio_pci_core_setup_barmap(vdev, index); + if (ret) + return ret; vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); From 7d055071d73bac7acc3a0b441f0632f564f048fe Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:30 +0000 Subject: [PATCH 45/65] vfio/nvgrace-gpu: split the code to wait for GPU ready Split the function that check for the GPU device being ready on the probe. Move the code to wait for the GPU to be ready through BAR0 register reads to a separate function. This would help reuse the code. This also fixes a bug where the return status in case of timeout gets overridden by return from pci_enable_device. With the fix, a timeout generate an error as initially intended. Fixes: d85f69d520e6 ("vfio/nvgrace-gpu: Check the HBM training and C2C link status") Reviewed-by: Zhi Wang Reviewed-by: Shameer Kolothum Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-5-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 31 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 3034d6adf576..5f122929dd44 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -131,6 +131,20 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static int nvgrace_gpu_wait_device_ready(void __iomem *io) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) + return 0; + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + + return -ETIME; +} + static unsigned long addr_to_pgoff(struct vm_area_struct *vma, unsigned long addr) { @@ -950,11 +964,10 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) * Ensure that the BAR0 region is enabled before accessing the * registers. */ -static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev) { - unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); void __iomem *io; - int ret = -ETIME; + int ret; ret = pci_enable_device(pdev); if (ret) @@ -970,16 +983,8 @@ static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) goto iomap_exit; } - do { - if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && - (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { - ret = 0; - goto reg_check_exit; - } - msleep(POLL_QUANTUM_MS); - } while (!time_after(jiffies, timeout)); + ret = nvgrace_gpu_wait_device_ready(io); -reg_check_exit: pci_iounmap(pdev, io); iomap_exit: pci_release_selected_regions(pdev, 1 << 0); @@ -996,7 +1001,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; - ret = nvgrace_gpu_wait_device_ready(pdev); + ret = nvgrace_gpu_probe_check_device_ready(pdev); if (ret) return ret; From dfe765499abfbdb9df0f2e70ebb1199c7d2fc65e Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:31 +0000 Subject: [PATCH 46/65] vfio/nvgrace-gpu: Inform devmem unmapped after reset Introduce a new flag reset_done to notify that the GPU has just been reset and the mapping to the GPU memory is zapped. Implement the reset_done handler to set this new variable. It will be used later in the patches to wait for the GPU memory to be ready before doing any mapping or access. Cc: Jason Gunthorpe Reviewed-by: Shameer Kolothum Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-6-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 5f122929dd44..df360e12b299 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -59,6 +59,8 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug; + /* GPU has just been reset */ + bool reset_done; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -1067,12 +1069,34 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); +/* + * The GPU reset is required to be serialized against the *first* mapping + * faults and read/writes accesses to prevent potential RAS events logging. + * + * First fault or access after a reset needs to poll device readiness, + * flag that a reset has occurred. + */ +static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + + nvdev->reset_done = true; +} + +static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = { + .reset_done = nvgrace_gpu_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + static struct pci_driver nvgrace_gpu_vfio_pci_driver = { .name = KBUILD_MODNAME, .id_table = nvgrace_gpu_vfio_pci_table, .probe = nvgrace_gpu_probe, .remove = nvgrace_gpu_remove, - .err_handler = &vfio_pci_core_err_handlers, + .err_handler = &nvgrace_gpu_vfio_pci_err_handlers, .driver_managed_dma = true, }; From a23b10608d420346e5af7eda6c46726a61572469 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:32 +0000 Subject: [PATCH 47/65] vfio/nvgrace-gpu: wait for the GPU mem to be ready Speculative prefetches from CPU to GPU memory until the GPU is ready after reset can cause harmless corrected RAS events to be logged on Grace systems. It is thus preferred that the mapping not be re-established until the GPU is ready post reset. The GPU readiness can be checked through BAR0 registers similar to the checking at the time of device probe. It can take several seconds for the GPU to be ready. So it is desirable that the time overlaps as much of the VM startup as possible to reduce impact on the VM bootup time. The GPU readiness state is thus checked on the first fault/huge_fault request or read/write access which amortizes the GPU readiness time. The first fault and read/write checks the GPU state when the reset_done flag - which denotes whether the GPU has just been reset. The memory_lock is taken across map/access to avoid races with GPU reset. Also check if the memory is enabled, before waiting for GPU to be ready. Otherwise the readiness check would block for 30s. Lastly added PM handling wrapping on read/write access. Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Cc: Vikram Sethi Reviewed-by: Shameer Kolothum Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-7-ankita@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 103 ++++++++++++++++++++++++---- drivers/vfio/pci/vfio_pci_config.c | 1 + drivers/vfio/pci/vfio_pci_priv.h | 1 - include/linux/vfio_pci_core.h | 1 + 4 files changed, 93 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index df360e12b299..84d142a47ec6 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -8,6 +8,7 @@ #include #include #include +#include /* * The device memory usable to the workloads running in the VM is cached @@ -105,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) mutex_init(&nvdev->remap_lock); } + /* + * GPU readiness is checked by reading the BAR0 registers. + * + * ioremap BAR0 to ensure that the BAR0 mapping is present before + * register reads on first fault before establishing any GPU + * memory mapping. + */ + ret = vfio_pci_core_setup_barmap(vdev, 0); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + vfio_pci_core_finish_enable(vdev); return 0; @@ -147,6 +161,34 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io) return -ETIME; } +/* + * If the GPU memory is accessed by the CPU while the GPU is not ready + * after reset, it can cause harmless corrected RAS events to be logged. + * Make sure the GPU is ready before establishing the mappings. + */ +static int +nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; + + lockdep_assert_held_read(&vdev->memory_lock); + + if (!nvdev->reset_done) + return 0; + + if (!__vfio_pci_memory_enabled(vdev)) + return -EIO; + + ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]); + if (ret) + return ret; + + nvdev->reset_done = false; + + return 0; +} + static unsigned long addr_to_pgoff(struct vm_area_struct *vma, unsigned long addr) { @@ -176,8 +218,13 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); if (is_aligned_for_order(vma, addr, pfn, order)) { - scoped_guard(rwsem_read, &vdev->memory_lock) + scoped_guard(rwsem_read, &vdev->memory_lock) { + if (vdev->pm_runtime_engaged || + nvgrace_gpu_check_device_ready(nvdev)) + return VM_FAULT_SIGBUS; + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } } dev_dbg_ratelimited(&vdev->pdev->dev, @@ -533,6 +580,7 @@ static ssize_t nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); struct mem_region *memregion; @@ -559,9 +607,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, else mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } /* * Only the device memory present on the hardware is mapped, which may @@ -586,9 +640,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); @@ -650,6 +711,7 @@ static ssize_t nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, size_t count, loff_t *ppos, const char __user *buf) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; struct mem_region *memregion; @@ -679,9 +741,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, */ mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } exitfn: *ppos += count; @@ -695,10 +763,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); @@ -1074,7 +1149,11 @@ MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); * faults and read/writes accesses to prevent potential RAS events logging. * * First fault or access after a reset needs to poll device readiness, - * flag that a reset has occurred. + * flag that a reset has occurred. The readiness test is done by holding + * the memory_lock read lock and we expect all vfio-pci initiated resets to + * hold the memory_lock write lock to avoid races. However, .reset_done + * extends beyond the scope of vfio-pci initiated resets therefore we + * cannot assert this behavior and use lockdep_assert_held_write. */ static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) { diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 1f6008eabf23..dc4e510e6e1b 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) return pdev->current_state < PCI_D3hot && (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY)); } +EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled); /* * Restore the *real* BARs after we detect a FLR or backdoor reset. diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 6681389518a7..27ac280f00b9 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -64,7 +64,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev); int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state); -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 5569488ec4dc..336a0e58b443 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -188,6 +188,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite); +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t reg_start, size_t reg_cnt, loff_t *buf_offset, From 2d5dbd3156799db2b7d25addb45c289b0aa7ff11 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:16 +0000 Subject: [PATCH 48/65] vfio: selftests: Move run.sh into scripts directory Move run.sh in a new sub-directory scripts/. This directory will be used to house various helper scripts to be used by humans and automation for running VFIO selftests. Opportunistically also switch run.sh from TEST_PROGS_EXTENDED to TEST_FILES. The former is for actual test executables that are just not run by default. TEST_FILES is a better fit for helper scripts. No functional change intended. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-2-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 4 +++- tools/testing/selftests/vfio/{ => scripts}/run.sh | 0 2 files changed, 3 insertions(+), 1 deletion(-) rename tools/testing/selftests/vfio/{ => scripts}/run.sh (100%) diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 324ba0175a33..ad7fa4071c20 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -3,7 +3,9 @@ TEST_GEN_PROGS += vfio_dma_mapping_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_driver_test -TEST_PROGS_EXTENDED := run.sh + +TEST_FILES += scripts/run.sh + include ../lib.mk include lib/libvfio.mk diff --git a/tools/testing/selftests/vfio/run.sh b/tools/testing/selftests/vfio/scripts/run.sh similarity index 100% rename from tools/testing/selftests/vfio/run.sh rename to tools/testing/selftests/vfio/scripts/run.sh From fa246a1d06d74219b79d3e78b0cf47851a60d117 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:17 +0000 Subject: [PATCH 49/65] vfio: selftests: Split run.sh into separate scripts Split run.sh into separate scripts (setup.sh, run.sh, cleanup.sh) to enable multi-device testing, and prepare for VFIO selftests automatically detecting which devices to use for testing by storing device metadata on the filesystem. - setup.sh takes one or more BDFs as arguments and sets up each device. Metadata about each device is stored on the filesystem in the directory: ${TMPDIR:-/tmp}/vfio-selftests-devices Within this directory is a directory for each BDF, and then files in those directories that cleanup.sh uses to cleanup the device. - run.sh runs a selftest by passing it the BDFs of all set up devices. - cleanup.sh takes zero or more BDFs as arguments and cleans up each device. If no BDFs are provided, it cleans up all devices. This split enables multi-device testing by allowing multiple BDFs to be set up and passed into tests: For example: $ tools/testing/selftests/vfio/scripts/setup.sh $ tools/testing/selftests/vfio/scripts/setup.sh $ tools/testing/selftests/vfio/scripts/run.sh echo $ tools/testing/selftests/vfio/scripts/cleanup.sh In the future, VFIO selftests can automatically detect set up devices by inspecting ${TMPDIR:-/tmp}/vfio-selftests-devices. This will avoid the need for the run.sh script. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-3-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 3 + .../testing/selftests/vfio/scripts/cleanup.sh | 41 +++++++ tools/testing/selftests/vfio/scripts/lib.sh | 42 +++++++ tools/testing/selftests/vfio/scripts/run.sh | 105 +----------------- tools/testing/selftests/vfio/scripts/setup.sh | 48 ++++++++ 5 files changed, 140 insertions(+), 99 deletions(-) create mode 100755 tools/testing/selftests/vfio/scripts/cleanup.sh create mode 100755 tools/testing/selftests/vfio/scripts/lib.sh create mode 100755 tools/testing/selftests/vfio/scripts/setup.sh diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index ad7fa4071c20..a29f99395206 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -4,7 +4,10 @@ TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_driver_test +TEST_FILES += scripts/cleanup.sh +TEST_FILES += scripts/lib.sh TEST_FILES += scripts/run.sh +TEST_FILES += scripts/setup.sh include ../lib.mk include lib/libvfio.mk diff --git a/tools/testing/selftests/vfio/scripts/cleanup.sh b/tools/testing/selftests/vfio/scripts/cleanup.sh new file mode 100755 index 000000000000..69c922d8aafb --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/cleanup.sh @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh + +function cleanup_devices() { + local device_bdf + local device_dir + + for device_bdf in "$@"; do + device_dir=${DEVICES_DIR}/${device_bdf} + + if [ -f ${device_dir}/vfio-pci ]; then + unbind ${device_bdf} vfio-pci + fi + + if [ -f ${device_dir}/driver_override ]; then + clear_driver_override ${device_bdf} + fi + + if [ -f ${device_dir}/driver ]; then + bind ${device_bdf} $(cat ${device_dir}/driver) + fi + + if [ -f ${device_dir}/sriov_numvfs ]; then + set_sriov_numvfs ${device_bdf} $(cat ${device_dir}/sriov_numvfs) + fi + + rm -rf ${device_dir} + done +} + +function main() { + if [ $# = 0 ]; then + cleanup_devices $(ls ${DEVICES_DIR}) + rmdir ${DEVICES_DIR} + else + cleanup_devices "$@" + fi +} + +main "$@" diff --git a/tools/testing/selftests/vfio/scripts/lib.sh b/tools/testing/selftests/vfio/scripts/lib.sh new file mode 100755 index 000000000000..9f05f29c7b86 --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/lib.sh @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +readonly DEVICES_DIR="${TMPDIR:-/tmp}/vfio-selftests-devices" + +function write_to() { + # Unfortunately set -x does not show redirects so use echo to manually + # tell the user what commands are being run. + echo "+ echo \"${2}\" > ${1}" + echo "${2}" > ${1} +} + +function get_driver() { + if [ -L /sys/bus/pci/devices/${1}/driver ]; then + basename $(readlink -m /sys/bus/pci/devices/${1}/driver) + fi +} + +function bind() { + write_to /sys/bus/pci/drivers/${2}/bind ${1} +} + +function unbind() { + write_to /sys/bus/pci/drivers/${2}/unbind ${1} +} + +function set_sriov_numvfs() { + write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2} +} + +function get_sriov_numvfs() { + if [ -f /sys/bus/pci/devices/${1}/sriov_numvfs ]; then + cat /sys/bus/pci/devices/${1}/sriov_numvfs + fi +} + +function set_driver_override() { + write_to /sys/bus/pci/devices/${1}/driver_override ${2} +} + +function clear_driver_override() { + set_driver_override ${1} "" +} diff --git a/tools/testing/selftests/vfio/scripts/run.sh b/tools/testing/selftests/vfio/scripts/run.sh index 0476b6d7adc3..91fd38f9f6f6 100755 --- a/tools/testing/selftests/vfio/scripts/run.sh +++ b/tools/testing/selftests/vfio/scripts/run.sh @@ -1,109 +1,16 @@ # SPDX-License-Identifier: GPL-2.0-or-later -# Global variables initialized in main() and then used during cleanup() when -# the script exits. -declare DEVICE_BDF -declare NEW_DRIVER -declare OLD_DRIVER -declare OLD_NUMVFS -declare DRIVER_OVERRIDE - -function write_to() { - # Unfortunately set -x does not show redirects so use echo to manually - # tell the user what commands are being run. - echo "+ echo \"${2}\" > ${1}" - echo "${2}" > ${1} -} - -function bind() { - write_to /sys/bus/pci/drivers/${2}/bind ${1} -} - -function unbind() { - write_to /sys/bus/pci/drivers/${2}/unbind ${1} -} - -function set_sriov_numvfs() { - write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2} -} - -function set_driver_override() { - write_to /sys/bus/pci/devices/${1}/driver_override ${2} -} - -function clear_driver_override() { - set_driver_override ${1} "" -} - -function cleanup() { - if [ "${NEW_DRIVER}" ]; then unbind ${DEVICE_BDF} ${NEW_DRIVER} ; fi - if [ "${DRIVER_OVERRIDE}" ]; then clear_driver_override ${DEVICE_BDF} ; fi - if [ "${OLD_DRIVER}" ]; then bind ${DEVICE_BDF} ${OLD_DRIVER} ; fi - if [ "${OLD_NUMVFS}" ]; then set_sriov_numvfs ${DEVICE_BDF} ${OLD_NUMVFS} ; fi -} - -function usage() { - echo "usage: $0 [-d segment:bus:device.function] [-s] [-h] [cmd ...]" >&2 - echo >&2 - echo " -d: The BDF of the device to use for the test (required)" >&2 - echo " -h: Show this help message" >&2 - echo " -s: Drop into a shell rather than running a command" >&2 - echo >&2 - echo " cmd: The command to run and arguments to pass to it." >&2 - echo " Required when not using -s. The SBDF will be " >&2 - echo " appended to the argument list." >&2 - exit 1 -} +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh function main() { - local shell + local device_bdfs=$(ls ${DEVICES_DIR}) - while getopts "d:hs" opt; do - case $opt in - d) DEVICE_BDF="$OPTARG" ;; - s) shell=true ;; - *) usage ;; - esac - done - - # Shift past all optional arguments. - shift $((OPTIND - 1)) - - # Check that the user passed in the command to run. - [ ! "${shell}" ] && [ $# = 0 ] && usage - - # Check that the user passed in a BDF. - [ "${DEVICE_BDF}" ] || usage - - trap cleanup EXIT - set -e - - test -d /sys/bus/pci/devices/${DEVICE_BDF} - - if [ -f /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs ]; then - OLD_NUMVFS=$(cat /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs) - set_sriov_numvfs ${DEVICE_BDF} 0 + if [ -z "${device_bdfs}" ]; then + echo "No devices found, skipping." + exit 4 fi - if [ -L /sys/bus/pci/devices/${DEVICE_BDF}/driver ]; then - OLD_DRIVER=$(basename $(readlink -m /sys/bus/pci/devices/${DEVICE_BDF}/driver)) - unbind ${DEVICE_BDF} ${OLD_DRIVER} - fi - - set_driver_override ${DEVICE_BDF} vfio-pci - DRIVER_OVERRIDE=true - - bind ${DEVICE_BDF} vfio-pci - NEW_DRIVER=vfio-pci - - echo - if [ "${shell}" ]; then - echo "Dropping into ${SHELL} with VFIO_SELFTESTS_BDF=${DEVICE_BDF}" - VFIO_SELFTESTS_BDF=${DEVICE_BDF} ${SHELL} - else - "$@" ${DEVICE_BDF} - fi - echo + "$@" ${device_bdfs} } main "$@" diff --git a/tools/testing/selftests/vfio/scripts/setup.sh b/tools/testing/selftests/vfio/scripts/setup.sh new file mode 100755 index 000000000000..49a499e51cbe --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/setup.sh @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +set -e + +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh + +function main() { + local device_bdf + local device_dir + local numvfs + local driver + + if [ $# = 0 ]; then + echo "usage: $0 segment:bus:device.function ..." >&2 + exit 1 + fi + + for device_bdf in "$@"; do + test -d /sys/bus/pci/devices/${device_bdf} + + device_dir=${DEVICES_DIR}/${device_bdf} + if [ -d "${device_dir}" ]; then + echo "${device_bdf} has already been set up, exiting." + exit 0 + fi + + mkdir -p ${device_dir} + + numvfs=$(get_sriov_numvfs ${device_bdf}) + if [ "${numvfs}" ]; then + set_sriov_numvfs ${device_bdf} 0 + echo ${numvfs} > ${device_dir}/sriov_numvfs + fi + + driver=$(get_driver ${device_bdf}) + if [ "${driver}" ]; then + unbind ${device_bdf} ${driver} + echo ${driver} > ${device_dir}/driver + fi + + set_driver_override ${device_bdf} vfio-pci + touch ${device_dir}/driver_override + + bind ${device_bdf} vfio-pci + touch ${device_dir}/vfio-pci + done +} + +main "$@" From 6282ca8585010bfb4ab658243a1665428bcfa9de Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:18 +0000 Subject: [PATCH 50/65] vfio: selftests: Allow passing multiple BDFs on the command line Add support for passing multiple device BDFs to a test via the command line. This is a prerequisite for multi-device tests. Single-device tests can continue using vfio_selftests_get_bdf(), which will continue to return argv[argc - 1] (if it is a BDF string), or the environment variable $VFIO_SELFTESTS_BDF otherwise. For multi-device tests, a new helper called vfio_selftests_get_bdfs() is introduced which will return an array of all BDFs found at the end of argv[], as well as the number of BDFs found (passed back to the caller via argument). The array of BDFs returned does not need to be freed by the caller. The environment variable VFIO_SELFTESTS_BDF continues to support only a single BDF for the time being. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-4-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 2 + .../selftests/vfio/lib/vfio_pci_device.c | 57 +++++++++++++++---- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 69ec0c856481..0b9a5628d23e 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -208,6 +208,8 @@ struct iova_allocator { * If BDF cannot be determined then the test will exit with KSFT_SKIP. */ const char *vfio_selftests_get_bdf(int *argc, char *argv[]); +char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs); + const char *vfio_pci_get_cdev_path(const char *bdf); extern const char *default_iommu_mode; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index b479a359da12..eda8f14de797 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -868,29 +868,64 @@ static bool is_bdf(const char *str) return count == 4 && length == strlen(str); } -const char *vfio_selftests_get_bdf(int *argc, char *argv[]) +static char **get_bdfs_cmdline(int *argc, char *argv[], int *nr_bdfs) +{ + int i; + + for (i = *argc - 1; i > 0 && is_bdf(argv[i]); i--) + continue; + + i++; + *nr_bdfs = *argc - i; + *argc -= *nr_bdfs; + + return *nr_bdfs ? &argv[i] : NULL; +} + +static char *get_bdf_env(void) { char *bdf; - if (*argc > 1 && is_bdf(argv[*argc - 1])) - return argv[--(*argc)]; - bdf = getenv("VFIO_SELFTESTS_BDF"); - if (bdf) { - VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); - return bdf; + if (!bdf) + return NULL; + + VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); + return bdf; +} + +char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs) +{ + static char *env_bdf; + char **bdfs; + + bdfs = get_bdfs_cmdline(argc, argv, nr_bdfs); + if (bdfs) + return bdfs; + + env_bdf = get_bdf_env(); + if (env_bdf) { + *nr_bdfs = 1; + return &env_bdf; } - fprintf(stderr, "Unable to determine which device to use, skipping test.\n"); + fprintf(stderr, "Unable to determine which device(s) to use, skipping test.\n"); fprintf(stderr, "\n"); fprintf(stderr, "To pass the device address via environment variable:\n"); fprintf(stderr, "\n"); - fprintf(stderr, " export VFIO_SELFTESTS_BDF=segment:bus:device.function\n"); + fprintf(stderr, " export VFIO_SELFTESTS_BDF=\"segment:bus:device.function\"\n"); fprintf(stderr, " %s [options]\n", argv[0]); fprintf(stderr, "\n"); - fprintf(stderr, "To pass the device address via argv:\n"); + fprintf(stderr, "To pass the device address(es) via argv:\n"); fprintf(stderr, "\n"); - fprintf(stderr, " %s [options] segment:bus:device.function\n", argv[0]); + fprintf(stderr, " %s [options] segment:bus:device.function ...\n", argv[0]); fprintf(stderr, "\n"); exit(KSFT_SKIP); } + +const char *vfio_selftests_get_bdf(int *argc, char *argv[]) +{ + int nr_bdfs; + + return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0]; +} From dd56ef239d832d9782f5ed204533824bea4548f7 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:19 +0000 Subject: [PATCH 51/65] vfio: selftests: Rename struct vfio_iommu_mode to iommu_mode Rename struct vfio_iommu_mode to struct iommu_mode since the mode can include iommufd. This also prepares for splitting out all the IOMMU code into its own structs/helpers/files which are independent from the vfio_pci_device code. No function change intended. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-5-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/vfio_util.h | 4 ++-- tools/testing/selftests/vfio/lib/vfio_pci_device.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 0b9a5628d23e..2f5555138d7f 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -50,7 +50,7 @@ VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ } while (0) -struct vfio_iommu_mode { +struct iommu_mode { const char *name; const char *container_path; unsigned long iommu_type; @@ -166,7 +166,7 @@ struct vfio_pci_driver { struct vfio_pci_device { int fd; - const struct vfio_iommu_mode *iommu_mode; + const struct iommu_mode *iommu_mode; int group_fd; int container_fd; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index eda8f14de797..4a021ff4fc40 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -713,7 +713,7 @@ const char *vfio_pci_get_cdev_path(const char *bdf) } /* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */ -static const struct vfio_iommu_mode iommu_modes[] = { +static const struct iommu_mode iommu_modes[] = { { .name = "vfio_type1_iommu", .container_path = "/dev/vfio/vfio", @@ -741,7 +741,7 @@ static const struct vfio_iommu_mode iommu_modes[] = { const char *default_iommu_mode = "iommufd"; -static const struct vfio_iommu_mode *lookup_iommu_mode(const char *iommu_mode) +static const struct iommu_mode *lookup_iommu_mode(const char *iommu_mode) { int i; From c9756b4d2702cc33748eed5916ddc159d349cba6 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:20 +0000 Subject: [PATCH 52/65] vfio: selftests: Introduce struct iommu Introduce struct iommu, which logically represents either a VFIO container or an iommufd IOAS, depending on which IOMMU mode is used by the test. This will be used in a subsequent commit to allow devices to be added to the same container/iommufd. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-6-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 18 +++-- .../selftests/vfio/lib/vfio_pci_device.c | 75 ++++++++++--------- 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 2f5555138d7f..3160f2d1ea6d 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -163,15 +163,19 @@ struct vfio_pci_driver { int msi; }; -struct vfio_pci_device { - int fd; - - const struct iommu_mode *iommu_mode; - int group_fd; +struct iommu { + const struct iommu_mode *mode; int container_fd; - int iommufd; u32 ioas_id; + struct list_head dma_regions; +}; + +struct vfio_pci_device { + int fd; + int group_fd; + + struct iommu *iommu; struct vfio_device_info info; struct vfio_region_info config_space; @@ -180,8 +184,6 @@ struct vfio_pci_device { struct vfio_irq_info msi_info; struct vfio_irq_info msix_info; - struct list_head dma_regions; - /* eventfds for MSI and MSI-x interrupts */ int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 4a021ff4fc40..e47f3ccf6d49 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -277,7 +277,7 @@ iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) { struct vfio_dma_region *region; - list_for_each_entry(region, &device->dma_regions, link) { + list_for_each_entry(region, &device->iommu->dma_regions, link) { if (vaddr < region->vaddr) continue; @@ -397,7 +397,7 @@ static int vfio_iommu_dma_map(struct vfio_pci_device *device, .size = region->size, }; - if (ioctl(device->container_fd, VFIO_IOMMU_MAP_DMA, &args)) + if (ioctl(device->iommu->container_fd, VFIO_IOMMU_MAP_DMA, &args)) return -errno; return 0; @@ -414,10 +414,10 @@ static int iommufd_dma_map(struct vfio_pci_device *device, .user_va = (u64)region->vaddr, .iova = region->iova, .length = region->size, - .ioas_id = device->ioas_id, + .ioas_id = device->iommu->ioas_id, }; - if (ioctl(device->iommufd, IOMMU_IOAS_MAP, &args)) + if (ioctl(device->iommu->iommufd, IOMMU_IOAS_MAP, &args)) return -errno; return 0; @@ -428,7 +428,7 @@ int __vfio_pci_dma_map(struct vfio_pci_device *device, { int ret; - if (device->iommufd) + if (device->iommu->iommufd) ret = iommufd_dma_map(device, region); else ret = vfio_iommu_dma_map(device, region); @@ -436,7 +436,7 @@ int __vfio_pci_dma_map(struct vfio_pci_device *device, if (ret) return ret; - list_add(®ion->link, &device->dma_regions); + list_add(®ion->link, &device->iommu->dma_regions); return 0; } @@ -484,13 +484,14 @@ int __vfio_pci_dma_unmap(struct vfio_pci_device *device, { int ret; - if (device->iommufd) - ret = iommufd_dma_unmap(device->iommufd, region->iova, - region->size, device->ioas_id, + if (device->iommu->iommufd) + ret = iommufd_dma_unmap(device->iommu->iommufd, region->iova, + region->size, device->iommu->ioas_id, unmapped); else - ret = vfio_iommu_dma_unmap(device->container_fd, region->iova, - region->size, 0, unmapped); + ret = vfio_iommu_dma_unmap(device->iommu->container_fd, + region->iova, region->size, 0, + unmapped); if (ret) return ret; @@ -505,17 +506,17 @@ int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped) int ret; struct vfio_dma_region *curr, *next; - if (device->iommufd) - ret = iommufd_dma_unmap(device->iommufd, 0, UINT64_MAX, - device->ioas_id, unmapped); + if (device->iommu->iommufd) + ret = iommufd_dma_unmap(device->iommu->iommufd, 0, UINT64_MAX, + device->iommu->ioas_id, unmapped); else - ret = vfio_iommu_dma_unmap(device->container_fd, 0, 0, + ret = vfio_iommu_dma_unmap(device->iommu->container_fd, 0, 0, VFIO_DMA_UNMAP_FLAG_ALL, unmapped); if (ret) return ret; - list_for_each_entry_safe(curr, next, &device->dma_regions, link) + list_for_each_entry_safe(curr, next, &device->iommu->dma_regions, link) list_del_init(&curr->link); return 0; @@ -627,28 +628,28 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf ioctl_assert(device->group_fd, VFIO_GROUP_GET_STATUS, &group_status); VFIO_ASSERT_TRUE(group_status.flags & VFIO_GROUP_FLAGS_VIABLE); - ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->container_fd); + ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->iommu->container_fd); } static void vfio_pci_container_setup(struct vfio_pci_device *device, const char *bdf) { - unsigned long iommu_type = device->iommu_mode->iommu_type; - const char *path = device->iommu_mode->container_path; + unsigned long iommu_type = device->iommu->mode->iommu_type; + const char *path = device->iommu->mode->container_path; int version; int ret; - device->container_fd = open(path, O_RDWR); - VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", path); + device->iommu->container_fd = open(path, O_RDWR); + VFIO_ASSERT_GE(device->iommu->container_fd, 0, "open(%s) failed\n", path); - version = ioctl(device->container_fd, VFIO_GET_API_VERSION); + version = ioctl(device->iommu->container_fd, VFIO_GET_API_VERSION); VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); vfio_pci_group_setup(device, bdf); - ret = ioctl(device->container_fd, VFIO_CHECK_EXTENSION, iommu_type); + ret = ioctl(device->iommu->container_fd, VFIO_CHECK_EXTENSION, iommu_type); VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); - ioctl_assert(device->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); + ioctl_assert(device->iommu->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf); VFIO_ASSERT_GE(device->fd, 0); @@ -801,12 +802,12 @@ static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *b * used to check if iommufd is enabled. In practice open() will never * return 0 unless stdin is closed. */ - device->iommufd = open("/dev/iommu", O_RDWR); - VFIO_ASSERT_GT(device->iommufd, 0); + device->iommu->iommufd = open("/dev/iommu", O_RDWR); + VFIO_ASSERT_GT(device->iommu->iommufd, 0); - vfio_device_bind_iommufd(device->fd, device->iommufd); - device->ioas_id = iommufd_ioas_alloc(device->iommufd); - vfio_device_attach_iommufd_pt(device->fd, device->ioas_id); + vfio_device_bind_iommufd(device->fd, device->iommu->iommufd); + device->iommu->ioas_id = iommufd_ioas_alloc(device->iommu->iommufd); + vfio_device_attach_iommufd_pt(device->fd, device->iommu->ioas_id); } struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode) @@ -816,11 +817,14 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_ device = calloc(1, sizeof(*device)); VFIO_ASSERT_NOT_NULL(device); - INIT_LIST_HEAD(&device->dma_regions); + device->iommu = calloc(1, sizeof(*device->iommu)); + VFIO_ASSERT_NOT_NULL(device->iommu); - device->iommu_mode = lookup_iommu_mode(iommu_mode); + INIT_LIST_HEAD(&device->iommu->dma_regions); - if (device->iommu_mode->container_path) + device->iommu->mode = lookup_iommu_mode(iommu_mode); + + if (device->iommu->mode->container_path) vfio_pci_container_setup(device, bdf); else vfio_pci_iommufd_setup(device, bdf); @@ -849,13 +853,14 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) VFIO_ASSERT_EQ(close(device->msi_eventfds[i]), 0); } - if (device->iommufd) { - VFIO_ASSERT_EQ(close(device->iommufd), 0); + if (device->iommu->iommufd) { + VFIO_ASSERT_EQ(close(device->iommu->iommufd), 0); } else { VFIO_ASSERT_EQ(close(device->group_fd), 0); - VFIO_ASSERT_EQ(close(device->container_fd), 0); + VFIO_ASSERT_EQ(close(device->iommu->container_fd), 0); } + free(device->iommu); free(device); } From d8470a775ccd7e2c709482193cba0ae18d17c717 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:21 +0000 Subject: [PATCH 53/65] vfio: selftests: Support multiple devices in the same container/iommufd Support tests that want to add multiple devices to the same container/iommufd by decoupling struct vfio_pci_device from struct iommu. Multi-devices tests can now put multiple devices in the same container/iommufd like so: iommu = iommu_init(iommu_mode); device1 = vfio_pci_device_init(bdf1, iommu); device2 = vfio_pci_device_init(bdf2, iommu); device3 = vfio_pci_device_init(bdf3, iommu); ... vfio_pci_device_cleanup(device3); vfio_pci_device_cleanup(device2); vfio_pci_device_cleanup(device1); iommu_cleanup(iommu); To account for the new separation of vfio_pci_device and iommu, update existing tests to initialize and cleanup a struct iommu. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-7-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 6 +- .../selftests/vfio/lib/vfio_pci_device.c | 105 +++++++++++------- .../selftests/vfio/vfio_dma_mapping_test.c | 10 +- .../selftests/vfio/vfio_pci_device_test.c | 10 +- .../selftests/vfio/vfio_pci_driver_test.c | 26 ++++- 5 files changed, 107 insertions(+), 50 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 3160f2d1ea6d..379942dc5357 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -216,8 +216,12 @@ const char *vfio_pci_get_cdev_path(const char *bdf); extern const char *default_iommu_mode; -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode); +struct iommu *iommu_init(const char *iommu_mode); +void iommu_cleanup(struct iommu *iommu); + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu); void vfio_pci_device_cleanup(struct vfio_pci_device *device); + void vfio_pci_device_reset(struct vfio_pci_device *device); struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index e47f3ccf6d49..8f8fed4f384d 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -86,13 +86,13 @@ static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device .argsz = sizeof(*info), }; - ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + ioctl_assert(device->iommu->container_fd, VFIO_IOMMU_GET_INFO, info); VFIO_ASSERT_GE(info->argsz, sizeof(*info)); info = realloc(info, info->argsz); VFIO_ASSERT_NOT_NULL(info); - ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + ioctl_assert(device->iommu->container_fd, VFIO_IOMMU_GET_INFO, info); VFIO_ASSERT_GE(info->argsz, sizeof(*info)); return info; @@ -142,10 +142,10 @@ static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *devi struct iommu_ioas_iova_ranges query = { .size = sizeof(query), - .ioas_id = device->ioas_id, + .ioas_id = device->iommu->ioas_id, }; - ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + ret = ioctl(device->iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); VFIO_ASSERT_EQ(ret, -1); VFIO_ASSERT_EQ(errno, EMSGSIZE); VFIO_ASSERT_GT(query.num_iovas, 0); @@ -155,7 +155,7 @@ static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *devi query.allowed_iovas = (uintptr_t)ranges; - ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + ioctl_assert(device->iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); *nranges = query.num_iovas; return ranges; @@ -180,7 +180,7 @@ struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, { struct iommu_iova_range *ranges; - if (device->iommufd) + if (device->iommu->iommufd) ranges = iommufd_iova_ranges(device, nranges); else ranges = vfio_iommu_iova_ranges(device, nranges); @@ -633,23 +633,21 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf static void vfio_pci_container_setup(struct vfio_pci_device *device, const char *bdf) { - unsigned long iommu_type = device->iommu->mode->iommu_type; - const char *path = device->iommu->mode->container_path; - int version; + struct iommu *iommu = device->iommu; + unsigned long iommu_type = iommu->mode->iommu_type; int ret; - device->iommu->container_fd = open(path, O_RDWR); - VFIO_ASSERT_GE(device->iommu->container_fd, 0, "open(%s) failed\n", path); - - version = ioctl(device->iommu->container_fd, VFIO_GET_API_VERSION); - VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); - vfio_pci_group_setup(device, bdf); - ret = ioctl(device->iommu->container_fd, VFIO_CHECK_EXTENSION, iommu_type); + ret = ioctl(iommu->container_fd, VFIO_CHECK_EXTENSION, iommu_type); VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); - ioctl_assert(device->iommu->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); + /* + * Allow multiple threads to race to set the IOMMU type on the + * container. The first will succeed and the rest should fail + * because the IOMMU type is already set. + */ + (void)ioctl(iommu->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf); VFIO_ASSERT_GE(device->fd, 0); @@ -797,34 +795,56 @@ static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *b VFIO_ASSERT_GE(device->fd, 0); free((void *)cdev_path); - /* - * Require device->iommufd to be >0 so that a simple non-0 check can be - * used to check if iommufd is enabled. In practice open() will never - * return 0 unless stdin is closed. - */ - device->iommu->iommufd = open("/dev/iommu", O_RDWR); - VFIO_ASSERT_GT(device->iommu->iommufd, 0); - vfio_device_bind_iommufd(device->fd, device->iommu->iommufd); - device->iommu->ioas_id = iommufd_ioas_alloc(device->iommu->iommufd); vfio_device_attach_iommufd_pt(device->fd, device->iommu->ioas_id); } -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode) +struct iommu *iommu_init(const char *iommu_mode) +{ + const char *container_path; + struct iommu *iommu; + int version; + + iommu = calloc(1, sizeof(*iommu)); + VFIO_ASSERT_NOT_NULL(iommu); + + INIT_LIST_HEAD(&iommu->dma_regions); + + iommu->mode = lookup_iommu_mode(iommu_mode); + + container_path = iommu->mode->container_path; + if (container_path) { + iommu->container_fd = open(container_path, O_RDWR); + VFIO_ASSERT_GE(iommu->container_fd, 0, "open(%s) failed\n", container_path); + + version = ioctl(iommu->container_fd, VFIO_GET_API_VERSION); + VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); + } else { + /* + * Require device->iommufd to be >0 so that a simple non-0 check can be + * used to check if iommufd is enabled. In practice open() will never + * return 0 unless stdin is closed. + */ + iommu->iommufd = open("/dev/iommu", O_RDWR); + VFIO_ASSERT_GT(iommu->iommufd, 0); + + iommu->ioas_id = iommufd_ioas_alloc(iommu->iommufd); + } + + return iommu; +} + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu) { struct vfio_pci_device *device; device = calloc(1, sizeof(*device)); VFIO_ASSERT_NOT_NULL(device); - device->iommu = calloc(1, sizeof(*device->iommu)); - VFIO_ASSERT_NOT_NULL(device->iommu); + VFIO_ASSERT_NOT_NULL(iommu); + device->iommu = iommu; - INIT_LIST_HEAD(&device->iommu->dma_regions); - - device->iommu->mode = lookup_iommu_mode(iommu_mode); - - if (device->iommu->mode->container_path) + if (iommu->mode->container_path) vfio_pci_container_setup(device, bdf); else vfio_pci_iommufd_setup(device, bdf); @@ -853,17 +873,22 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) VFIO_ASSERT_EQ(close(device->msi_eventfds[i]), 0); } - if (device->iommu->iommufd) { - VFIO_ASSERT_EQ(close(device->iommu->iommufd), 0); - } else { + if (device->group_fd) VFIO_ASSERT_EQ(close(device->group_fd), 0); - VFIO_ASSERT_EQ(close(device->iommu->container_fd), 0); - } - free(device->iommu); free(device); } +void iommu_cleanup(struct iommu *iommu) +{ + if (iommu->iommufd) + VFIO_ASSERT_EQ(close(iommu->iommufd), 0); + else + VFIO_ASSERT_EQ(close(iommu->container_fd), 0); + + free(iommu); +} + static bool is_bdf(const char *str) { unsigned int s, b, d, f; diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 102603d4407d..4727feb214c8 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -94,6 +94,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova, } FIXTURE(vfio_dma_mapping_test) { + struct iommu *iommu; struct vfio_pci_device *device; struct iova_allocator *iova_allocator; }; @@ -119,7 +120,8 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | FIXTURE_SETUP(vfio_dma_mapping_test) { - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); self->iova_allocator = iova_allocator_init(self->device); } @@ -127,6 +129,7 @@ FIXTURE_TEARDOWN(vfio_dma_mapping_test) { iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_dma_mapping_test, dma_map_unmap) @@ -203,6 +206,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) } FIXTURE(vfio_dma_map_limit_test) { + struct iommu *iommu; struct vfio_pci_device *device; struct vfio_dma_region region; size_t mmap_size; @@ -235,7 +239,8 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) */ self->mmap_size = 2 * region_size; - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); region->vaddr = mmap(NULL, self->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ASSERT_NE(region->vaddr, MAP_FAILED); @@ -253,6 +258,7 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) FIXTURE_TEARDOWN(vfio_dma_map_limit_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); ASSERT_EQ(munmap(self->region.vaddr, self->mmap_size), 0); } diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 7a270698e4d2..e95217933c6b 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -23,17 +23,20 @@ static const char *device_bdf; #define MAX_TEST_MSI 16U FIXTURE(vfio_pci_device_test) { + struct iommu *iommu; struct vfio_pci_device *device; }; FIXTURE_SETUP(vfio_pci_device_test) { - self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); + self->iommu = iommu_init(default_iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); } FIXTURE_TEARDOWN(vfio_pci_device_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } #define read_pci_id_from_sysfs(_file) ({ \ @@ -99,6 +102,7 @@ TEST_F(vfio_pci_device_test, validate_bars) } FIXTURE(vfio_pci_irq_test) { + struct iommu *iommu; struct vfio_pci_device *device; }; @@ -116,12 +120,14 @@ FIXTURE_VARIANT_ADD(vfio_pci_irq_test, msix) { FIXTURE_SETUP(vfio_pci_irq_test) { - self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); + self->iommu = iommu_init(default_iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); } FIXTURE_TEARDOWN(vfio_pci_irq_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_pci_irq_test, enable_trigger_disable) diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index f69eec8b928d..b0c7d812de1f 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -44,6 +44,7 @@ static void region_teardown(struct vfio_pci_device *device, } FIXTURE(vfio_pci_driver_test) { + struct iommu *iommu; struct vfio_pci_device *device; struct iova_allocator *iova_allocator; struct vfio_dma_region memcpy_region; @@ -73,7 +74,8 @@ FIXTURE_SETUP(vfio_pci_driver_test) { struct vfio_pci_driver *driver; - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); self->iova_allocator = iova_allocator_init(self->device); driver = &self->device->driver; @@ -113,6 +115,7 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test) iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_pci_driver_test, init_remove) @@ -231,18 +234,31 @@ TEST_F_TIMEOUT(vfio_pci_driver_test, memcpy_storm, 60) ASSERT_NO_MSI(self->msi_fd); } -int main(int argc, char *argv[]) +static bool device_has_selftests_driver(const char *bdf) { struct vfio_pci_device *device; + struct iommu *iommu; + bool has_driver; + iommu = iommu_init(default_iommu_mode); + device = vfio_pci_device_init(device_bdf, iommu); + + has_driver = !!device->driver.ops; + + vfio_pci_device_cleanup(device); + iommu_cleanup(iommu); + + return has_driver; +} + +int main(int argc, char *argv[]) +{ device_bdf = vfio_selftests_get_bdf(&argc, argv); - device = vfio_pci_device_init(device_bdf, default_iommu_mode); - if (!device->driver.ops) { + if (!device_has_selftests_driver(device_bdf)) { fprintf(stderr, "No driver found for device %s\n", device_bdf); return KSFT_SKIP; } - vfio_pci_device_cleanup(device); return test_harness_run(argc, argv); } From 6c74d9830d8b10d51ce3c55d38e8babf23899c89 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:22 +0000 Subject: [PATCH 54/65] vfio: selftests: Eliminate overly chatty logging Eliminate overly chatty logs that are printed during almost every test. These logs are adding more noise than value. If a test cares about this information it can log it itself. This is especially true as the VFIO selftests gains support for multiple devices in a single test (which multiplies all these logs). Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-8-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/vfio_pci_driver.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c index e5e8723ecb41..abb7a62a03ea 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#include - #include "../../../kselftest.h" #include @@ -29,7 +27,6 @@ void vfio_pci_driver_probe(struct vfio_pci_device *device) if (ops->probe(device)) continue; - printf("Driver found: %s\n", ops->name); device->driver.ops = ops; } } @@ -58,17 +55,6 @@ void vfio_pci_driver_init(struct vfio_pci_device *device) driver->ops->init(device); driver->initialized = true; - - printf("%s: region: vaddr %p, iova 0x%lx, size 0x%lx\n", - driver->ops->name, - driver->region.vaddr, - driver->region.iova, - driver->region.size); - - printf("%s: max_memcpy_size 0x%lx, max_memcpy_count 0x%lx\n", - driver->ops->name, - driver->max_memcpy_size, - driver->max_memcpy_count); } void vfio_pci_driver_remove(struct vfio_pci_device *device) From c48545442e18219cca7f658ecdf7562ef58cfac6 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:23 +0000 Subject: [PATCH 55/65] vfio: selftests: Prefix logs with device BDF where relevant Prefix log messages with the device's BDF where relevant. This will help understanding VFIO selftests logs when tests are run with multiple devices. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-9-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/drivers/dsa/dsa.c | 34 +++++++++---------- .../selftests/vfio/lib/drivers/ioat/ioat.c | 16 ++++----- .../selftests/vfio/lib/include/vfio_util.h | 4 +++ .../selftests/vfio/lib/vfio_pci_device.c | 1 + 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c index 0ca2cbc2a316..8d667be80229 100644 --- a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c +++ b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c @@ -70,7 +70,7 @@ static int dsa_probe(struct vfio_pci_device *device) return -EINVAL; if (dsa_int_handle_request_required(device)) { - printf("Device requires requesting interrupt handles\n"); + dev_info(device, "Device requires requesting interrupt handles\n"); return -EINVAL; } @@ -91,23 +91,23 @@ static void dsa_check_sw_err(struct vfio_pci_device *device) return; } - fprintf(stderr, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", + dev_err(device, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", err.bits[0], err.bits[1], err.bits[2], err.bits[3]); - fprintf(stderr, " valid: 0x%x\n", err.valid); - fprintf(stderr, " overflow: 0x%x\n", err.overflow); - fprintf(stderr, " desc_valid: 0x%x\n", err.desc_valid); - fprintf(stderr, " wq_idx_valid: 0x%x\n", err.wq_idx_valid); - fprintf(stderr, " batch: 0x%x\n", err.batch); - fprintf(stderr, " fault_rw: 0x%x\n", err.fault_rw); - fprintf(stderr, " priv: 0x%x\n", err.priv); - fprintf(stderr, " error: 0x%x\n", err.error); - fprintf(stderr, " wq_idx: 0x%x\n", err.wq_idx); - fprintf(stderr, " operation: 0x%x\n", err.operation); - fprintf(stderr, " pasid: 0x%x\n", err.pasid); - fprintf(stderr, " batch_idx: 0x%x\n", err.batch_idx); - fprintf(stderr, " invalid_flags: 0x%x\n", err.invalid_flags); - fprintf(stderr, " fault_addr: 0x%lx\n", err.fault_addr); + dev_err(device, " valid: 0x%x\n", err.valid); + dev_err(device, " overflow: 0x%x\n", err.overflow); + dev_err(device, " desc_valid: 0x%x\n", err.desc_valid); + dev_err(device, " wq_idx_valid: 0x%x\n", err.wq_idx_valid); + dev_err(device, " batch: 0x%x\n", err.batch); + dev_err(device, " fault_rw: 0x%x\n", err.fault_rw); + dev_err(device, " priv: 0x%x\n", err.priv); + dev_err(device, " error: 0x%x\n", err.error); + dev_err(device, " wq_idx: 0x%x\n", err.wq_idx); + dev_err(device, " operation: 0x%x\n", err.operation); + dev_err(device, " pasid: 0x%x\n", err.pasid); + dev_err(device, " batch_idx: 0x%x\n", err.batch_idx); + dev_err(device, " invalid_flags: 0x%x\n", err.invalid_flags); + dev_err(device, " fault_addr: 0x%lx\n", err.fault_addr); VFIO_FAIL("Software Error Detected!\n"); } @@ -256,7 +256,7 @@ static int dsa_completion_wait(struct vfio_pci_device *device, if (status == DSA_COMP_SUCCESS) return 0; - printf("Error detected during memcpy operation: 0x%x\n", status); + dev_info(device, "Error detected during memcpy operation: 0x%x\n", status); return -1; } diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c index c3b91d9b1f59..e04dce1d544c 100644 --- a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c +++ b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c @@ -51,7 +51,7 @@ static int ioat_probe(struct vfio_pci_device *device) r = 0; break; default: - printf("ioat: Unsupported version: 0x%x\n", version); + dev_info(device, "ioat: Unsupported version: 0x%x\n", version); r = -EINVAL; } return r; @@ -135,13 +135,13 @@ static void ioat_handle_error(struct vfio_pci_device *device) { void *registers = ioat_channel_registers(device); - printf("Error detected during memcpy operation!\n" - " CHANERR: 0x%x\n" - " CHANERR_INT: 0x%x\n" - " DMAUNCERRSTS: 0x%x\n", - readl(registers + IOAT_CHANERR_OFFSET), - vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET), - vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET)); + dev_info(device, "Error detected during memcpy operation!\n" + " CHANERR: 0x%x\n" + " CHANERR_INT: 0x%x\n" + " DMAUNCERRSTS: 0x%x\n", + readl(registers + IOAT_CHANERR_OFFSET), + vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET), + vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET)); ioat_reset(device); } diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 379942dc5357..babbf90688e8 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -50,6 +50,9 @@ VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ } while (0) +#define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) +#define dev_err(_dev, _fmt, ...) fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) + struct iommu_mode { const char *name; const char *container_path; @@ -172,6 +175,7 @@ struct iommu { }; struct vfio_pci_device { + const char *bdf; int fd; int group_fd; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 8f8fed4f384d..f0cf18d3de55 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -843,6 +843,7 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iomm VFIO_ASSERT_NOT_NULL(iommu); device->iommu = iommu; + device->bdf = bdf; if (iommu->mode->container_path) vfio_pci_container_setup(device, bdf); From 28a84da744dd647d07b784623ddd671c00371297 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:24 +0000 Subject: [PATCH 56/65] vfio: selftests: Upgrade driver logging to dev_err() Upgrade various logging in the VFIO selftests drivers from dev_info() to dev_err(). All of these logs indicate scenarios that may be unexpected. For example, the logging during probing indicates matching devices but that aren't supported by the driver. And the memcpy errors can indicate a problem if the caller was not trying to do something like exercise I/O fault handling. Exercising I/O fault handling is certainly a valid thing to do, but the driver can't infer the caller's expectations, so better to just log with dev_err(). Suggested-by: Raghavendra Rao Ananta Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-10-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c | 4 ++-- tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c index 8d667be80229..0afbab0d82de 100644 --- a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c +++ b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c @@ -70,7 +70,7 @@ static int dsa_probe(struct vfio_pci_device *device) return -EINVAL; if (dsa_int_handle_request_required(device)) { - dev_info(device, "Device requires requesting interrupt handles\n"); + dev_err(device, "Device requires requesting interrupt handles\n"); return -EINVAL; } @@ -256,7 +256,7 @@ static int dsa_completion_wait(struct vfio_pci_device *device, if (status == DSA_COMP_SUCCESS) return 0; - dev_info(device, "Error detected during memcpy operation: 0x%x\n", status); + dev_err(device, "Error detected during memcpy operation: 0x%x\n", status); return -1; } diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c index e04dce1d544c..c6db311ce64d 100644 --- a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c +++ b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c @@ -51,7 +51,7 @@ static int ioat_probe(struct vfio_pci_device *device) r = 0; break; default: - dev_info(device, "ioat: Unsupported version: 0x%x\n", version); + dev_err(device, "ioat: Unsupported version: 0x%x\n", version); r = -EINVAL; } return r; @@ -135,7 +135,7 @@ static void ioat_handle_error(struct vfio_pci_device *device) { void *registers = ioat_channel_registers(device); - dev_info(device, "Error detected during memcpy operation!\n" + dev_err(device, "Error detected during memcpy operation!\n" " CHANERR: 0x%x\n" " CHANERR_INT: 0x%x\n" " DMAUNCERRSTS: 0x%x\n", From 9a659d74f2a455cbc595c6cb4dfefc16b09f86ed Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:25 +0000 Subject: [PATCH 57/65] vfio: selftests: Rename struct vfio_dma_region to dma_region Rename struct vfio_dma_region to dma_region. This is in preparation for separating the VFIO PCI device library code from the IOMMU library code. This name change also better reflects the fact that DMA mappings can be managed by either VFIO or IOMMUFD. i.e. the "vfio_" prefix is misleading. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-11-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/vfio_util.h | 12 ++++++------ tools/testing/selftests/vfio/lib/vfio_pci_device.c | 12 ++++++------ tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 12 ++++++------ tools/testing/selftests/vfio/vfio_pci_driver_test.c | 6 +++--- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index babbf90688e8..7784422116de 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -80,7 +80,7 @@ typedef u64 iova_t; #define INVALID_IOVA UINT64_MAX -struct vfio_dma_region { +struct dma_region { struct list_head link; void *vaddr; iova_t iova; @@ -154,7 +154,7 @@ struct vfio_pci_driver { bool memcpy_in_progress; /* Region to be used by the driver (e.g. for in-memory descriptors) */ - struct vfio_dma_region region; + struct dma_region region; /* The maximum size that can be passed to memcpy_start(). */ u64 max_memcpy_size; @@ -236,20 +236,20 @@ void iova_allocator_cleanup(struct iova_allocator *allocator); iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region); + struct dma_region *region); int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region, + struct dma_region *region, u64 *unmapped); int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped); static inline void vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) + struct dma_region *region) { VFIO_ASSERT_EQ(__vfio_pci_dma_map(device, region), 0); } static inline void vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region) + struct dma_region *region) { VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0); } diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index f0cf18d3de55..422ad8dfad95 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -275,7 +275,7 @@ iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) { - struct vfio_dma_region *region; + struct dma_region *region; list_for_each_entry(region, &device->iommu->dma_regions, link) { if (vaddr < region->vaddr) @@ -387,7 +387,7 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, } static int vfio_iommu_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) + struct dma_region *region) { struct vfio_iommu_type1_dma_map args = { .argsz = sizeof(args), @@ -404,7 +404,7 @@ static int vfio_iommu_dma_map(struct vfio_pci_device *device, } static int iommufd_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) + struct dma_region *region) { struct iommu_ioas_map args = { .size = sizeof(args), @@ -424,7 +424,7 @@ static int iommufd_dma_map(struct vfio_pci_device *device, } int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) + struct dma_region *region) { int ret; @@ -480,7 +480,7 @@ static int iommufd_dma_unmap(int fd, u64 iova, u64 length, u32 ioas_id, } int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region, u64 *unmapped) + struct dma_region *region, u64 *unmapped) { int ret; @@ -504,7 +504,7 @@ int __vfio_pci_dma_unmap(struct vfio_pci_device *device, int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped) { int ret; - struct vfio_dma_region *curr, *next; + struct dma_region *curr, *next; if (device->iommu->iommufd) ret = iommufd_dma_unmap(device->iommu->iommufd, 0, UINT64_MAX, diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 4727feb214c8..289af4665803 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -136,7 +136,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) { const u64 size = variant->size ?: getpagesize(); const int flags = variant->mmap_flags; - struct vfio_dma_region region; + struct dma_region region; struct iommu_mapping mapping; u64 mapping_size = size; u64 unmapped; @@ -208,7 +208,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) FIXTURE(vfio_dma_map_limit_test) { struct iommu *iommu; struct vfio_pci_device *device; - struct vfio_dma_region region; + struct dma_region region; size_t mmap_size; }; @@ -227,7 +227,7 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); FIXTURE_SETUP(vfio_dma_map_limit_test) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; struct iommu_iova_range *ranges; u64 region_size = getpagesize(); iova_t last_iova; @@ -264,7 +264,7 @@ FIXTURE_TEARDOWN(vfio_dma_map_limit_test) TEST_F(vfio_dma_map_limit_test, unmap_range) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; u64 unmapped; int rc; @@ -278,7 +278,7 @@ TEST_F(vfio_dma_map_limit_test, unmap_range) TEST_F(vfio_dma_map_limit_test, unmap_all) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; u64 unmapped; int rc; @@ -292,7 +292,7 @@ TEST_F(vfio_dma_map_limit_test, unmap_all) TEST_F(vfio_dma_map_limit_test, overflow) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; int rc; region->iova = ~(iova_t)0 & ~(region->size - 1); diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index b0c7d812de1f..057aa9bbe13e 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -20,7 +20,7 @@ static const char *device_bdf; static void region_setup(struct vfio_pci_device *device, struct iova_allocator *iova_allocator, - struct vfio_dma_region *region, u64 size) + struct dma_region *region, u64 size) { const int flags = MAP_SHARED | MAP_ANONYMOUS; const int prot = PROT_READ | PROT_WRITE; @@ -37,7 +37,7 @@ static void region_setup(struct vfio_pci_device *device, } static void region_teardown(struct vfio_pci_device *device, - struct vfio_dma_region *region) + struct dma_region *region) { vfio_pci_dma_unmap(device, region); VFIO_ASSERT_EQ(munmap(region->vaddr, region->size), 0); @@ -47,7 +47,7 @@ FIXTURE(vfio_pci_driver_test) { struct iommu *iommu; struct vfio_pci_device *device; struct iova_allocator *iova_allocator; - struct vfio_dma_region memcpy_region; + struct dma_region memcpy_region; void *vaddr; int msi_fd; From 2aca571089b2cc333ed5ad0ada4902aa6f7c0d3c Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:26 +0000 Subject: [PATCH 58/65] vfio: selftests: Move IOMMU library code into iommu.c Move all the IOMMU related library code into their own file iommu.c. This provides a better separation between the vfio_pci_device helper code and the iommu code. No function change intended. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-12-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 74 ++- tools/testing/selftests/vfio/lib/iommu.c | 461 ++++++++++++++++++ tools/testing/selftests/vfio/lib/libvfio.mk | 3 +- .../selftests/vfio/lib/vfio_pci_device.c | 442 ----------------- 4 files changed, 527 insertions(+), 453 deletions(-) create mode 100644 tools/testing/selftests/vfio/lib/iommu.c diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 7784422116de..f67915d9443e 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -50,6 +50,12 @@ VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ } while (0) +#define ioctl_assert(_fd, _op, _arg) do { \ + void *__arg = (_arg); \ + int __ret = ioctl((_fd), (_op), (__arg)); \ + VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ +} while (0) + #define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) #define dev_err(_dev, _fmt, ...) fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) @@ -223,24 +229,52 @@ extern const char *default_iommu_mode; struct iommu *iommu_init(const char *iommu_mode); void iommu_cleanup(struct iommu *iommu); +int __iommu_map(struct iommu *iommu, struct dma_region *region); + +static inline void iommu_map(struct iommu *iommu, struct dma_region *region) +{ + VFIO_ASSERT_EQ(__iommu_map(iommu, region), 0); +} + +int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped); + +static inline void iommu_unmap(struct iommu *iommu, struct dma_region *region) +{ + VFIO_ASSERT_EQ(__iommu_unmap(iommu, region, NULL), 0); +} + +int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped); + +static inline void iommu_unmap_all(struct iommu *iommu) +{ + VFIO_ASSERT_EQ(__iommu_unmap_all(iommu, NULL), 0); +} + +iova_t __iommu_hva2iova(struct iommu *iommu, void *vaddr); +iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); + +struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); + struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu); void vfio_pci_device_cleanup(struct vfio_pci_device *device); void vfio_pci_device_reset(struct vfio_pci_device *device); -struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, - u32 *nranges); +static inline struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + return iommu_iova_ranges(device->iommu, nranges); +} struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device); void iova_allocator_cleanup(struct iova_allocator *allocator); iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); -int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct dma_region *region); -int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct dma_region *region, - u64 *unmapped); -int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped); +static inline int __vfio_pci_dma_map(struct vfio_pci_device *device, + struct dma_region *region) +{ + return __iommu_map(device->iommu, region); +} static inline void vfio_pci_dma_map(struct vfio_pci_device *device, struct dma_region *region) @@ -248,12 +282,25 @@ static inline void vfio_pci_dma_map(struct vfio_pci_device *device, VFIO_ASSERT_EQ(__vfio_pci_dma_map(device, region), 0); } +static inline int __vfio_pci_dma_unmap(struct vfio_pci_device *device, + struct dma_region *region, + u64 *unmapped) +{ + return __iommu_unmap(device->iommu, region, unmapped); +} + static inline void vfio_pci_dma_unmap(struct vfio_pci_device *device, struct dma_region *region) { VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0); } +static inline int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, + u64 *unmapped) +{ + return __iommu_unmap_all(device->iommu, unmapped); +} + static inline void vfio_pci_dma_unmap_all(struct vfio_pci_device *device) { VFIO_ASSERT_EQ(__vfio_pci_dma_unmap_all(device, NULL), 0); @@ -319,8 +366,15 @@ static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX); } -iova_t __to_iova(struct vfio_pci_device *device, void *vaddr); -iova_t to_iova(struct vfio_pci_device *device, void *vaddr); +static inline iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) +{ + return __iommu_hva2iova(device->iommu, vaddr); +} + +static inline iova_t to_iova(struct vfio_pci_device *device, void *vaddr) +{ + return iommu_hva2iova(device->iommu, vaddr); +} static inline bool vfio_pci_device_match(struct vfio_pci_device *device, u16 vendor_id, u16 device_id) diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c new file mode 100644 index 000000000000..3933079fc419 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/iommu.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../../../kselftest.h" +#include + +const char *default_iommu_mode = "iommufd"; + +/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */ +static const struct iommu_mode iommu_modes[] = { + { + .name = "vfio_type1_iommu", + .container_path = "/dev/vfio/vfio", + .iommu_type = VFIO_TYPE1_IOMMU, + }, + { + .name = "vfio_type1v2_iommu", + .container_path = "/dev/vfio/vfio", + .iommu_type = VFIO_TYPE1v2_IOMMU, + }, + { + .name = "iommufd_compat_type1", + .container_path = "/dev/iommu", + .iommu_type = VFIO_TYPE1_IOMMU, + }, + { + .name = "iommufd_compat_type1v2", + .container_path = "/dev/iommu", + .iommu_type = VFIO_TYPE1v2_IOMMU, + }, + { + .name = "iommufd", + }, +}; + +static const struct iommu_mode *lookup_iommu_mode(const char *iommu_mode) +{ + int i; + + if (!iommu_mode) + iommu_mode = default_iommu_mode; + + for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) { + if (strcmp(iommu_mode, iommu_modes[i].name)) + continue; + + return &iommu_modes[i]; + } + + VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); +} + +iova_t __iommu_hva2iova(struct iommu *iommu, void *vaddr) +{ + struct dma_region *region; + + list_for_each_entry(region, &iommu->dma_regions, link) { + if (vaddr < region->vaddr) + continue; + + if (vaddr >= region->vaddr + region->size) + continue; + + return region->iova + (vaddr - region->vaddr); + } + + return INVALID_IOVA; +} + +iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr) +{ + iova_t iova; + + iova = __iommu_hva2iova(iommu, vaddr); + VFIO_ASSERT_NE(iova, INVALID_IOVA, "%p is not mapped into IOMMU\n", vaddr); + + return iova; +} + +static int vfio_iommu_map(struct iommu *iommu, struct dma_region *region) +{ + struct vfio_iommu_type1_dma_map args = { + .argsz = sizeof(args), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .vaddr = (u64)region->vaddr, + .iova = region->iova, + .size = region->size, + }; + + if (ioctl(iommu->container_fd, VFIO_IOMMU_MAP_DMA, &args)) + return -errno; + + return 0; +} + +static int iommufd_map(struct iommu *iommu, struct dma_region *region) +{ + struct iommu_ioas_map args = { + .size = sizeof(args), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .user_va = (u64)region->vaddr, + .iova = region->iova, + .length = region->size, + .ioas_id = iommu->ioas_id, + }; + + if (ioctl(iommu->iommufd, IOMMU_IOAS_MAP, &args)) + return -errno; + + return 0; +} + +int __iommu_map(struct iommu *iommu, struct dma_region *region) +{ + int ret; + + if (iommu->iommufd) + ret = iommufd_map(iommu, region); + else + ret = vfio_iommu_map(iommu, region); + + if (ret) + return ret; + + list_add(®ion->link, &iommu->dma_regions); + + return 0; +} + +static int __vfio_iommu_unmap(int fd, u64 iova, u64 size, u32 flags, u64 *unmapped) +{ + struct vfio_iommu_type1_dma_unmap args = { + .argsz = sizeof(args), + .iova = iova, + .size = size, + .flags = flags, + }; + + if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args)) + return -errno; + + if (unmapped) + *unmapped = args.size; + + return 0; +} + +static int vfio_iommu_unmap(struct iommu *iommu, struct dma_region *region, + u64 *unmapped) +{ + return __vfio_iommu_unmap(iommu->container_fd, region->iova, + region->size, 0, unmapped); +} + +static int __iommufd_unmap(int fd, u64 iova, u64 length, u32 ioas_id, u64 *unmapped) +{ + struct iommu_ioas_unmap args = { + .size = sizeof(args), + .iova = iova, + .length = length, + .ioas_id = ioas_id, + }; + + if (ioctl(fd, IOMMU_IOAS_UNMAP, &args)) + return -errno; + + if (unmapped) + *unmapped = args.length; + + return 0; +} + +static int iommufd_unmap(struct iommu *iommu, struct dma_region *region, + u64 *unmapped) +{ + return __iommufd_unmap(iommu->iommufd, region->iova, region->size, + iommu->ioas_id, unmapped); +} + +int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped) +{ + int ret; + + if (iommu->iommufd) + ret = iommufd_unmap(iommu, region, unmapped); + else + ret = vfio_iommu_unmap(iommu, region, unmapped); + + if (ret) + return ret; + + list_del_init(®ion->link); + + return 0; +} + +int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped) +{ + int ret; + struct dma_region *curr, *next; + + if (iommu->iommufd) + ret = __iommufd_unmap(iommu->iommufd, 0, UINT64_MAX, + iommu->ioas_id, unmapped); + else + ret = __vfio_iommu_unmap(iommu->container_fd, 0, 0, + VFIO_DMA_UNMAP_FLAG_ALL, unmapped); + + if (ret) + return ret; + + list_for_each_entry_safe(curr, next, &iommu->dma_regions, link) + list_del_init(&curr->link); + + return 0; +} + +static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, + u32 *cap_offset) +{ + struct vfio_info_cap_header *hdr; + + if (!*cap_offset) + return NULL; + + VFIO_ASSERT_LT(*cap_offset, bufsz); + VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); + + hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); + *cap_offset = hdr->next; + + return hdr; +} + +static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, + u16 cap_id) +{ + struct vfio_info_cap_header *hdr; + u32 cap_offset = info->cap_offset; + u32 max_depth; + u32 depth = 0; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) + return NULL; + + if (cap_offset) + VFIO_ASSERT_GE(cap_offset, sizeof(*info)); + + max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); + + while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { + depth++; + VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); + + if (hdr->id == cap_id) + return hdr; + } + + return NULL; +} + +/* Return buffer including capability chain, if present. Free with free() */ +static struct vfio_iommu_type1_info *vfio_iommu_get_info(int container_fd) +{ + struct vfio_iommu_type1_info *info; + + info = malloc(sizeof(*info)); + VFIO_ASSERT_NOT_NULL(info); + + *info = (struct vfio_iommu_type1_info) { + .argsz = sizeof(*info), + }; + + ioctl_assert(container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + info = realloc(info, info->argsz); + VFIO_ASSERT_NOT_NULL(info); + + ioctl_assert(container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + return info; +} + +/* + * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to + * report iommufd's iommu_iova_range. Free with free(). + */ +static struct iommu_iova_range *vfio_iommu_iova_ranges(struct iommu *iommu, + u32 *nranges) +{ + struct vfio_iommu_type1_info_cap_iova_range *cap_range; + struct vfio_iommu_type1_info *info; + struct vfio_info_cap_header *hdr; + struct iommu_iova_range *ranges = NULL; + + info = vfio_iommu_get_info(iommu->container_fd); + hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + VFIO_ASSERT_NOT_NULL(hdr); + + cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); + VFIO_ASSERT_GT(cap_range->nr_iovas, 0); + + ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + for (u32 i = 0; i < cap_range->nr_iovas; i++) { + ranges[i] = (struct iommu_iova_range){ + .start = cap_range->iova_ranges[i].start, + .last = cap_range->iova_ranges[i].end, + }; + } + + *nranges = cap_range->nr_iovas; + + free(info); + return ranges; +} + +/* Return iova ranges of the device's IOAS. Free with free() */ +static struct iommu_iova_range *iommufd_iova_ranges(struct iommu *iommu, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + int ret; + + struct iommu_ioas_iova_ranges query = { + .size = sizeof(query), + .ioas_id = iommu->ioas_id, + }; + + ret = ioctl(iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + VFIO_ASSERT_EQ(ret, -1); + VFIO_ASSERT_EQ(errno, EMSGSIZE); + VFIO_ASSERT_GT(query.num_iovas, 0); + + ranges = calloc(query.num_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + query.allowed_iovas = (uintptr_t)ranges; + + ioctl_assert(iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + *nranges = query.num_iovas; + + return ranges; +} + +static int iova_range_comp(const void *a, const void *b) +{ + const struct iommu_iova_range *ra = a, *rb = b; + + if (ra->start < rb->start) + return -1; + + if (ra->start > rb->start) + return 1; + + return 0; +} + +/* Return sorted IOVA ranges of the device. Free with free(). */ +struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges) +{ + struct iommu_iova_range *ranges; + + if (iommu->iommufd) + ranges = iommufd_iova_ranges(iommu, nranges); + else + ranges = vfio_iommu_iova_ranges(iommu, nranges); + + if (!ranges) + return NULL; + + VFIO_ASSERT_GT(*nranges, 0); + + /* Sort and check that ranges are sane and non-overlapping */ + qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); + VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); + + for (u32 i = 1; i < *nranges; i++) { + VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); + VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); + } + + return ranges; +} + +static u32 iommufd_ioas_alloc(int iommufd) +{ + struct iommu_ioas_alloc args = { + .size = sizeof(args), + }; + + ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args); + return args.out_ioas_id; +} + +struct iommu *iommu_init(const char *iommu_mode) +{ + const char *container_path; + struct iommu *iommu; + int version; + + iommu = calloc(1, sizeof(*iommu)); + VFIO_ASSERT_NOT_NULL(iommu); + + INIT_LIST_HEAD(&iommu->dma_regions); + + iommu->mode = lookup_iommu_mode(iommu_mode); + + container_path = iommu->mode->container_path; + if (container_path) { + iommu->container_fd = open(container_path, O_RDWR); + VFIO_ASSERT_GE(iommu->container_fd, 0, "open(%s) failed\n", container_path); + + version = ioctl(iommu->container_fd, VFIO_GET_API_VERSION); + VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); + } else { + /* + * Require device->iommufd to be >0 so that a simple non-0 check can be + * used to check if iommufd is enabled. In practice open() will never + * return 0 unless stdin is closed. + */ + iommu->iommufd = open("/dev/iommu", O_RDWR); + VFIO_ASSERT_GT(iommu->iommufd, 0); + + iommu->ioas_id = iommufd_ioas_alloc(iommu->iommufd); + } + + return iommu; +} + +void iommu_cleanup(struct iommu *iommu) +{ + if (iommu->iommufd) + VFIO_ASSERT_EQ(close(iommu->iommufd), 0); + else + VFIO_ASSERT_EQ(close(iommu->container_fd), 0); + + free(iommu); +} diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 3c0cdac30cb6..7ecf2ad75c67 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -3,7 +3,8 @@ ARCH ?= $(SUBARCH) LIBVFIO_SRCDIR := $(selfdir)/vfio/lib -LIBVFIO_C := vfio_pci_device.c +LIBVFIO_C := iommu.c +LIBVFIO_C += vfio_pci_device.c LIBVFIO_C += vfio_pci_driver.c ifeq ($(ARCH:x86_64=x86),x86) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 422ad8dfad95..10fc016a2a3e 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -24,184 +24,6 @@ #define PCI_SYSFS_PATH "/sys/bus/pci/devices" -#define ioctl_assert(_fd, _op, _arg) do { \ - void *__arg = (_arg); \ - int __ret = ioctl((_fd), (_op), (__arg)); \ - VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ -} while (0) - -static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, - u32 *cap_offset) -{ - struct vfio_info_cap_header *hdr; - - if (!*cap_offset) - return NULL; - - VFIO_ASSERT_LT(*cap_offset, bufsz); - VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); - - hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); - *cap_offset = hdr->next; - - return hdr; -} - -static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, - u16 cap_id) -{ - struct vfio_info_cap_header *hdr; - u32 cap_offset = info->cap_offset; - u32 max_depth; - u32 depth = 0; - - if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) - return NULL; - - if (cap_offset) - VFIO_ASSERT_GE(cap_offset, sizeof(*info)); - - max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); - - while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { - depth++; - VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); - - if (hdr->id == cap_id) - return hdr; - } - - return NULL; -} - -/* Return buffer including capability chain, if present. Free with free() */ -static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device) -{ - struct vfio_iommu_type1_info *info; - - info = malloc(sizeof(*info)); - VFIO_ASSERT_NOT_NULL(info); - - *info = (struct vfio_iommu_type1_info) { - .argsz = sizeof(*info), - }; - - ioctl_assert(device->iommu->container_fd, VFIO_IOMMU_GET_INFO, info); - VFIO_ASSERT_GE(info->argsz, sizeof(*info)); - - info = realloc(info, info->argsz); - VFIO_ASSERT_NOT_NULL(info); - - ioctl_assert(device->iommu->container_fd, VFIO_IOMMU_GET_INFO, info); - VFIO_ASSERT_GE(info->argsz, sizeof(*info)); - - return info; -} - -/* - * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to - * report iommufd's iommu_iova_range. Free with free(). - */ -static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct vfio_iommu_type1_info_cap_iova_range *cap_range; - struct vfio_iommu_type1_info *info; - struct vfio_info_cap_header *hdr; - struct iommu_iova_range *ranges = NULL; - - info = vfio_iommu_get_info(device); - hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); - VFIO_ASSERT_NOT_NULL(hdr); - - cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); - VFIO_ASSERT_GT(cap_range->nr_iovas, 0); - - ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); - VFIO_ASSERT_NOT_NULL(ranges); - - for (u32 i = 0; i < cap_range->nr_iovas; i++) { - ranges[i] = (struct iommu_iova_range){ - .start = cap_range->iova_ranges[i].start, - .last = cap_range->iova_ranges[i].end, - }; - } - - *nranges = cap_range->nr_iovas; - - free(info); - return ranges; -} - -/* Return iova ranges of the device's IOAS. Free with free() */ -static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct iommu_iova_range *ranges; - int ret; - - struct iommu_ioas_iova_ranges query = { - .size = sizeof(query), - .ioas_id = device->iommu->ioas_id, - }; - - ret = ioctl(device->iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); - VFIO_ASSERT_EQ(ret, -1); - VFIO_ASSERT_EQ(errno, EMSGSIZE); - VFIO_ASSERT_GT(query.num_iovas, 0); - - ranges = calloc(query.num_iovas, sizeof(*ranges)); - VFIO_ASSERT_NOT_NULL(ranges); - - query.allowed_iovas = (uintptr_t)ranges; - - ioctl_assert(device->iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); - *nranges = query.num_iovas; - - return ranges; -} - -static int iova_range_comp(const void *a, const void *b) -{ - const struct iommu_iova_range *ra = a, *rb = b; - - if (ra->start < rb->start) - return -1; - - if (ra->start > rb->start) - return 1; - - return 0; -} - -/* Return sorted IOVA ranges of the device. Free with free(). */ -struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct iommu_iova_range *ranges; - - if (device->iommu->iommufd) - ranges = iommufd_iova_ranges(device, nranges); - else - ranges = vfio_iommu_iova_ranges(device, nranges); - - if (!ranges) - return NULL; - - VFIO_ASSERT_GT(*nranges, 0); - - /* Sort and check that ranges are sane and non-overlapping */ - qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); - VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); - - for (u32 i = 1; i < *nranges; i++) { - VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); - VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); - } - - return ranges; -} - struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) { struct iova_allocator *allocator; @@ -273,33 +95,6 @@ iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) } } -iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) -{ - struct dma_region *region; - - list_for_each_entry(region, &device->iommu->dma_regions, link) { - if (vaddr < region->vaddr) - continue; - - if (vaddr >= region->vaddr + region->size) - continue; - - return region->iova + (vaddr - region->vaddr); - } - - return INVALID_IOVA; -} - -iova_t to_iova(struct vfio_pci_device *device, void *vaddr) -{ - iova_t iova; - - iova = __to_iova(device, vaddr); - VFIO_ASSERT_NE(iova, INVALID_IOVA, "%p is not mapped into device.\n", vaddr); - - return iova; -} - static void vfio_pci_irq_set(struct vfio_pci_device *device, u32 index, u32 vector, u32 count, int *fds) { @@ -386,142 +181,6 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); } -static int vfio_iommu_dma_map(struct vfio_pci_device *device, - struct dma_region *region) -{ - struct vfio_iommu_type1_dma_map args = { - .argsz = sizeof(args), - .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, - .vaddr = (u64)region->vaddr, - .iova = region->iova, - .size = region->size, - }; - - if (ioctl(device->iommu->container_fd, VFIO_IOMMU_MAP_DMA, &args)) - return -errno; - - return 0; -} - -static int iommufd_dma_map(struct vfio_pci_device *device, - struct dma_region *region) -{ - struct iommu_ioas_map args = { - .size = sizeof(args), - .flags = IOMMU_IOAS_MAP_READABLE | - IOMMU_IOAS_MAP_WRITEABLE | - IOMMU_IOAS_MAP_FIXED_IOVA, - .user_va = (u64)region->vaddr, - .iova = region->iova, - .length = region->size, - .ioas_id = device->iommu->ioas_id, - }; - - if (ioctl(device->iommu->iommufd, IOMMU_IOAS_MAP, &args)) - return -errno; - - return 0; -} - -int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct dma_region *region) -{ - int ret; - - if (device->iommu->iommufd) - ret = iommufd_dma_map(device, region); - else - ret = vfio_iommu_dma_map(device, region); - - if (ret) - return ret; - - list_add(®ion->link, &device->iommu->dma_regions); - - return 0; -} - -static int vfio_iommu_dma_unmap(int fd, u64 iova, u64 size, u32 flags, - u64 *unmapped) -{ - struct vfio_iommu_type1_dma_unmap args = { - .argsz = sizeof(args), - .iova = iova, - .size = size, - .flags = flags, - }; - - if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args)) - return -errno; - - if (unmapped) - *unmapped = args.size; - - return 0; -} - -static int iommufd_dma_unmap(int fd, u64 iova, u64 length, u32 ioas_id, - u64 *unmapped) -{ - struct iommu_ioas_unmap args = { - .size = sizeof(args), - .iova = iova, - .length = length, - .ioas_id = ioas_id, - }; - - if (ioctl(fd, IOMMU_IOAS_UNMAP, &args)) - return -errno; - - if (unmapped) - *unmapped = args.length; - - return 0; -} - -int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct dma_region *region, u64 *unmapped) -{ - int ret; - - if (device->iommu->iommufd) - ret = iommufd_dma_unmap(device->iommu->iommufd, region->iova, - region->size, device->iommu->ioas_id, - unmapped); - else - ret = vfio_iommu_dma_unmap(device->iommu->container_fd, - region->iova, region->size, 0, - unmapped); - - if (ret) - return ret; - - list_del_init(®ion->link); - - return 0; -} - -int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped) -{ - int ret; - struct dma_region *curr, *next; - - if (device->iommu->iommufd) - ret = iommufd_dma_unmap(device->iommu->iommufd, 0, UINT64_MAX, - device->iommu->ioas_id, unmapped); - else - ret = vfio_iommu_dma_unmap(device->iommu->container_fd, 0, 0, - VFIO_DMA_UNMAP_FLAG_ALL, unmapped); - - if (ret) - return ret; - - list_for_each_entry_safe(curr, next, &device->iommu->dma_regions, link) - list_del_init(&curr->link); - - return 0; -} - static void vfio_pci_region_get(struct vfio_pci_device *device, int index, struct vfio_region_info *info) { @@ -711,52 +370,6 @@ const char *vfio_pci_get_cdev_path(const char *bdf) return cdev_path; } -/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */ -static const struct iommu_mode iommu_modes[] = { - { - .name = "vfio_type1_iommu", - .container_path = "/dev/vfio/vfio", - .iommu_type = VFIO_TYPE1_IOMMU, - }, - { - .name = "vfio_type1v2_iommu", - .container_path = "/dev/vfio/vfio", - .iommu_type = VFIO_TYPE1v2_IOMMU, - }, - { - .name = "iommufd_compat_type1", - .container_path = "/dev/iommu", - .iommu_type = VFIO_TYPE1_IOMMU, - }, - { - .name = "iommufd_compat_type1v2", - .container_path = "/dev/iommu", - .iommu_type = VFIO_TYPE1v2_IOMMU, - }, - { - .name = "iommufd", - }, -}; - -const char *default_iommu_mode = "iommufd"; - -static const struct iommu_mode *lookup_iommu_mode(const char *iommu_mode) -{ - int i; - - if (!iommu_mode) - iommu_mode = default_iommu_mode; - - for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) { - if (strcmp(iommu_mode, iommu_modes[i].name)) - continue; - - return &iommu_modes[i]; - } - - VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); -} - static void vfio_device_bind_iommufd(int device_fd, int iommufd) { struct vfio_device_bind_iommufd args = { @@ -767,16 +380,6 @@ static void vfio_device_bind_iommufd(int device_fd, int iommufd) ioctl_assert(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args); } -static u32 iommufd_ioas_alloc(int iommufd) -{ - struct iommu_ioas_alloc args = { - .size = sizeof(args), - }; - - ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args); - return args.out_ioas_id; -} - static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id) { struct vfio_device_attach_iommufd_pt args = { @@ -799,41 +402,6 @@ static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *b vfio_device_attach_iommufd_pt(device->fd, device->iommu->ioas_id); } -struct iommu *iommu_init(const char *iommu_mode) -{ - const char *container_path; - struct iommu *iommu; - int version; - - iommu = calloc(1, sizeof(*iommu)); - VFIO_ASSERT_NOT_NULL(iommu); - - INIT_LIST_HEAD(&iommu->dma_regions); - - iommu->mode = lookup_iommu_mode(iommu_mode); - - container_path = iommu->mode->container_path; - if (container_path) { - iommu->container_fd = open(container_path, O_RDWR); - VFIO_ASSERT_GE(iommu->container_fd, 0, "open(%s) failed\n", container_path); - - version = ioctl(iommu->container_fd, VFIO_GET_API_VERSION); - VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); - } else { - /* - * Require device->iommufd to be >0 so that a simple non-0 check can be - * used to check if iommufd is enabled. In practice open() will never - * return 0 unless stdin is closed. - */ - iommu->iommufd = open("/dev/iommu", O_RDWR); - VFIO_ASSERT_GT(iommu->iommufd, 0); - - iommu->ioas_id = iommufd_ioas_alloc(iommu->iommufd); - } - - return iommu; -} - struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu) { struct vfio_pci_device *device; @@ -880,16 +448,6 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) free(device); } -void iommu_cleanup(struct iommu *iommu) -{ - if (iommu->iommufd) - VFIO_ASSERT_EQ(close(iommu->iommufd), 0); - else - VFIO_ASSERT_EQ(close(iommu->container_fd), 0); - - free(iommu); -} - static bool is_bdf(const char *str) { unsigned int s, b, d, f; From 2607a4361312bf407931597423fe23db406ca8d3 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:27 +0000 Subject: [PATCH 59/65] vfio: selftests: Move IOVA allocator into iova_allocator.c Move the IOVA allocator into its own file, to provide better separation between the allocator and the struct vfio_pci_device helper code. The allocator could go into iommu.c, but it is standalone enough that a separate file seems cleaner. This also continues the trend of having a .c for every major object in VFIO selftests (vfio_pci_device.c, vfio_pci_driver.c, iommu.c, and now iova_allocator.c). No functional change intended. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-13-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/iova_allocator.c | 94 +++++++++++++++++++ tools/testing/selftests/vfio/lib/libvfio.mk | 1 + .../selftests/vfio/lib/vfio_pci_device.c | 71 -------------- 3 files changed, 95 insertions(+), 71 deletions(-) create mode 100644 tools/testing/selftests/vfio/lib/iova_allocator.c diff --git a/tools/testing/selftests/vfio/lib/iova_allocator.c b/tools/testing/selftests/vfio/lib/iova_allocator.c new file mode 100644 index 000000000000..f03648361ba2 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/iova_allocator.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) +{ + struct iova_allocator *allocator; + struct iommu_iova_range *ranges; + u32 nranges; + + ranges = vfio_pci_iova_ranges(device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + + allocator = malloc(sizeof(*allocator)); + VFIO_ASSERT_NOT_NULL(allocator); + + *allocator = (struct iova_allocator){ + .ranges = ranges, + .nranges = nranges, + .range_idx = 0, + .range_offset = 0, + }; + + return allocator; +} + +void iova_allocator_cleanup(struct iova_allocator *allocator) +{ + free(allocator->ranges); + free(allocator); +} + +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) +{ + VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); + VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); + + for (;;) { + struct iommu_iova_range *range; + iova_t iova, last; + + VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, + "IOVA allocator out of space\n"); + + range = &allocator->ranges[allocator->range_idx]; + iova = range->start + allocator->range_offset; + + /* Check for sufficient space at the current offset */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + /* Align iova to size */ + iova = last & ~(size - 1); + + /* Check for sufficient space at the aligned iova */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + if (last == range->last) { + allocator->range_idx++; + allocator->range_offset = 0; + } else { + allocator->range_offset = last - range->start + 1; + } + + return iova; + +next_range: + allocator->range_idx++; + allocator->range_offset = 0; + } +} + diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 7ecf2ad75c67..f15b966877e9 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -4,6 +4,7 @@ ARCH ?= $(SUBARCH) LIBVFIO_SRCDIR := $(selfdir)/vfio/lib LIBVFIO_C := iommu.c +LIBVFIO_C += iova_allocator.c LIBVFIO_C += vfio_pci_device.c LIBVFIO_C += vfio_pci_driver.c diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 10fc016a2a3e..aa5b45052c77 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -24,77 +24,6 @@ #define PCI_SYSFS_PATH "/sys/bus/pci/devices" -struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) -{ - struct iova_allocator *allocator; - struct iommu_iova_range *ranges; - u32 nranges; - - ranges = vfio_pci_iova_ranges(device, &nranges); - VFIO_ASSERT_NOT_NULL(ranges); - - allocator = malloc(sizeof(*allocator)); - VFIO_ASSERT_NOT_NULL(allocator); - - *allocator = (struct iova_allocator){ - .ranges = ranges, - .nranges = nranges, - .range_idx = 0, - .range_offset = 0, - }; - - return allocator; -} - -void iova_allocator_cleanup(struct iova_allocator *allocator) -{ - free(allocator->ranges); - free(allocator); -} - -iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) -{ - VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); - VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); - - for (;;) { - struct iommu_iova_range *range; - iova_t iova, last; - - VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, - "IOVA allocator out of space\n"); - - range = &allocator->ranges[allocator->range_idx]; - iova = range->start + allocator->range_offset; - - /* Check for sufficient space at the current offset */ - if (check_add_overflow(iova, size - 1, &last) || - last > range->last) - goto next_range; - - /* Align iova to size */ - iova = last & ~(size - 1); - - /* Check for sufficient space at the aligned iova */ - if (check_add_overflow(iova, size - 1, &last) || - last > range->last) - goto next_range; - - if (last == range->last) { - allocator->range_idx++; - allocator->range_offset = 0; - } else { - allocator->range_offset = last - range->start + 1; - } - - return iova; - -next_range: - allocator->range_idx++; - allocator->range_offset = 0; - } -} - static void vfio_pci_irq_set(struct vfio_pci_device *device, u32 index, u32 vector, u32 count, int *fds) { From 831c37a5bf046bbdf65c042b53fd2e8cac0b1a5a Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:28 +0000 Subject: [PATCH 60/65] vfio: selftests: Stop passing device for IOMMU operations Drop the struct vfio_pci_device wrappers for IOMMU map/unmap functions and require tests to directly call iommu_map(), iommu_unmap(), etc. This results in more concise code, and also makes it clear the map operations are happening on a struct iommu, not necessarily on a specific device, especially when multi-device tests are introduced. Do the same for iova_allocator_init() as that function only needs the struct iommu, not struct vfio_pci_device. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-14-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 44 +------------------ .../selftests/vfio/lib/iova_allocator.c | 4 +- .../selftests/vfio/vfio_dma_mapping_test.c | 20 ++++----- .../selftests/vfio/vfio_pci_driver_test.c | 19 ++++---- 4 files changed, 22 insertions(+), 65 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index f67915d9443e..5224808201fe 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -260,52 +260,10 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device); void vfio_pci_device_reset(struct vfio_pci_device *device); -static inline struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - return iommu_iova_ranges(device->iommu, nranges); -} - -struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device); +struct iova_allocator *iova_allocator_init(struct iommu *iommu); void iova_allocator_cleanup(struct iova_allocator *allocator); iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); -static inline int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct dma_region *region) -{ - return __iommu_map(device->iommu, region); -} - -static inline void vfio_pci_dma_map(struct vfio_pci_device *device, - struct dma_region *region) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_map(device, region), 0); -} - -static inline int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct dma_region *region, - u64 *unmapped) -{ - return __iommu_unmap(device->iommu, region, unmapped); -} - -static inline void vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct dma_region *region) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0); -} - -static inline int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, - u64 *unmapped) -{ - return __iommu_unmap_all(device->iommu, unmapped); -} - -static inline void vfio_pci_dma_unmap_all(struct vfio_pci_device *device) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_unmap_all(device, NULL), 0); -} - void vfio_pci_config_access(struct vfio_pci_device *device, bool write, size_t config, size_t size, void *data); diff --git a/tools/testing/selftests/vfio/lib/iova_allocator.c b/tools/testing/selftests/vfio/lib/iova_allocator.c index f03648361ba2..b3b6b27f5d1e 100644 --- a/tools/testing/selftests/vfio/lib/iova_allocator.c +++ b/tools/testing/selftests/vfio/lib/iova_allocator.c @@ -21,13 +21,13 @@ #include -struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) +struct iova_allocator *iova_allocator_init(struct iommu *iommu) { struct iova_allocator *allocator; struct iommu_iova_range *ranges; u32 nranges; - ranges = vfio_pci_iova_ranges(device, &nranges); + ranges = iommu_iova_ranges(iommu, &nranges); VFIO_ASSERT_NOT_NULL(ranges); allocator = malloc(sizeof(*allocator)); diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 289af4665803..c4c2fc36c7b3 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -122,7 +122,7 @@ FIXTURE_SETUP(vfio_dma_mapping_test) { self->iommu = iommu_init(variant->iommu_mode); self->device = vfio_pci_device_init(device_bdf, self->iommu); - self->iova_allocator = iova_allocator_init(self->device); + self->iova_allocator = iova_allocator_init(self->iommu); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) @@ -153,7 +153,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) region.iova = iova_allocator_alloc(self->iova_allocator, size); region.size = size; - vfio_pci_dma_map(self->device, ®ion); + iommu_map(self->iommu, ®ion); printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova); ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr)); @@ -195,7 +195,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) } unmap: - rc = __vfio_pci_dma_unmap(self->device, ®ion, &unmapped); + rc = __iommu_unmap(self->iommu, ®ion, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region.size); printf("Unmapped IOVA 0x%lx\n", region.iova); @@ -245,7 +245,7 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ASSERT_NE(region->vaddr, MAP_FAILED); - ranges = vfio_pci_iova_ranges(self->device, &nranges); + ranges = iommu_iova_ranges(self->iommu, &nranges); VFIO_ASSERT_NOT_NULL(ranges); last_iova = ranges[nranges - 1].last; free(ranges); @@ -268,10 +268,10 @@ TEST_F(vfio_dma_map_limit_test, unmap_range) u64 unmapped; int rc; - vfio_pci_dma_map(self->device, region); + iommu_map(self->iommu, region); ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr)); - rc = __vfio_pci_dma_unmap(self->device, region, &unmapped); + rc = __iommu_unmap(self->iommu, region, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region->size); } @@ -282,10 +282,10 @@ TEST_F(vfio_dma_map_limit_test, unmap_all) u64 unmapped; int rc; - vfio_pci_dma_map(self->device, region); + iommu_map(self->iommu, region); ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr)); - rc = __vfio_pci_dma_unmap_all(self->device, &unmapped); + rc = __iommu_unmap_all(self->iommu, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region->size); } @@ -298,10 +298,10 @@ TEST_F(vfio_dma_map_limit_test, overflow) region->iova = ~(iova_t)0 & ~(region->size - 1); region->size = self->mmap_size; - rc = __vfio_pci_dma_map(self->device, region); + rc = __iommu_map(self->iommu, region); ASSERT_EQ(rc, -EOVERFLOW); - rc = __vfio_pci_dma_unmap(self->device, region, NULL); + rc = __iommu_unmap(self->iommu, region, NULL); ASSERT_EQ(rc, -EOVERFLOW); } diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 057aa9bbe13e..229e932a06f8 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -18,7 +18,7 @@ static const char *device_bdf; ASSERT_EQ(EAGAIN, errno); \ } while (0) -static void region_setup(struct vfio_pci_device *device, +static void region_setup(struct iommu *iommu, struct iova_allocator *iova_allocator, struct dma_region *region, u64 size) { @@ -33,13 +33,12 @@ static void region_setup(struct vfio_pci_device *device, region->iova = iova_allocator_alloc(iova_allocator, size); region->size = size; - vfio_pci_dma_map(device, region); + iommu_map(iommu, region); } -static void region_teardown(struct vfio_pci_device *device, - struct dma_region *region) +static void region_teardown(struct iommu *iommu, struct dma_region *region) { - vfio_pci_dma_unmap(device, region); + iommu_unmap(iommu, region); VFIO_ASSERT_EQ(munmap(region->vaddr, region->size), 0); } @@ -76,12 +75,12 @@ FIXTURE_SETUP(vfio_pci_driver_test) self->iommu = iommu_init(variant->iommu_mode); self->device = vfio_pci_device_init(device_bdf, self->iommu); - self->iova_allocator = iova_allocator_init(self->device); + self->iova_allocator = iova_allocator_init(self->iommu); driver = &self->device->driver; - region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G); - region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M); + region_setup(self->iommu, self->iova_allocator, &self->memcpy_region, SZ_1G); + region_setup(self->iommu, self->iova_allocator, &driver->region, SZ_2M); /* Any IOVA that doesn't overlap memcpy_region and driver->region. */ self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G); @@ -110,8 +109,8 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test) vfio_pci_driver_remove(self->device); - region_teardown(self->device, &self->memcpy_region); - region_teardown(self->device, &driver->region); + region_teardown(self->iommu, &self->memcpy_region); + region_teardown(self->iommu, &driver->region); iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); From 657d241e695fa7afa091c77d54ff2df703ece35d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:29 +0000 Subject: [PATCH 61/65] vfio: selftests: Rename vfio_util.h to libvfio.h Rename vfio_util.h to libvfio.h to match the name of libvfio.mk. No functional change intended. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-15-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c | 2 +- tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c | 2 +- .../selftests/vfio/lib/include/{vfio_util.h => libvfio.h} | 6 +++--- tools/testing/selftests/vfio/lib/iommu.c | 2 +- tools/testing/selftests/vfio/lib/iova_allocator.c | 2 +- tools/testing/selftests/vfio/lib/vfio_pci_device.c | 2 +- tools/testing/selftests/vfio/lib/vfio_pci_driver.c | 2 +- tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 2 +- tools/testing/selftests/vfio/vfio_iommufd_setup_test.c | 2 +- tools/testing/selftests/vfio/vfio_pci_device_test.c | 2 +- tools/testing/selftests/vfio/vfio_pci_driver_test.c | 2 +- 11 files changed, 13 insertions(+), 13 deletions(-) rename tools/testing/selftests/vfio/lib/include/{vfio_util.h => libvfio.h} (98%) diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c index 0afbab0d82de..c75045bcab79 100644 --- a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c +++ b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include "registers.h" diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c index c6db311ce64d..a871b935542b 100644 --- a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c +++ b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include "hw.h" #include "registers.h" diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/libvfio.h similarity index 98% rename from tools/testing/selftests/vfio/lib/include/vfio_util.h rename to tools/testing/selftests/vfio/lib/include/libvfio.h index 5224808201fe..3027af15e316 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H -#define SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H #include #include @@ -352,4 +352,4 @@ void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device); void vfio_pci_driver_send_msi(struct vfio_pci_device *device); -#endif /* SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H */ +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */ diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c index 3933079fc419..52f9cdf5f171 100644 --- a/tools/testing/selftests/vfio/lib/iommu.c +++ b/tools/testing/selftests/vfio/lib/iommu.c @@ -19,7 +19,7 @@ #include #include "../../../kselftest.h" -#include +#include const char *default_iommu_mode = "iommufd"; diff --git a/tools/testing/selftests/vfio/lib/iova_allocator.c b/tools/testing/selftests/vfio/lib/iova_allocator.c index b3b6b27f5d1e..a12b0a51e9e6 100644 --- a/tools/testing/selftests/vfio/lib/iova_allocator.c +++ b/tools/testing/selftests/vfio/lib/iova_allocator.c @@ -19,7 +19,7 @@ #include #include -#include +#include struct iova_allocator *iova_allocator_init(struct iommu *iommu) { diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index aa5b45052c77..847e27e1166c 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -20,7 +20,7 @@ #include #include "../../../kselftest.h" -#include +#include #define PCI_SYSFS_PATH "/sys/bus/pci/devices" diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c index abb7a62a03ea..ca0e25efbfa1 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include "../../../kselftest.h" -#include +#include #ifdef __x86_64__ extern struct vfio_pci_driver_ops dsa_ops; diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index c4c2fc36c7b3..213fcd8dcc79 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include "../kselftest_harness.h" diff --git a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c index 3655106b912d..caf1c6291f3d 100644 --- a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c +++ b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include "../kselftest_harness.h" static const char iommu_dev_path[] = "/dev/iommu"; diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index e95217933c6b..ecbb669b3765 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include "../kselftest_harness.h" diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 229e932a06f8..f0ca8310d6a8 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include "../kselftest_harness.h" From 19cf492c1bddc93b8a0144aac5295b7cf2bd24b8 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:30 +0000 Subject: [PATCH 62/65] vfio: selftests: Move vfio_selftests_*() helpers into libvfio.c Move the vfio_selftests_*() helpers into their own file libvfio.c. These helpers have nothing to do with struct vfio_pci_device, so they don't make sense in vfio_pci_device.c. No functional change intended. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-16-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/libvfio.c | 78 +++++++++++++++++++ tools/testing/selftests/vfio/lib/libvfio.mk | 1 + .../selftests/vfio/lib/vfio_pci_device.c | 71 ----------------- 3 files changed, 79 insertions(+), 71 deletions(-) create mode 100644 tools/testing/selftests/vfio/lib/libvfio.c diff --git a/tools/testing/selftests/vfio/lib/libvfio.c b/tools/testing/selftests/vfio/lib/libvfio.c new file mode 100644 index 000000000000..a23a3cc5be69 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/libvfio.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include "../../../kselftest.h" +#include + +static bool is_bdf(const char *str) +{ + unsigned int s, b, d, f; + int length, count; + + count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, &f, &length); + return count == 4 && length == strlen(str); +} + +static char **get_bdfs_cmdline(int *argc, char *argv[], int *nr_bdfs) +{ + int i; + + for (i = *argc - 1; i > 0 && is_bdf(argv[i]); i--) + continue; + + i++; + *nr_bdfs = *argc - i; + *argc -= *nr_bdfs; + + return *nr_bdfs ? &argv[i] : NULL; +} + +static char *get_bdf_env(void) +{ + char *bdf; + + bdf = getenv("VFIO_SELFTESTS_BDF"); + if (!bdf) + return NULL; + + VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); + return bdf; +} + +char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs) +{ + static char *env_bdf; + char **bdfs; + + bdfs = get_bdfs_cmdline(argc, argv, nr_bdfs); + if (bdfs) + return bdfs; + + env_bdf = get_bdf_env(); + if (env_bdf) { + *nr_bdfs = 1; + return &env_bdf; + } + + fprintf(stderr, "Unable to determine which device(s) to use, skipping test.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "To pass the device address via environment variable:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " export VFIO_SELFTESTS_BDF=\"segment:bus:device.function\"\n"); + fprintf(stderr, " %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "To pass the device address(es) via argv:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " %s [options] segment:bus:device.function ...\n", argv[0]); + fprintf(stderr, "\n"); + exit(KSFT_SKIP); +} + +const char *vfio_selftests_get_bdf(int *argc, char *argv[]) +{ + int nr_bdfs; + + return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0]; +} diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index f15b966877e9..9f47bceed16f 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -5,6 +5,7 @@ LIBVFIO_SRCDIR := $(selfdir)/vfio/lib LIBVFIO_C := iommu.c LIBVFIO_C += iova_allocator.c +LIBVFIO_C += libvfio.c LIBVFIO_C += vfio_pci_device.c LIBVFIO_C += vfio_pci_driver.c diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 847e27e1166c..13fdb4b0b10f 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -376,74 +376,3 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) free(device); } - -static bool is_bdf(const char *str) -{ - unsigned int s, b, d, f; - int length, count; - - count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, &f, &length); - return count == 4 && length == strlen(str); -} - -static char **get_bdfs_cmdline(int *argc, char *argv[], int *nr_bdfs) -{ - int i; - - for (i = *argc - 1; i > 0 && is_bdf(argv[i]); i--) - continue; - - i++; - *nr_bdfs = *argc - i; - *argc -= *nr_bdfs; - - return *nr_bdfs ? &argv[i] : NULL; -} - -static char *get_bdf_env(void) -{ - char *bdf; - - bdf = getenv("VFIO_SELFTESTS_BDF"); - if (!bdf) - return NULL; - - VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); - return bdf; -} - -char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs) -{ - static char *env_bdf; - char **bdfs; - - bdfs = get_bdfs_cmdline(argc, argv, nr_bdfs); - if (bdfs) - return bdfs; - - env_bdf = get_bdf_env(); - if (env_bdf) { - *nr_bdfs = 1; - return &env_bdf; - } - - fprintf(stderr, "Unable to determine which device(s) to use, skipping test.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "To pass the device address via environment variable:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " export VFIO_SELFTESTS_BDF=\"segment:bus:device.function\"\n"); - fprintf(stderr, " %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "To pass the device address(es) via argv:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " %s [options] segment:bus:device.function ...\n", argv[0]); - fprintf(stderr, "\n"); - exit(KSFT_SKIP); -} - -const char *vfio_selftests_get_bdf(int *argc, char *argv[]) -{ - int nr_bdfs; - - return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0]; -} From 5fabc49abf7ac630babfb1525e242848d54038c1 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:31 +0000 Subject: [PATCH 63/65] vfio: selftests: Split libvfio.h into separate header files Split out the contents of libvfio.h into separate header files, but keep libvfio.h as the top-level include that all tests can use. Put all new header files into a libvfio/ subdirectory to avoid future name conflicts in include paths when libvfio is used by other selftests like KVM. No functional change intended. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-17-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/libvfio.h | 339 +----------------- .../vfio/lib/include/libvfio/assert.h | 54 +++ .../vfio/lib/include/libvfio/iommu.h | 77 ++++ .../vfio/lib/include/libvfio/iova_allocator.h | 23 ++ .../lib/include/libvfio/vfio_pci_device.h | 125 +++++++ .../lib/include/libvfio/vfio_pci_driver.h | 97 +++++ 6 files changed, 381 insertions(+), 334 deletions(-) create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/assert.h create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/iommu.h create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h index 3027af15e316..279ddcd70194 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -2,210 +2,11 @@ #ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H #define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H -#include -#include - -#include -#include -#include -#include -#include - -#include "../../../kselftest.h" - -#define VFIO_LOG_AND_EXIT(...) do { \ - fprintf(stderr, " " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - exit(KSFT_FAIL); \ -} while (0) - -#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) do { \ - typeof(_lhs) __lhs = (_lhs); \ - typeof(_rhs) __rhs = (_rhs); \ - \ - if (__lhs _op __rhs) \ - break; \ - \ - fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \ - fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \ - fprintf(stderr, " Observed: %#lx %s %#lx\n", \ - (u64)__lhs, #_op, (u64)__rhs); \ - fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \ - VFIO_LOG_AND_EXIT(__VA_ARGS__); \ -} while (0) - -#define VFIO_ASSERT_EQ(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__) -#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__) -#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__) -#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__) -#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__) -#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__) -#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__) -#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__) -#define VFIO_ASSERT_NULL(_a, ...) VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__) -#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__) - -#define VFIO_FAIL(_fmt, ...) do { \ - fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \ - VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ -} while (0) - -#define ioctl_assert(_fd, _op, _arg) do { \ - void *__arg = (_arg); \ - int __ret = ioctl((_fd), (_op), (__arg)); \ - VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ -} while (0) - -#define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) -#define dev_err(_dev, _fmt, ...) fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) - -struct iommu_mode { - const char *name; - const char *container_path; - unsigned long iommu_type; -}; - -/* - * Generator for VFIO selftests fixture variants that replicate across all - * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() - * which should then use FIXTURE_VARIANT_ADD() to create the variant. - */ -#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__) - -struct vfio_pci_bar { - struct vfio_region_info info; - void *vaddr; -}; - -typedef u64 iova_t; - -#define INVALID_IOVA UINT64_MAX - -struct dma_region { - struct list_head link; - void *vaddr; - iova_t iova; - u64 size; -}; - -struct vfio_pci_device; - -struct vfio_pci_driver_ops { - const char *name; - - /** - * @probe() - Check if the driver supports the given device. - * - * Return: 0 on success, non-0 on failure. - */ - int (*probe)(struct vfio_pci_device *device); - - /** - * @init() - Initialize the driver for @device. - * - * Must be called after device->driver.region has been initialized. - */ - void (*init)(struct vfio_pci_device *device); - - /** - * remove() - Deinitialize the driver for @device. - */ - void (*remove)(struct vfio_pci_device *device); - - /** - * memcpy_start() - Kick off @count repeated memcpy operations from - * [@src, @src + @size) to [@dst, @dst + @size). - * - * Guarantees: - * - The device will attempt DMA reads on [src, src + size). - * - The device will attempt DMA writes on [dst, dst + size). - * - The device will not generate any interrupts. - * - * memcpy_start() returns immediately, it does not wait for the - * copies to complete. - */ - void (*memcpy_start)(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size, u64 count); - - /** - * memcpy_wait() - Wait until the memcpy operations started by - * memcpy_start() have finished. - * - * Guarantees: - * - All in-flight DMAs initiated by memcpy_start() are fully complete - * before memcpy_wait() returns. - * - * Returns non-0 if the driver detects that an error occurred during the - * memcpy, 0 otherwise. - */ - int (*memcpy_wait)(struct vfio_pci_device *device); - - /** - * send_msi() - Make the device send the MSI device->driver.msi. - * - * Guarantees: - * - The device will send the MSI once. - */ - void (*send_msi)(struct vfio_pci_device *device); -}; - -struct vfio_pci_driver { - const struct vfio_pci_driver_ops *ops; - bool initialized; - bool memcpy_in_progress; - - /* Region to be used by the driver (e.g. for in-memory descriptors) */ - struct dma_region region; - - /* The maximum size that can be passed to memcpy_start(). */ - u64 max_memcpy_size; - - /* The maximum count that can be passed to memcpy_start(). */ - u64 max_memcpy_count; - - /* The MSI vector the device will signal in ops->send_msi(). */ - int msi; -}; - -struct iommu { - const struct iommu_mode *mode; - int container_fd; - int iommufd; - u32 ioas_id; - struct list_head dma_regions; -}; - -struct vfio_pci_device { - const char *bdf; - int fd; - int group_fd; - - struct iommu *iommu; - - struct vfio_device_info info; - struct vfio_region_info config_space; - struct vfio_pci_bar bars[PCI_STD_NUM_BARS]; - - struct vfio_irq_info msi_info; - struct vfio_irq_info msix_info; - - /* eventfds for MSI and MSI-x interrupts */ - int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; - - struct vfio_pci_driver driver; -}; - -struct iova_allocator { - struct iommu_iova_range *ranges; - u32 nranges; - u32 range_idx; - u64 range_offset; -}; +#include +#include +#include +#include +#include /* * Return the BDF string of the device that the test should use. @@ -222,134 +23,4 @@ struct iova_allocator { const char *vfio_selftests_get_bdf(int *argc, char *argv[]); char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs); -const char *vfio_pci_get_cdev_path(const char *bdf); - -extern const char *default_iommu_mode; - -struct iommu *iommu_init(const char *iommu_mode); -void iommu_cleanup(struct iommu *iommu); - -int __iommu_map(struct iommu *iommu, struct dma_region *region); - -static inline void iommu_map(struct iommu *iommu, struct dma_region *region) -{ - VFIO_ASSERT_EQ(__iommu_map(iommu, region), 0); -} - -int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped); - -static inline void iommu_unmap(struct iommu *iommu, struct dma_region *region) -{ - VFIO_ASSERT_EQ(__iommu_unmap(iommu, region, NULL), 0); -} - -int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped); - -static inline void iommu_unmap_all(struct iommu *iommu) -{ - VFIO_ASSERT_EQ(__iommu_unmap_all(iommu, NULL), 0); -} - -iova_t __iommu_hva2iova(struct iommu *iommu, void *vaddr); -iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); - -struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); - -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu); -void vfio_pci_device_cleanup(struct vfio_pci_device *device); - -void vfio_pci_device_reset(struct vfio_pci_device *device); - -struct iova_allocator *iova_allocator_init(struct iommu *iommu); -void iova_allocator_cleanup(struct iova_allocator *allocator); -iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); - -void vfio_pci_config_access(struct vfio_pci_device *device, bool write, - size_t config, size_t size, void *data); - -#define vfio_pci_config_read(_device, _offset, _type) ({ \ - _type __data; \ - vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \ - __data; \ -}) - -#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8) -#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16) -#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32) - -#define vfio_pci_config_write(_device, _offset, _value, _type) do { \ - _type __data = (_value); \ - vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \ -} while (0) - -#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8) -#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16) -#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32) - -void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, - u32 vector, int count); -void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index); -void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector); - -static inline void fcntl_set_nonblock(int fd) -{ - int r; - - r = fcntl(fd, F_GETFL, 0); - VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd); - - r = fcntl(fd, F_SETFL, r | O_NONBLOCK); - VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd); -} - -static inline void vfio_pci_msi_enable(struct vfio_pci_device *device, - u32 vector, int count) -{ - vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count); -} - -static inline void vfio_pci_msi_disable(struct vfio_pci_device *device) -{ - vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX); -} - -static inline void vfio_pci_msix_enable(struct vfio_pci_device *device, - u32 vector, int count) -{ - vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count); -} - -static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) -{ - vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX); -} - -static inline iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) -{ - return __iommu_hva2iova(device->iommu, vaddr); -} - -static inline iova_t to_iova(struct vfio_pci_device *device, void *vaddr) -{ - return iommu_hva2iova(device->iommu, vaddr); -} - -static inline bool vfio_pci_device_match(struct vfio_pci_device *device, - u16 vendor_id, u16 device_id) -{ - return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) && - (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID)); -} - -void vfio_pci_driver_probe(struct vfio_pci_device *device); -void vfio_pci_driver_init(struct vfio_pci_device *device); -void vfio_pci_driver_remove(struct vfio_pci_device *device); -int vfio_pci_driver_memcpy(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size); -void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size, - u64 count); -int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device); -void vfio_pci_driver_send_msi(struct vfio_pci_device *device); - #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/assert.h b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h new file mode 100644 index 000000000000..f4ebd122d9b6 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H + +#include +#include +#include + +#include "../../../../kselftest.h" + +#define VFIO_LOG_AND_EXIT(...) do { \ + fprintf(stderr, " " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(KSFT_FAIL); \ +} while (0) + +#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) do { \ + typeof(_lhs) __lhs = (_lhs); \ + typeof(_rhs) __rhs = (_rhs); \ + \ + if (__lhs _op __rhs) \ + break; \ + \ + fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \ + fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \ + fprintf(stderr, " Observed: %#lx %s %#lx\n", \ + (u64)__lhs, #_op, (u64)__rhs); \ + fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \ + VFIO_LOG_AND_EXIT(__VA_ARGS__); \ +} while (0) + +#define VFIO_ASSERT_EQ(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__) +#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__) +#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__) +#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__) +#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__) +#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__) +#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__) +#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__) +#define VFIO_ASSERT_NULL(_a, ...) VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__) +#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__) + +#define VFIO_FAIL(_fmt, ...) do { \ + fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \ + VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ +} while (0) + +#define ioctl_assert(_fd, _op, _arg) do { \ + void *__arg = (_arg); \ + int __ret = ioctl((_fd), (_op), (__arg)); \ + VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ +} while (0) + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h new file mode 100644 index 000000000000..e35f13ed3f3c --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H + +#include +#include + +#include + +typedef u64 iova_t; +#define INVALID_IOVA UINT64_MAX + +struct iommu_mode { + const char *name; + const char *container_path; + unsigned long iommu_type; +}; + +extern const char *default_iommu_mode; + +struct dma_region { + struct list_head link; + void *vaddr; + iova_t iova; + u64 size; +}; + +struct iommu { + const struct iommu_mode *mode; + int container_fd; + int iommufd; + u32 ioas_id; + struct list_head dma_regions; +}; + +struct iommu *iommu_init(const char *iommu_mode); +void iommu_cleanup(struct iommu *iommu); + +int __iommu_map(struct iommu *iommu, struct dma_region *region); + +static inline void iommu_map(struct iommu *iommu, struct dma_region *region) +{ + VFIO_ASSERT_EQ(__iommu_map(iommu, region), 0); +} + +int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped); + +static inline void iommu_unmap(struct iommu *iommu, struct dma_region *region) +{ + VFIO_ASSERT_EQ(__iommu_unmap(iommu, region, NULL), 0); +} + +int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped); + +static inline void iommu_unmap_all(struct iommu *iommu) +{ + VFIO_ASSERT_EQ(__iommu_unmap_all(iommu, NULL), 0); +} + +iova_t __iommu_hva2iova(struct iommu *iommu, void *vaddr); +iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); + +struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); + +/* + * Generator for VFIO selftests fixture variants that replicate across all + * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() + * which should then use FIXTURE_VARIANT_ADD() to create the variant. + */ +#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__) + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h b/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h new file mode 100644 index 000000000000..8f1d994e9ea2 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H + +#include +#include +#include +#include + +#include + +struct iova_allocator { + struct iommu_iova_range *ranges; + u32 nranges; + u32 range_idx; + u64 range_offset; +}; + +struct iova_allocator *iova_allocator_init(struct iommu *iommu); +void iova_allocator_cleanup(struct iova_allocator *allocator); +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h new file mode 100644 index 000000000000..160e003131d6 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H + +#include +#include +#include + +#include +#include +#include + +struct vfio_pci_bar { + struct vfio_region_info info; + void *vaddr; +}; + +struct vfio_pci_device { + const char *bdf; + int fd; + int group_fd; + + struct iommu *iommu; + + struct vfio_device_info info; + struct vfio_region_info config_space; + struct vfio_pci_bar bars[PCI_STD_NUM_BARS]; + + struct vfio_irq_info msi_info; + struct vfio_irq_info msix_info; + + /* eventfds for MSI and MSI-x interrupts */ + int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; + + struct vfio_pci_driver driver; +}; + +#define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) +#define dev_err(_dev, _fmt, ...) fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu); +void vfio_pci_device_cleanup(struct vfio_pci_device *device); + +void vfio_pci_device_reset(struct vfio_pci_device *device); + +void vfio_pci_config_access(struct vfio_pci_device *device, bool write, + size_t config, size_t size, void *data); + +#define vfio_pci_config_read(_device, _offset, _type) ({ \ + _type __data; \ + vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \ + __data; \ +}) + +#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8) +#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16) +#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32) + +#define vfio_pci_config_write(_device, _offset, _value, _type) do { \ + _type __data = (_value); \ + vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \ +} while (0) + +#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8) +#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16) +#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32) + +void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, + u32 vector, int count); +void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index); +void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector); + +static inline void fcntl_set_nonblock(int fd) +{ + int r; + + r = fcntl(fd, F_GETFL, 0); + VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd); + + r = fcntl(fd, F_SETFL, r | O_NONBLOCK); + VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd); +} + +static inline void vfio_pci_msi_enable(struct vfio_pci_device *device, + u32 vector, int count) +{ + vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count); +} + +static inline void vfio_pci_msi_disable(struct vfio_pci_device *device) +{ + vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX); +} + +static inline void vfio_pci_msix_enable(struct vfio_pci_device *device, + u32 vector, int count) +{ + vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count); +} + +static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) +{ + vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX); +} + +static inline iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) +{ + return __iommu_hva2iova(device->iommu, vaddr); +} + +static inline iova_t to_iova(struct vfio_pci_device *device, void *vaddr) +{ + return iommu_hva2iova(device->iommu, vaddr); +} + +static inline bool vfio_pci_device_match(struct vfio_pci_device *device, + u16 vendor_id, u16 device_id) +{ + return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) && + (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID)); +} + +const char *vfio_pci_get_cdev_path(const char *bdf); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h new file mode 100644 index 000000000000..e5ada209b1d1 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H + +#include + +struct vfio_pci_device; + +struct vfio_pci_driver_ops { + const char *name; + + /** + * @probe() - Check if the driver supports the given device. + * + * Return: 0 on success, non-0 on failure. + */ + int (*probe)(struct vfio_pci_device *device); + + /** + * @init() - Initialize the driver for @device. + * + * Must be called after device->driver.region has been initialized. + */ + void (*init)(struct vfio_pci_device *device); + + /** + * remove() - Deinitialize the driver for @device. + */ + void (*remove)(struct vfio_pci_device *device); + + /** + * memcpy_start() - Kick off @count repeated memcpy operations from + * [@src, @src + @size) to [@dst, @dst + @size). + * + * Guarantees: + * - The device will attempt DMA reads on [src, src + size). + * - The device will attempt DMA writes on [dst, dst + size). + * - The device will not generate any interrupts. + * + * memcpy_start() returns immediately, it does not wait for the + * copies to complete. + */ + void (*memcpy_start)(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, u64 count); + + /** + * memcpy_wait() - Wait until the memcpy operations started by + * memcpy_start() have finished. + * + * Guarantees: + * - All in-flight DMAs initiated by memcpy_start() are fully complete + * before memcpy_wait() returns. + * + * Returns non-0 if the driver detects that an error occurred during the + * memcpy, 0 otherwise. + */ + int (*memcpy_wait)(struct vfio_pci_device *device); + + /** + * send_msi() - Make the device send the MSI device->driver.msi. + * + * Guarantees: + * - The device will send the MSI once. + */ + void (*send_msi)(struct vfio_pci_device *device); +}; + +struct vfio_pci_driver { + const struct vfio_pci_driver_ops *ops; + bool initialized; + bool memcpy_in_progress; + + /* Region to be used by the driver (e.g. for in-memory descriptors) */ + struct dma_region region; + + /* The maximum size that can be passed to memcpy_start(). */ + u64 max_memcpy_size; + + /* The maximum count that can be passed to memcpy_start(). */ + u64 max_memcpy_count; + + /* The MSI vector the device will signal in ops->send_msi(). */ + int msi; +}; + +void vfio_pci_driver_probe(struct vfio_pci_device *device); +void vfio_pci_driver_init(struct vfio_pci_device *device); +void vfio_pci_driver_remove(struct vfio_pci_device *device); +int vfio_pci_driver_memcpy(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size); +void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, + u64 count); +int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device); +void vfio_pci_driver_send_msi(struct vfio_pci_device *device); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H */ From b8e96c8805ec25878ac971cad34732a6aabe34f6 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:32 +0000 Subject: [PATCH 64/65] vfio: selftests: Eliminate INVALID_IOVA Eliminate INVALID_IOVA as there are platforms where UINT64_MAX is a valid iova. Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-18-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/libvfio/iommu.h | 3 +-- .../vfio/lib/include/libvfio/vfio_pci_device.h | 4 ++-- tools/testing/selftests/vfio/lib/iommu.c | 14 +++++++++----- .../testing/selftests/vfio/vfio_dma_mapping_test.c | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h index e35f13ed3f3c..5c9b9dc6d993 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h @@ -8,7 +8,6 @@ #include typedef u64 iova_t; -#define INVALID_IOVA UINT64_MAX struct iommu_mode { const char *name; @@ -57,7 +56,7 @@ static inline void iommu_unmap_all(struct iommu *iommu) VFIO_ASSERT_EQ(__iommu_unmap_all(iommu, NULL), 0); } -iova_t __iommu_hva2iova(struct iommu *iommu, void *vaddr); +int __iommu_hva2iova(struct iommu *iommu, void *vaddr, iova_t *iova); iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h index 160e003131d6..2858885a89bb 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h @@ -103,9 +103,9 @@ static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX); } -static inline iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) +static inline int __to_iova(struct vfio_pci_device *device, void *vaddr, iova_t *iova) { - return __iommu_hva2iova(device->iommu, vaddr); + return __iommu_hva2iova(device->iommu, vaddr, iova); } static inline iova_t to_iova(struct vfio_pci_device *device, void *vaddr) diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c index 52f9cdf5f171..8079d43523f3 100644 --- a/tools/testing/selftests/vfio/lib/iommu.c +++ b/tools/testing/selftests/vfio/lib/iommu.c @@ -67,7 +67,7 @@ static const struct iommu_mode *lookup_iommu_mode(const char *iommu_mode) VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); } -iova_t __iommu_hva2iova(struct iommu *iommu, void *vaddr) +int __iommu_hva2iova(struct iommu *iommu, void *vaddr, iova_t *iova) { struct dma_region *region; @@ -78,18 +78,22 @@ iova_t __iommu_hva2iova(struct iommu *iommu, void *vaddr) if (vaddr >= region->vaddr + region->size) continue; - return region->iova + (vaddr - region->vaddr); + if (iova) + *iova = region->iova + (vaddr - region->vaddr); + + return 0; } - return INVALID_IOVA; + return -ENOENT; } iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr) { iova_t iova; + int ret; - iova = __iommu_hva2iova(iommu, vaddr); - VFIO_ASSERT_NE(iova, INVALID_IOVA, "%p is not mapped into IOMMU\n", vaddr); + ret = __iommu_hva2iova(iommu, vaddr, &iova); + VFIO_ASSERT_EQ(ret, 0, "%p is not mapped into the iommu\n", vaddr); return iova; } diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 213fcd8dcc79..5397822c3dd4 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -199,7 +199,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region.size); printf("Unmapped IOVA 0x%lx\n", region.iova); - ASSERT_EQ(INVALID_IOVA, __to_iova(self->device, region.vaddr)); + ASSERT_NE(0, __to_iova(self->device, region.vaddr, NULL)); ASSERT_NE(0, iommu_mapping_get(device_bdf, region.iova, &mapping)); ASSERT_TRUE(!munmap(region.vaddr, size)); From d721f52e31553a848e0e9947ca15a49c5674aef3 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 Nov 2025 23:17:33 +0000 Subject: [PATCH 65/65] vfio: selftests: Add vfio_pci_device_init_perf_test Add a new VFIO selftest for measuring the time it takes to run vfio_pci_device_init() in parallel for one or more devices. This test serves as manual regression test for the performance improvement of commit e908f58b6beb ("vfio/pci: Separate SR-IOV VF dev_set"). For example, when running this test with 64 VFs under the same PF: Before: $ ./vfio_pci_device_init_perf_test -r vfio_pci_device_init_perf_test.iommufd.init 0000:1a:00.0 0000:1a:00.1 ... ... Wall time: 6.653234463s Min init time (per device): 0.101215344s Max init time (per device): 6.652755941s Avg init time (per device): 3.377609608s After: $ ./vfio_pci_device_init_perf_test -r vfio_pci_device_init_perf_test.iommufd.init 0000:1a:00.0 0000:1a:00.1 ... ... Wall time: 0.122978332s Min init time (per device): 0.108121915s Max init time (per device): 0.122762761s Avg init time (per device): 0.113816748s This test does not make any assertions about performance, since any such assertion is likely to be flaky due to system differences and random noise. However this test can be fed into automation to detect regressions, and can be used by developers in the future to measure performance optimizations. Suggested-by: Aaron Lewis Reviewed-by: Alex Mastro Tested-by: Alex Mastro Reviewed-by: Raghavendra Rao Ananta Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20251126231733.3302983-19-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 3 + .../vfio/vfio_pci_device_init_perf_test.c | 168 ++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index a29f99395206..3c796ca99a50 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -2,6 +2,7 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test +TEST_GEN_PROGS += vfio_pci_device_init_perf_test TEST_GEN_PROGS += vfio_pci_driver_test TEST_FILES += scripts/cleanup.sh @@ -16,6 +17,8 @@ CFLAGS += -I$(top_srcdir)/tools/include CFLAGS += -MD CFLAGS += $(EXTRA_CFLAGS) +LDFLAGS += -pthread + $(TEST_GEN_PROGS): %: %.o $(LIBVFIO_O) $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LIBVFIO_O) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c b/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c new file mode 100644 index 000000000000..33b0c31fe2ed --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +#include +#include +#include + +#include + +#include "../kselftest_harness.h" + +static char **device_bdfs; +static int nr_devices; + +struct thread_args { + struct iommu *iommu; + int device_index; + struct timespec start; + struct timespec end; + pthread_barrier_t *barrier; +}; + +FIXTURE(vfio_pci_device_init_perf_test) { + pthread_t *threads; + pthread_barrier_t barrier; + struct thread_args *thread_args; + struct iommu *iommu; +}; + +FIXTURE_VARIANT(vfio_pci_device_init_perf_test) { + const char *iommu_mode; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_pci_device_init_perf_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); + +FIXTURE_SETUP(vfio_pci_device_init_perf_test) +{ + int i; + + self->iommu = iommu_init(variant->iommu_mode); + self->threads = calloc(nr_devices, sizeof(self->threads[0])); + self->thread_args = calloc(nr_devices, sizeof(self->thread_args[0])); + + pthread_barrier_init(&self->barrier, NULL, nr_devices); + + for (i = 0; i < nr_devices; i++) { + self->thread_args[i].iommu = self->iommu; + self->thread_args[i].barrier = &self->barrier; + self->thread_args[i].device_index = i; + } +} + +FIXTURE_TEARDOWN(vfio_pci_device_init_perf_test) +{ + iommu_cleanup(self->iommu); + free(self->threads); + free(self->thread_args); +} + +static s64 to_ns(struct timespec ts) +{ + return (s64)ts.tv_nsec + NSEC_PER_SEC * (s64)ts.tv_sec; +} + +static struct timespec to_timespec(s64 ns) +{ + struct timespec ts = { + .tv_nsec = ns % NSEC_PER_SEC, + .tv_sec = ns / NSEC_PER_SEC, + }; + + return ts; +} + +static struct timespec timespec_sub(struct timespec a, struct timespec b) +{ + return to_timespec(to_ns(a) - to_ns(b)); +} + +static struct timespec timespec_min(struct timespec a, struct timespec b) +{ + return to_ns(a) < to_ns(b) ? a : b; +} + +static struct timespec timespec_max(struct timespec a, struct timespec b) +{ + return to_ns(a) > to_ns(b) ? a : b; +} + +static void *thread_main(void *__args) +{ + struct thread_args *args = __args; + struct vfio_pci_device *device; + + pthread_barrier_wait(args->barrier); + + clock_gettime(CLOCK_MONOTONIC, &args->start); + device = vfio_pci_device_init(device_bdfs[args->device_index], args->iommu); + clock_gettime(CLOCK_MONOTONIC, &args->end); + + pthread_barrier_wait(args->barrier); + + vfio_pci_device_cleanup(device); + return NULL; +} + +TEST_F(vfio_pci_device_init_perf_test, init) +{ + struct timespec start = to_timespec(INT64_MAX), end = {}; + struct timespec min = to_timespec(INT64_MAX); + struct timespec max = {}; + struct timespec avg = {}; + struct timespec wall_time; + s64 thread_ns = 0; + int i; + + for (i = 0; i < nr_devices; i++) { + pthread_create(&self->threads[i], NULL, thread_main, + &self->thread_args[i]); + } + + for (i = 0; i < nr_devices; i++) { + struct thread_args *args = &self->thread_args[i]; + struct timespec init_time; + + pthread_join(self->threads[i], NULL); + + start = timespec_min(start, args->start); + end = timespec_max(end, args->end); + + init_time = timespec_sub(args->end, args->start); + min = timespec_min(min, init_time); + max = timespec_max(max, init_time); + thread_ns += to_ns(init_time); + } + + avg = to_timespec(thread_ns / nr_devices); + wall_time = timespec_sub(end, start); + + printf("Wall time: %lu.%09lus\n", + wall_time.tv_sec, wall_time.tv_nsec); + printf("Min init time (per device): %lu.%09lus\n", + min.tv_sec, min.tv_nsec); + printf("Max init time (per device): %lu.%09lus\n", + max.tv_sec, max.tv_nsec); + printf("Avg init time (per device): %lu.%09lus\n", + avg.tv_sec, avg.tv_nsec); +} + +int main(int argc, char *argv[]) +{ + int i; + + device_bdfs = vfio_selftests_get_bdfs(&argc, argv, &nr_devices); + + printf("Testing parallel initialization of %d devices:\n", nr_devices); + for (i = 0; i < nr_devices; i++) + printf(" %s\n", device_bdfs[i]); + + return test_harness_run(argc, argv); +}