diff --git a/MAINTAINERS b/MAINTAINERS index 5a4aac3206fa..551dc764a50d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27022,6 +27022,13 @@ L: virtualization@lists.linux.dev S: Maintained F: drivers/vfio/pci/virtio +VFIO XE PCI DRIVER +M: Michał Winiarski +L: kvm@vger.kernel.org +L: intel-xe@lists.freedesktop.org +S: Supported +F: drivers/vfio/pci/xe + VGA_SWITCHEROO R: Lukas Wunner S: Maintained diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index e4b273b025d2..a1f86a59d687 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -184,6 +184,10 @@ xe-$(CONFIG_PCI_IOV) += \ xe_sriov_pf_sysfs.o \ xe_tile_sriov_pf_debugfs.o +ifeq ($(CONFIG_PCI_IOV),y) + xe-$(CONFIG_XE_VFIO_PCI) += xe_sriov_vfio.o +endif + # include helpers for tests even when XE is built-in ifdef CONFIG_DRM_XE_KUNIT_TEST xe-y += tests/xe_kunit_helpers.o diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c index d5d918ddce4f..3174a8dee779 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c @@ -17,6 +17,7 @@ #include "xe_gt_sriov_pf_helpers.h" #include "xe_gt_sriov_pf_migration.h" #include "xe_gt_sriov_printk.h" +#include "xe_guc.h" #include "xe_guc_buf.h" #include "xe_guc_ct.h" #include "xe_migrate.h" @@ -1023,6 +1024,12 @@ static void action_ring_cleanup(void *arg) ptr_ring_cleanup(r, destroy_pf_packet); } +static void pf_gt_migration_check_support(struct xe_gt *gt) +{ + if (GUC_FIRMWARE_VER(>->uc.guc) < MAKE_GUC_VER(70, 54, 0)) + xe_sriov_pf_migration_disable(gt_to_xe(gt), "requires GuC version >= 70.54.0"); +} + /** * xe_gt_sriov_pf_migration_init() - Initialize support for VF migration. * @gt: the &xe_gt @@ -1039,6 +1046,8 @@ int xe_gt_sriov_pf_migration_init(struct xe_gt *gt) xe_gt_assert(gt, IS_SRIOV_PF(xe)); + pf_gt_migration_check_support(gt); + if (!pf_migration_supported(gt)) return 0; diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index bbe6f8e65844..0e39a0255b99 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -1223,6 +1223,23 @@ static struct pci_driver xe_pci_driver = { #endif }; +/** + * xe_pci_to_pf_device() - Get PF &xe_device. + * @pdev: the VF &pci_dev device + * + * Return: pointer to PF &xe_device, NULL otherwise. + */ +struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev) +{ + struct drm_device *drm; + + drm = pci_iov_get_pf_drvdata(pdev, &xe_pci_driver); + if (IS_ERR(drm)) + return NULL; + + return to_xe_device(drm); +} + int xe_register_pci_driver(void) { return pci_register_driver(&xe_pci_driver); diff --git a/drivers/gpu/drm/xe/xe_pci.h b/drivers/gpu/drm/xe/xe_pci.h index 611c1209b14c..11bcc5fe2c5b 100644 --- a/drivers/gpu/drm/xe/xe_pci.h +++ b/drivers/gpu/drm/xe/xe_pci.h @@ -6,7 +6,10 @@ #ifndef _XE_PCI_H_ #define _XE_PCI_H_ +struct pci_dev; + int xe_register_pci_driver(void); void xe_unregister_pci_driver(void); +struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev); #endif diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index 44924512830f..766922530265 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -726,6 +726,13 @@ static void xe_pm_runtime_lockdep_prime(void) /** * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously * @xe: xe device instance + * + * When possible, scope-based runtime PM (through guard(xe_pm_runtime)) is + * be preferred over direct usage of this function. 
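For reference, the 70.54.0 cutoff in pf_gt_migration_check_support() above is compared as one packed integer. A minimal sketch of that comparison follows; the EXAMPLE_* names are illustrative only, and the major/minor/patch packing is assumed to mirror MAKE_GUC_VER() from xe_guc.h.

#include <linux/types.h>

/* Illustrative only: assumed to mirror the MAKE_GUC_VER() packing in xe_guc.h. */
#define EXAMPLE_GUC_VER(maj, min, pat)	(((maj) << 16) | ((min) << 8) | (pat))

static inline bool example_guc_has_vf_migration(u32 fw_ver)
{
	/* VF migration support requires GuC firmware 70.54.0 or newer. */
	return fw_ver >= EXAMPLE_GUC_VER(70, 54, 0);
}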
Manual get/put handling + * should only be used when the function contains goto-based logic which + * can break scope-based handling, or when the lifetime of the runtime PM + * reference does not match a specific scope (e.g., runtime PM obtained in one + * function and released in a different one). */ void xe_pm_runtime_get(struct xe_device *xe) { @@ -758,6 +765,13 @@ void xe_pm_runtime_put(struct xe_device *xe) * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl * @xe: xe device instance * + * When possible, scope-based runtime PM (through + * ACQUIRE(xe_pm_runtime_ioctl, ...)) is be preferred over direct usage of this + * function. Manual get/put handling should only be used when the function + * contains goto-based logic which can break scope-based handling, or when the + * lifetime of the runtime PM reference does not match a specific scope (e.g., + * runtime PM obtained in one function and released in a different one). + * * Returns: Any number greater than or equal to 0 for success, negative error * code otherwise. */ @@ -827,6 +841,13 @@ static bool xe_pm_suspending_or_resuming(struct xe_device *xe) * It will warn if not protected. * The reference should be put back after this function regardless, since it * will always bump the usage counter, regardless. + * + * When possible, scope-based runtime PM (through guard(xe_pm_runtime_noresume)) + * is be preferred over direct usage of this function. Manual get/put handling + * should only be used when the function contains goto-based logic which can + * break scope-based handling, or when the lifetime of the runtime PM reference + * does not match a specific scope (e.g., runtime PM obtained in one function + * and released in a different one). */ void xe_pm_runtime_get_noresume(struct xe_device *xe) { diff --git a/drivers/gpu/drm/xe/xe_pm.h b/drivers/gpu/drm/xe/xe_pm.h index f7f89a18b6fc..6b27039e7b2d 100644 --- a/drivers/gpu/drm/xe/xe_pm.h +++ b/drivers/gpu/drm/xe/xe_pm.h @@ -6,6 +6,7 @@ #ifndef _XE_PM_H_ #define _XE_PM_H_ +#include #include #define DEFAULT_VRAM_THRESHOLD 300 /* in MB */ @@ -37,4 +38,20 @@ int xe_pm_block_on_suspend(struct xe_device *xe); void xe_pm_might_block_on_suspend(void); int xe_pm_module_init(void); +static inline void __xe_pm_runtime_noop(struct xe_device *xe) {} + +DEFINE_GUARD(xe_pm_runtime, struct xe_device *, + xe_pm_runtime_get(_T), xe_pm_runtime_put(_T)) +DEFINE_GUARD(xe_pm_runtime_noresume, struct xe_device *, + xe_pm_runtime_get_noresume(_T), xe_pm_runtime_put(_T)) +DEFINE_GUARD_COND(xe_pm_runtime, _ioctl, xe_pm_runtime_get_ioctl(_T), _RET >= 0) + +/* + * Used when a function needs to release runtime PM in all possible cases + * and error paths, but the wakeref was already acquired by a different + * function (i.e., get() has already happened so only a put() is needed). 
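To illustrate how the scope-based helpers defined above are meant to be used, here is a hedged sketch; guard() and scoped_guard() come from <linux/cleanup.h>, and the example_* callers are hypothetical, not part of this patch.

#include <linux/cleanup.h>

#include "xe_pm.h"

/* Hypothetical caller: holds a resumed runtime PM reference for the whole scope. */
static void example_touch_hw(struct xe_device *xe)
{
	guard(xe_pm_runtime)(xe);

	/* ... access hardware; the reference is dropped on every return path ... */
}

/* Hypothetical caller: takes a reference without resuming, for a limited scope. */
static void example_read_sw_state(struct xe_device *xe)
{
	scoped_guard(xe_pm_runtime_noresume, xe) {
		/* ... read state that must not race with runtime suspend ... */
	}
}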
+ */ +DEFINE_GUARD(xe_pm_runtime_release_only, struct xe_device *, + __xe_pm_runtime_noop(_T), xe_pm_runtime_put(_T)); + #endif diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c index de06cc690fc8..6c4b16409cc9 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c @@ -46,13 +46,37 @@ bool xe_sriov_pf_migration_supported(struct xe_device *xe) { xe_assert(xe, IS_SRIOV_PF(xe)); - return xe->sriov.pf.migration.supported; + return IS_ENABLED(CONFIG_DRM_XE_DEBUG) || !xe->sriov.pf.migration.disabled; } -static bool pf_check_migration_support(struct xe_device *xe) +/** + * xe_sriov_pf_migration_disable() - Turn off SR-IOV VF migration support on PF. + * @xe: the &xe_device instance. + * @fmt: format string for the log message, to be combined with following VAs. + */ +void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...) { - /* XXX: for now this is for feature enabling only */ - return IS_ENABLED(CONFIG_DRM_XE_DEBUG); + struct va_format vaf; + va_list va_args; + + xe_assert(xe, IS_SRIOV_PF(xe)); + + va_start(va_args, fmt); + vaf.fmt = fmt; + vaf.va = &va_args; + xe_sriov_notice(xe, "migration %s: %pV\n", + IS_ENABLED(CONFIG_DRM_XE_DEBUG) ? + "missing prerequisite" : "disabled", + &vaf); + va_end(va_args); + + xe->sriov.pf.migration.disabled = true; +} + +static void pf_migration_check_support(struct xe_device *xe) +{ + if (!xe_device_has_memirq(xe)) + xe_sriov_pf_migration_disable(xe, "requires memory-based IRQ support"); } static void pf_migration_cleanup(void *arg) @@ -77,7 +101,8 @@ int xe_sriov_pf_migration_init(struct xe_device *xe) xe_assert(xe, IS_SRIOV_PF(xe)); - xe->sriov.pf.migration.supported = pf_check_migration_support(xe); + pf_migration_check_support(xe); + if (!xe_sriov_pf_migration_supported(xe)) return 0; diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.h b/drivers/gpu/drm/xe/xe_sriov_pf_migration.h index b806298a0bb6..f8f408df8481 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.h +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.h @@ -14,6 +14,7 @@ struct xe_sriov_packet; int xe_sriov_pf_migration_init(struct xe_device *xe); bool xe_sriov_pf_migration_supported(struct xe_device *xe); +void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...); int xe_sriov_pf_migration_restore_produce(struct xe_device *xe, unsigned int vfid, struct xe_sriov_packet *data); struct xe_sriov_packet * diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h b/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h index 363d673ee1dd..7d9a8a278d91 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h @@ -14,8 +14,8 @@ * struct xe_sriov_pf_migration - Xe device level VF migration data */ struct xe_sriov_pf_migration { - /** @supported: indicates whether VF migration feature is supported */ - bool supported; + /** @disabled: indicates whether VF migration feature is disabled */ + bool disabled; }; /** diff --git a/drivers/gpu/drm/xe/xe_sriov_vfio.c b/drivers/gpu/drm/xe/xe_sriov_vfio.c new file mode 100644 index 000000000000..e9a7615bb5c5 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sriov_vfio.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2025 Intel Corporation + */ + +#include +#include + +#include "xe_pci.h" +#include "xe_pm.h" +#include "xe_sriov_pf_control.h" +#include "xe_sriov_pf_helpers.h" +#include "xe_sriov_pf_migration.h" + +struct xe_device 
*xe_sriov_vfio_get_pf(struct pci_dev *pdev) +{ + return xe_pci_to_pf_device(pdev); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_get_pf, "xe-vfio-pci"); + +bool xe_sriov_vfio_migration_supported(struct xe_device *xe) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + + return xe_sriov_pf_migration_supported(xe); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_migration_supported, "xe-vfio-pci"); + +#define DEFINE_XE_SRIOV_VFIO_FUNCTION(_type, _func, _impl) \ +_type xe_sriov_vfio_##_func(struct xe_device *xe, unsigned int vfid) \ +{ \ + if (!IS_SRIOV_PF(xe)) \ + return -EPERM; \ + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) \ + return -EINVAL; \ + \ + guard(xe_pm_runtime_noresume)(xe); \ + \ + return xe_sriov_pf_##_impl(xe, vfid); \ +} \ +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_##_func, "xe-vfio-pci") + +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, wait_flr_done, control_wait_flr); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, suspend_device, control_pause_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_device, control_resume_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_enter, control_trigger_save_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_exit, control_finish_save_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_enter, control_trigger_restore_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_exit, control_finish_restore_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, error, control_stop_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(ssize_t, stop_copy_size, migration_size); + +ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid, + char __user *buf, size_t len) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) + return -EINVAL; + + guard(xe_pm_runtime_noresume)(xe); + + return xe_sriov_pf_migration_read(xe, vfid, buf, len); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_read, "xe-vfio-pci"); + +ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid, + const char __user *buf, size_t len) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) + return -EINVAL; + + guard(xe_pm_runtime_noresume)(xe); + + return xe_sriov_pf_migration_write(xe, vfid, buf, len); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_write, "xe-vfio-pci"); diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f54665..c100f0ab87f2 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig" source "drivers/vfio/pci/qat/Kconfig" +source "drivers/vfio/pci/xe/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c..f5d46aa9347b 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -19,3 +19,5 @@ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/ obj-$(CONFIG_QAT_VFIO_PCI) += qat/ + +obj-$(CONFIG_XE_VFIO_PCI) += xe/ diff --git a/drivers/vfio/pci/xe/Kconfig b/drivers/vfio/pci/xe/Kconfig new file mode 100644 index 000000000000..cc9b6dac6ed3 --- /dev/null +++ b/drivers/vfio/pci/xe/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config XE_VFIO_PCI + tristate "VFIO support for Intel Graphics" + depends on DRM_XE && PCI_IOV + select VFIO_PCI_CORE + help + This option enables device specific VFIO driver variant for Intel Graphics. 
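For readability, the first wrapper generated by DEFINE_XE_SRIOV_VFIO_FUNCTION() in xe_sriov_vfio.c above expands roughly as follows (a sketch of the expansion, not literal preprocessor output):

int xe_sriov_vfio_wait_flr_done(struct xe_device *xe, unsigned int vfid)
{
	if (!IS_SRIOV_PF(xe))
		return -EPERM;
	if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))
		return -EINVAL;

	guard(xe_pm_runtime_noresume)(xe);

	return xe_sriov_pf_control_wait_flr(xe, vfid);
}
EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_wait_flr_done, "xe-vfio-pci");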
+ In addition to generic VFIO PCI functionality, it implements VFIO + migration uAPI allowing userspace to enable migration for + Intel Graphics SR-IOV Virtual Functions supported by the Xe driver. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/xe/Makefile b/drivers/vfio/pci/xe/Makefile new file mode 100644 index 000000000000..13aa0fd192cd --- /dev/null +++ b/drivers/vfio/pci/xe/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_XE_VFIO_PCI) += xe-vfio-pci.o +xe-vfio-pci-y := main.o diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c new file mode 100644 index 000000000000..0156b53c678b --- /dev/null +++ b/drivers/vfio/pci/xe/main.c @@ -0,0 +1,573 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2025 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct xe_vfio_pci_migration_file { + struct file *filp; + /* serializes accesses to migration data */ + struct mutex lock; + struct xe_vfio_pci_core_device *xe_vdev; + u8 disabled:1; +}; + +struct xe_vfio_pci_core_device { + struct vfio_pci_core_device core_device; + struct xe_device *xe; + /* PF internal control uses vfid index starting from 1 */ + unsigned int vfid; + u8 deferred_reset:1; + /* protects migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + /* protects the reset_done flow */ + spinlock_t reset_lock; + struct xe_vfio_pci_migration_file *migf; +}; + +#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev) + +static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + mutex_unlock(&migf->lock); +} + +static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev) +{ + xe_vfio_pci_disable_file(xe_vdev->migf); + fput(xe_vdev->migf->filp); + xe_vdev->migf = NULL; +} + +static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev) +{ + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; +} + +static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev) +{ + mutex_lock(&xe_vdev->state_mutex); +} + +/* + * This function is called in all state_mutex unlock cases to + * handle a 'deferred_reset' if exists. + */ +static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev) +{ +again: + spin_lock(&xe_vdev->reset_lock); + if (xe_vdev->deferred_reset) { + xe_vdev->deferred_reset = false; + spin_unlock(&xe_vdev->reset_lock); + xe_vfio_pci_reset(xe_vdev); + goto again; + } + mutex_unlock(&xe_vdev->state_mutex); + spin_unlock(&xe_vdev->reset_lock); +} + +static void xe_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); + int ret; + + if (!pdev->is_virtfn) + return; + + /* + * VF FLR requires additional processing done by PF driver. + * The processing is done after FLR is already finished from PCIe + * perspective. + * In order to avoid a scenario where VF is used while PF processing + * is still in progress, additional synchronization point is needed. 
+ */ + ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid); + if (ret) + dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret); + + if (!xe_vdev->vfid) + return; + + /* + * As the higher VFIO layers are holding locks across reset and using + * those same locks with the mm_lock we need to prevent ABBA deadlock + * with the state_mutex and mm_lock. + * In case the state_mutex was taken already we defer the cleanup work + * to the unlock flow of the other running context. + */ + spin_lock(&xe_vdev->reset_lock); + xe_vdev->deferred_reset = true; + if (!mutex_trylock(&xe_vdev->state_mutex)) { + spin_unlock(&xe_vdev->reset_lock); + return; + } + spin_unlock(&xe_vdev->reset_lock); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + xe_vfio_pci_reset(xe_vdev); +} + +static const struct pci_error_handlers xe_vfio_pci_err_handlers = { + .reset_done = xe_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static int xe_vfio_pci_open_device(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &xe_vdev->core_device; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + +static void xe_vfio_pci_close_device(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + xe_vfio_pci_reset(xe_vdev); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + vfio_pci_core_close_device(core_vdev); +} + +static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + ssize_t ret; + + if (pos) + return -ESPIPE; + + mutex_lock(&migf->lock); + if (migf->disabled) { + mutex_unlock(&migf->lock); + return -ENODEV; + } + + ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); + mutex_unlock(&migf->lock); + + return ret; +} + +static const struct file_operations xe_vfio_pci_save_fops = { + .owner = THIS_MODULE, + .read = xe_vfio_pci_save_read, + .release = xe_vfio_pci_release_file, + .llseek = noop_llseek, +}; + +static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + ssize_t ret; + + if (pos) + return -ESPIPE; + + mutex_lock(&migf->lock); + if (migf->disabled) { + mutex_unlock(&migf->lock); + return -ENODEV; + } + + ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); + mutex_unlock(&migf->lock); + + return ret; +} + +static const struct file_operations xe_vfio_pci_resume_fops = { + .owner = THIS_MODULE, + .write = xe_vfio_pci_resume_write, + .release = xe_vfio_pci_release_file, + .llseek = noop_llseek, +}; + +static const char *vfio_dev_state_str(u32 state) +{ + switch (state) { + case VFIO_DEVICE_STATE_RUNNING: return "running"; + case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p"; + case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy"; + case VFIO_DEVICE_STATE_STOP: return "stop"; + case 
VFIO_DEVICE_STATE_RESUMING: return "resuming"; + case VFIO_DEVICE_STATE_ERROR: return "error"; + default: return ""; + } +} + +enum xe_vfio_pci_file_type { + XE_VFIO_FILE_SAVE = 0, + XE_VFIO_FILE_RESUME, +}; + +static struct xe_vfio_pci_migration_file * +xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev, + enum xe_vfio_pci_file_type type) +{ + struct xe_vfio_pci_migration_file *migf; + const struct file_operations *fops; + int flags; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops; + flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY; + migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags); + if (IS_ERR(migf->filp)) { + kfree(migf); + return ERR_CAST(migf->filp); + } + + mutex_init(&migf->lock); + migf->xe_vdev = xe_vdev; + xe_vdev->migf = migf; + + stream_open(migf->filp->f_inode, migf->filp); + + return migf; +} + +static struct file * +xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new) +{ + u32 cur = xe_vdev->mig_state; + int ret; + + dev_dbg(xe_vdev_to_dev(xe_vdev), + "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new)); + + /* + * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't + * have the capability to selectively block outgoing p2p DMA transfers. + * While the device is allowing BAR accesses when the VF is stopped, it + * is not processing any new workload requests, effectively stopping + * any outgoing DMA transfers (not just p2p). + * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and + * will be migrated to target VF during stop-copy. + */ + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct xe_vfio_pci_migration_file *migf; + + migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE); + if (IS_ERR(migf)) { + ret = PTR_ERR(migf); + goto err; + } + get_file(migf->filp); + + ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid); + if (ret) { + fput(migf->filp); + goto err; + } + + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { + struct xe_vfio_pci_migration_file *migf; + + migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME); + if (IS_ERR(migf)) { + ret = PTR_ERR(migf); + goto err; + } + get_file(migf->filp); + + ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid); + if (ret) { + fput(migf->filp); + goto err; + } + + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid); + if 
(ret) + goto err; + + return NULL; + } + + WARN(true, "Unknown state transition %d->%d", cur, new); + return ERR_PTR(-EINVAL); + +err: + dev_dbg(xe_vdev_to_dev(xe_vdev), + "Failed to transition state: %s->%s err=%d\n", + vfio_dev_state_str(cur), vfio_dev_state_str(new), ret); + return ERR_PTR(ret); +} + +static struct file * +xe_vfio_pci_set_device_state(struct vfio_device *core_vdev, + enum vfio_device_mig_state new_state) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + enum vfio_device_mig_state next_state; + struct file *f = NULL; + int ret; + + xe_vfio_pci_state_mutex_lock(xe_vdev); + while (new_state != xe_vdev->mig_state) { + ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state, + new_state, &next_state); + if (ret) { + xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid); + f = ERR_PTR(ret); + break; + } + f = xe_vfio_set_state(xe_vdev, next_state); + if (IS_ERR(f)) + break; + + xe_vdev->mig_state = next_state; + + /* Multiple state transitions with non-NULL file in the middle */ + if (f && new_state != xe_vdev->mig_state) { + fput(f); + f = ERR_PTR(-EINVAL); + break; + } + } + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return f; +} + +static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev, + enum vfio_device_mig_state *curr_state) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + *curr_state = xe_vdev->mig_state; + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return 0; +} + +static int xe_vfio_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return 0; +} + +static const struct vfio_migration_ops xe_vfio_pci_migration_ops = { + .migration_set_state = xe_vfio_pci_set_device_state, + .migration_get_state = xe_vfio_pci_get_device_state, + .migration_get_data_size = xe_vfio_pci_get_data_size, +}; + +static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev) +{ + struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; + struct pci_dev *pdev = to_pci_dev(core_vdev->dev); + struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); + + if (!xe) + return; + if (!xe_sriov_vfio_migration_supported(xe)) + return; + + mutex_init(&xe_vdev->state_mutex); + spin_lock_init(&xe_vdev->reset_lock); + + /* PF internal control uses vfid index starting from 1 */ + xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; + xe_vdev->xe = xe; + + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; + core_vdev->mig_ops = &xe_vfio_pci_migration_ops; +} + +static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev) +{ + if (!xe_vdev->vfid) + return; + + mutex_destroy(&xe_vdev->state_mutex); +} + +static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_migration_init(xe_vdev); + + return vfio_pci_core_init_dev(core_vdev); +} + +static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, 
core_device.vdev); + + xe_vfio_pci_migration_fini(xe_vdev); +} + +static const struct vfio_device_ops xe_vfio_pci_ops = { + .name = "xe-vfio-pci", + .init = xe_vfio_pci_init_dev, + .release = xe_vfio_pci_release_dev, + .open_device = xe_vfio_pci_open_device, + .close_device = xe_vfio_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct xe_vfio_pci_core_device *xe_vdev; + int ret; + + xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev, + &xe_vfio_pci_ops); + if (IS_ERR(xe_vdev)) + return PTR_ERR(xe_vdev); + + dev_set_drvdata(&pdev->dev, &xe_vdev->core_device); + + ret = vfio_pci_core_register_device(&xe_vdev->core_device); + if (ret) { + vfio_put_device(&xe_vdev->core_device.vdev); + return ret; + } + + return 0; +} + +static void xe_vfio_pci_remove(struct pci_dev *pdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); + + vfio_pci_core_unregister_device(&xe_vdev->core_device); + vfio_put_device(&xe_vdev->core_device.vdev); +} + +#define INTEL_PCI_VFIO_DEVICE(_id) { \ + PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \ +} + +static const struct pci_device_id xe_vfio_pci_table[] = { + INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE), + INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE), + INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE), + {} +}; +MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table); + +static struct pci_driver xe_vfio_pci_driver = { + .name = "xe-vfio-pci", + .id_table = xe_vfio_pci_table, + .probe = xe_vfio_pci_probe, + .remove = xe_vfio_pci_remove, + .err_handler = &xe_vfio_pci_err_handlers, + .driver_managed_dma = true, +}; +module_pci_driver(xe_vfio_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michał Winiarski "); +MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics"); diff --git a/include/drm/intel/xe_sriov_vfio.h b/include/drm/intel/xe_sriov_vfio.h new file mode 100644 index 000000000000..e9814e8149fd --- /dev/null +++ b/include/drm/intel/xe_sriov_vfio.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef _XE_SRIOV_VFIO_H_ +#define _XE_SRIOV_VFIO_H_ + +#include + +struct pci_dev; +struct xe_device; + +/** + * xe_sriov_vfio_get_pf() - Get PF &xe_device. + * @pdev: the VF &pci_dev device + * + * Return: pointer to PF &xe_device, NULL otherwise. + */ +struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev); + +/** + * xe_sriov_vfio_migration_supported() - Check if migration is supported. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * + * Return: true if migration is supported, false otherwise. + */ +bool xe_sriov_vfio_migration_supported(struct xe_device *xe); + +/** + * xe_sriov_vfio_wait_flr_done() - Wait for VF FLR completion. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will wait until VF FLR is processed by PF on all tiles (or + * until timeout occurs). 
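As a userspace-facing illustration of the FLR handling above: when a VMM resets the VF through VFIO, the PCI core calls xe_vfio_pci_reset_done() once the FLR completes, which in turn waits for the PF via xe_sriov_vfio_wait_flr_done(). A minimal sketch of the VMM side follows; example_reset_vf() is hypothetical and error handling is omitted.

#include <sys/ioctl.h>

#include <linux/vfio.h>

/* Hypothetical VMM-side helper: device_fd is an open VFIO device fd for the
 * VF bound to xe-vfio-pci. */
static int example_reset_vf(int device_fd)
{
	/* Triggers the VF reset; the variant driver then blocks reuse of the
	 * VF until the PF has finished its part of the FLR processing. */
	return ioctl(device_fd, VFIO_DEVICE_RESET);
}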
+ * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_wait_flr_done(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_suspend_device() - Suspend VF. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will pause VF on all tiles/GTs. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_suspend_device(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_device() - Resume VF. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will resume VF on all tiles. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_resume_device(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_stop_copy_enter() - Initiate a VF device migration data save. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_stop_copy_enter(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_stop_copy_exit() - Finish a VF device migration data save. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_stop_copy_exit(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_data_enter() - Initiate a VF device migration data restore. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_resume_data_enter(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_data_exit() - Finish a VF device migration data restore. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_resume_data_exit(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_error() - Move VF device to error state. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Reset is needed to move it out of error state. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_error(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_data_read() - Read migration data from the VF device. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * @buf: start address of userspace buffer + * @len: requested read size from userspace + * + * Return: number of bytes that has been successfully read, + * 0 if no more migration data is available, -errno on failure. + */ +ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid, + char __user *buf, size_t len); +/** + * xe_sriov_vfio_data_write() - Write migration data to the VF device. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * @buf: start address of userspace buffer + * @len: requested write size from userspace + * + * Return: number of bytes that has been successfully written, -errno on failure. 
+ */
+ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid,
+			const char __user *buf, size_t len);
+/**
+ * xe_sriov_vfio_stop_copy_size() - Get a size estimate of VF device migration data.
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
+ * @vfid: the VF identifier (can't be 0)
+ *
+ * Return: migration data size in bytes or a negative error code on failure.
+ */
+ssize_t xe_sriov_vfio_stop_copy_size(struct xe_device *xe, unsigned int vfid);
+
+#endif
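To show how the save side of this API reaches userspace through the VFIO migration uAPI, here is a hedged, VMM-like sketch: the device is moved to STOP and then STOP_COPY, and the returned data_fd (backed by xe_vfio_pci_save_read() and ultimately xe_sriov_vfio_data_read()) is drained until read() returns 0. All example_* names are hypothetical and error handling is trimmed.

#include <sys/ioctl.h>
#include <unistd.h>

#include <linux/vfio.h>

/* Request a migration state transition; *data_fd is -1 unless the arc
 * returns a data file (e.g. STOP -> STOP_COPY). */
static int example_set_mig_state(int device_fd, __u32 device_state, int *data_fd)
{
	__u64 buf[(sizeof(struct vfio_device_feature) +
		   sizeof(struct vfio_device_feature_mig_state) + 7) / 8] = {};
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_state *mig = (void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = device_state;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	if (data_fd)
		*data_fd = mig->data_fd;

	return 0;
}

/* Stop the VF and copy its migration stream into out_fd. */
static int example_save_vf(int device_fd, int out_fd)
{
	char chunk[4096];
	int data_fd = -1;
	ssize_t n;

	if (example_set_mig_state(device_fd, VFIO_DEVICE_STATE_STOP, NULL))
		return -1;
	if (example_set_mig_state(device_fd, VFIO_DEVICE_STATE_STOP_COPY, &data_fd) ||
	    data_fd < 0)
		return -1;

	/* read() returns 0 once the device reports no more migration data. */
	while ((n = read(data_fd, chunk, sizeof(chunk))) > 0)
		if (write(out_fd, chunk, n) != n)
			break;

	close(data_fd);
	return n == 0 ? 0 : -1;
}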