Merge tag 'topic/xe-vfio-2025-12-01' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-next

Cross-subsystem Changes:
- Add device-specific vfio_pci driver variant for Intel graphics (Michal Winiarski)

Driver Changes:
- Add scope-based cleanup helper for runtime PM (Matt Roper)
- Additional xe driver prerequisites and exports (Michal Winiarski)

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Thomas Hellstrom <thomas.hellstrom@linux.intel.com>
Link: https://patch.msgid.link/aS1bNpqeem6PIHrA@fedora
commit 55a271a0f7 (Dave Airlie, 2025-12-05 10:16:14 +10:00)
17 changed files with 926 additions and 7 deletions

MAINTAINERS

@@ -27022,6 +27022,13 @@ L:	virtualization@lists.linux.dev
 S:	Maintained
 F:	drivers/vfio/pci/virtio
 
+VFIO XE PCI DRIVER
+M:	Michał Winiarski <michal.winiarski@intel.com>
+L:	kvm@vger.kernel.org
+L:	intel-xe@lists.freedesktop.org
+S:	Supported
+F:	drivers/vfio/pci/xe
+
 VGA_SWITCHEROO
 R:	Lukas Wunner <lukas@wunner.de>
 S:	Maintained

drivers/gpu/drm/xe/Makefile

@@ -184,6 +184,10 @@ xe-$(CONFIG_PCI_IOV) += \
 	xe_sriov_pf_sysfs.o \
 	xe_tile_sriov_pf_debugfs.o
 
+ifeq ($(CONFIG_PCI_IOV),y)
+xe-$(CONFIG_XE_VFIO_PCI) += xe_sriov_vfio.o
+endif
+
 # include helpers for tests even when XE is built-in
 ifdef CONFIG_DRM_XE_KUNIT_TEST
 xe-y += tests/xe_kunit_helpers.o

drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c

@@ -17,6 +17,7 @@
 #include "xe_gt_sriov_pf_helpers.h"
 #include "xe_gt_sriov_pf_migration.h"
 #include "xe_gt_sriov_printk.h"
+#include "xe_guc.h"
 #include "xe_guc_buf.h"
 #include "xe_guc_ct.h"
 #include "xe_migrate.h"
@@ -1023,6 +1024,12 @@ static void action_ring_cleanup(void *arg)
 	ptr_ring_cleanup(r, destroy_pf_packet);
 }
 
+static void pf_gt_migration_check_support(struct xe_gt *gt)
+{
+	if (GUC_FIRMWARE_VER(&gt->uc.guc) < MAKE_GUC_VER(70, 54, 0))
+		xe_sriov_pf_migration_disable(gt_to_xe(gt), "requires GuC version >= 70.54.0");
+}
+
 /**
  * xe_gt_sriov_pf_migration_init() - Initialize support for VF migration.
  * @gt: the &xe_gt
@@ -1039,6 +1046,8 @@ int xe_gt_sriov_pf_migration_init(struct xe_gt *gt)
 	xe_gt_assert(gt, IS_SRIOV_PF(xe));
 
+	pf_gt_migration_check_support(gt);
+
 	if (!pf_migration_supported(gt))
 		return 0;

drivers/gpu/drm/xe/xe_pci.c

@@ -1223,6 +1223,23 @@ static struct pci_driver xe_pci_driver = {
 #endif
 };
 
+/**
+ * xe_pci_to_pf_device() - Get PF &xe_device.
+ * @pdev: the VF &pci_dev device
+ *
+ * Return: pointer to PF &xe_device, NULL otherwise.
+ */
+struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev)
+{
+	struct drm_device *drm;
+
+	drm = pci_iov_get_pf_drvdata(pdev, &xe_pci_driver);
+	if (IS_ERR(drm))
+		return NULL;
+
+	return to_xe_device(drm);
+}
+
 int xe_register_pci_driver(void)
 {
 	return pci_register_driver(&xe_pci_driver);

drivers/gpu/drm/xe/xe_pci.h

@@ -6,7 +6,10 @@
 #ifndef _XE_PCI_H_
 #define _XE_PCI_H_
 
+struct pci_dev;
+
 int xe_register_pci_driver(void);
 void xe_unregister_pci_driver(void);
+struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev);
 
 #endif

drivers/gpu/drm/xe/xe_pm.c

@@ -726,6 +726,13 @@ static void xe_pm_runtime_lockdep_prime(void)
 /**
  * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously
  * @xe: xe device instance
+ *
+ * When possible, scope-based runtime PM (through guard(xe_pm_runtime)) is
+ * preferred over direct usage of this function. Manual get/put handling
+ * should only be used when the function contains goto-based logic which
+ * can break scope-based handling, or when the lifetime of the runtime PM
+ * reference does not match a specific scope (e.g., runtime PM obtained in one
+ * function and released in a different one).
  */
 void xe_pm_runtime_get(struct xe_device *xe)
 {
@@ -758,6 +765,13 @@ void xe_pm_runtime_put(struct xe_device *xe)
  * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl
  * @xe: xe device instance
  *
+ * When possible, scope-based runtime PM (through
+ * ACQUIRE(xe_pm_runtime_ioctl, ...)) is preferred over direct usage of this
+ * function. Manual get/put handling should only be used when the function
+ * contains goto-based logic which can break scope-based handling, or when the
+ * lifetime of the runtime PM reference does not match a specific scope (e.g.,
+ * runtime PM obtained in one function and released in a different one).
+ *
  * Returns: Any number greater than or equal to 0 for success, negative error
  * code otherwise.
  */
@@ -827,6 +841,13 @@ static bool xe_pm_suspending_or_resuming(struct xe_device *xe)
  * It will warn if not protected.
  * The reference should be put back after this function regardless, since it
  * will always bump the usage counter, regardless.
+ *
+ * When possible, scope-based runtime PM (through guard(xe_pm_runtime_noresume))
+ * is preferred over direct usage of this function. Manual get/put handling
+ * should only be used when the function contains goto-based logic which can
+ * break scope-based handling, or when the lifetime of the runtime PM reference
+ * does not match a specific scope (e.g., runtime PM obtained in one function
+ * and released in a different one).
  */
 void xe_pm_runtime_get_noresume(struct xe_device *xe)
 {

drivers/gpu/drm/xe/xe_pm.h

@@ -6,6 +6,7 @@
 #ifndef _XE_PM_H_
 #define _XE_PM_H_
 
+#include <linux/cleanup.h>
 #include <linux/pm_runtime.h>
 
 #define DEFAULT_VRAM_THRESHOLD 300 /* in MB */
@@ -37,4 +38,20 @@ int xe_pm_block_on_suspend(struct xe_device *xe);
 void xe_pm_might_block_on_suspend(void);
 int xe_pm_module_init(void);
 
+static inline void __xe_pm_runtime_noop(struct xe_device *xe) {}
+
+DEFINE_GUARD(xe_pm_runtime, struct xe_device *,
+	     xe_pm_runtime_get(_T), xe_pm_runtime_put(_T))
+DEFINE_GUARD(xe_pm_runtime_noresume, struct xe_device *,
+	     xe_pm_runtime_get_noresume(_T), xe_pm_runtime_put(_T))
+DEFINE_GUARD_COND(xe_pm_runtime, _ioctl, xe_pm_runtime_get_ioctl(_T), _RET >= 0)
+
+/*
+ * Used when a function needs to release runtime PM in all possible cases
+ * and error paths, but the wakeref was already acquired by a different
+ * function (i.e., get() has already happened so only a put() is needed).
+ */
+DEFINE_GUARD(xe_pm_runtime_release_only, struct xe_device *,
+	     __xe_pm_runtime_noop(_T), xe_pm_runtime_put(_T));
+
 #endif
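
As a minimal illustration of what these guards are for (not part of the patch; the function and its return values are made up, and the includes assume the sketch lives inside the xe driver), a caller can hold a wakeref for exactly one scope without pairing every return path with xe_pm_runtime_put():

/* Illustrative sketch only. */
#include "xe_device.h"
#include "xe_pm.h"

static int example_with_guard(struct xe_device *xe, bool fast_path)
{
	guard(xe_pm_runtime)(xe);	/* xe_pm_runtime_get(xe) happens here */

	if (fast_path)
		return 0;		/* xe_pm_runtime_put(xe) runs automatically */

	return 1;			/* ...on this return path as well */
}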

drivers/gpu/drm/xe/xe_sriov_pf_migration.c

@@ -46,13 +46,37 @@ bool xe_sriov_pf_migration_supported(struct xe_device *xe)
 {
 	xe_assert(xe, IS_SRIOV_PF(xe));
 
-	return xe->sriov.pf.migration.supported;
+	return IS_ENABLED(CONFIG_DRM_XE_DEBUG) || !xe->sriov.pf.migration.disabled;
 }
 
-static bool pf_check_migration_support(struct xe_device *xe)
+/**
+ * xe_sriov_pf_migration_disable() - Turn off SR-IOV VF migration support on PF.
+ * @xe: the &xe_device instance.
+ * @fmt: format string for the log message, to be combined with following VAs.
+ */
+void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...)
 {
-	/* XXX: for now this is for feature enabling only */
-	return IS_ENABLED(CONFIG_DRM_XE_DEBUG);
+	struct va_format vaf;
+	va_list va_args;
+
+	xe_assert(xe, IS_SRIOV_PF(xe));
+
+	va_start(va_args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &va_args;
+	xe_sriov_notice(xe, "migration %s: %pV\n",
+			IS_ENABLED(CONFIG_DRM_XE_DEBUG) ?
+			"missing prerequisite" : "disabled",
+			&vaf);
+	va_end(va_args);
+
+	xe->sriov.pf.migration.disabled = true;
+}
+
+static void pf_migration_check_support(struct xe_device *xe)
+{
+	if (!xe_device_has_memirq(xe))
+		xe_sriov_pf_migration_disable(xe, "requires memory-based IRQ support");
 }
 
 static void pf_migration_cleanup(void *arg)
@@ -77,7 +101,8 @@ int xe_sriov_pf_migration_init(struct xe_device *xe)
 	xe_assert(xe, IS_SRIOV_PF(xe));
 
-	xe->sriov.pf.migration.supported = pf_check_migration_support(xe);
+	pf_migration_check_support(xe);
+
 	if (!xe_sriov_pf_migration_supported(xe))
 		return 0;

drivers/gpu/drm/xe/xe_sriov_pf_migration.h

@@ -14,6 +14,7 @@ struct xe_sriov_packet;
 
 int xe_sriov_pf_migration_init(struct xe_device *xe);
 bool xe_sriov_pf_migration_supported(struct xe_device *xe);
+void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...);
 int xe_sriov_pf_migration_restore_produce(struct xe_device *xe, unsigned int vfid,
 					   struct xe_sriov_packet *data);
 struct xe_sriov_packet *

drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h

@@ -14,8 +14,8 @@
  * struct xe_sriov_pf_migration - Xe device level VF migration data
  */
 struct xe_sriov_pf_migration {
-	/** @supported: indicates whether VF migration feature is supported */
-	bool supported;
+	/** @disabled: indicates whether VF migration feature is disabled */
+	bool disabled;
 };
 
 /**

drivers/gpu/drm/xe/xe_sriov_vfio.c (new file)

@@ -0,0 +1,80 @@
// SPDX-License-Identifier: MIT
/*
* Copyright © 2025 Intel Corporation
*/
#include <drm/intel/xe_sriov_vfio.h>
#include <linux/cleanup.h>
#include "xe_pci.h"
#include "xe_pm.h"
#include "xe_sriov_pf_control.h"
#include "xe_sriov_pf_helpers.h"
#include "xe_sriov_pf_migration.h"
struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev)
{
return xe_pci_to_pf_device(pdev);
}
EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_get_pf, "xe-vfio-pci");
bool xe_sriov_vfio_migration_supported(struct xe_device *xe)
{
if (!IS_SRIOV_PF(xe))
return false;
return xe_sriov_pf_migration_supported(xe);
}
EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_migration_supported, "xe-vfio-pci");
#define DEFINE_XE_SRIOV_VFIO_FUNCTION(_type, _func, _impl) \
_type xe_sriov_vfio_##_func(struct xe_device *xe, unsigned int vfid) \
{ \
if (!IS_SRIOV_PF(xe)) \
return -EPERM; \
if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) \
return -EINVAL; \
\
guard(xe_pm_runtime_noresume)(xe); \
\
return xe_sriov_pf_##_impl(xe, vfid); \
} \
EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_##_func, "xe-vfio-pci")
DEFINE_XE_SRIOV_VFIO_FUNCTION(int, wait_flr_done, control_wait_flr);
DEFINE_XE_SRIOV_VFIO_FUNCTION(int, suspend_device, control_pause_vf);
DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_device, control_resume_vf);
DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_enter, control_trigger_save_vf);
DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_exit, control_finish_save_vf);
DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_enter, control_trigger_restore_vf);
DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_exit, control_finish_restore_vf);
DEFINE_XE_SRIOV_VFIO_FUNCTION(int, error, control_stop_vf);
DEFINE_XE_SRIOV_VFIO_FUNCTION(ssize_t, stop_copy_size, migration_size);
ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid,
char __user *buf, size_t len)
{
if (!IS_SRIOV_PF(xe))
return -EPERM;
if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))
return -EINVAL;
guard(xe_pm_runtime_noresume)(xe);
return xe_sriov_pf_migration_read(xe, vfid, buf, len);
}
EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_read, "xe-vfio-pci");
ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid,
const char __user *buf, size_t len)
{
if (!IS_SRIOV_PF(xe))
return -EPERM;
if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))
return -EINVAL;
guard(xe_pm_runtime_noresume)(xe);
return xe_sriov_pf_migration_write(xe, vfid, buf, len);
}
EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_write, "xe-vfio-pci");
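
For reference, the DEFINE_XE_SRIOV_VFIO_FUNCTION() macro above is purely a boilerplate saver; expanded by hand for one of its users (suspend_device, shown here slightly reformatted rather than as separate source), each wrapper looks like this:

int xe_sriov_vfio_suspend_device(struct xe_device *xe, unsigned int vfid)
{
	if (!IS_SRIOV_PF(xe))
		return -EPERM;
	if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))
		return -EINVAL;

	/* hold a wakeref (without forcing a resume) for the duration of the call */
	guard(xe_pm_runtime_noresume)(xe);

	return xe_sriov_pf_control_pause_vf(xe, vfid);
}
EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_suspend_device, "xe-vfio-pci");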

drivers/vfio/pci/Kconfig

@@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
 
 source "drivers/vfio/pci/qat/Kconfig"
 
+source "drivers/vfio/pci/xe/Kconfig"
+
 endmenu

drivers/vfio/pci/Makefile

@@ -19,3 +19,5 @@ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
 obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
 
 obj-$(CONFIG_QAT_VFIO_PCI) += qat/
+
+obj-$(CONFIG_XE_VFIO_PCI) += xe/

drivers/vfio/pci/xe/Kconfig (new file)

@@ -0,0 +1,12 @@
# SPDX-License-Identifier: GPL-2.0-only
config XE_VFIO_PCI
	tristate "VFIO support for Intel Graphics"
	depends on DRM_XE && PCI_IOV
	select VFIO_PCI_CORE
	help
	  This option enables a device-specific VFIO driver variant for Intel Graphics.
	  In addition to generic VFIO PCI functionality, it implements the VFIO
	  migration uAPI, allowing userspace to enable migration for
	  Intel Graphics SR-IOV Virtual Functions supported by the Xe driver.

	  If you don't know what to do here, say N.

drivers/vfio/pci/xe/Makefile (new file)

@@ -0,0 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_XE_VFIO_PCI) += xe-vfio-pci.o
xe-vfio-pci-y := main.o

drivers/vfio/pci/xe/main.c (new file)

@@ -0,0 +1,573 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright © 2025 Intel Corporation
*/
#include <linux/anon_inodes.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/sizes.h>
#include <linux/types.h>
#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>
#include <drm/intel/xe_sriov_vfio.h>
#include <drm/intel/pciids.h>
struct xe_vfio_pci_migration_file {
struct file *filp;
/* serializes accesses to migration data */
struct mutex lock;
struct xe_vfio_pci_core_device *xe_vdev;
u8 disabled:1;
};
struct xe_vfio_pci_core_device {
struct vfio_pci_core_device core_device;
struct xe_device *xe;
/* PF internal control uses vfid index starting from 1 */
unsigned int vfid;
u8 deferred_reset:1;
/* protects migration state */
struct mutex state_mutex;
enum vfio_device_mig_state mig_state;
/* protects the reset_done flow */
spinlock_t reset_lock;
struct xe_vfio_pci_migration_file *migf;
};
#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev)
static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf)
{
mutex_lock(&migf->lock);
migf->disabled = true;
mutex_unlock(&migf->lock);
}
static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev)
{
xe_vfio_pci_disable_file(xe_vdev->migf);
fput(xe_vdev->migf->filp);
xe_vdev->migf = NULL;
}
static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev)
{
if (xe_vdev->migf)
xe_vfio_pci_put_file(xe_vdev);
xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
}
static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev)
{
mutex_lock(&xe_vdev->state_mutex);
}
/*
* This function is called in all state_mutex unlock cases to
* handle a 'deferred_reset' if exists.
*/
static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev)
{
again:
spin_lock(&xe_vdev->reset_lock);
if (xe_vdev->deferred_reset) {
xe_vdev->deferred_reset = false;
spin_unlock(&xe_vdev->reset_lock);
xe_vfio_pci_reset(xe_vdev);
goto again;
}
mutex_unlock(&xe_vdev->state_mutex);
spin_unlock(&xe_vdev->reset_lock);
}
static void xe_vfio_pci_reset_done(struct pci_dev *pdev)
{
struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
int ret;
if (!pdev->is_virtfn)
return;
/*
* VF FLR requires additional processing done by PF driver.
* The processing is done after FLR is already finished from PCIe
* perspective.
* In order to avoid a scenario where VF is used while PF processing
* is still in progress, additional synchronization point is needed.
*/
ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid);
if (ret)
dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret);
if (!xe_vdev->vfid)
return;
/*
* As the higher VFIO layers are holding locks across reset and using
* those same locks with the mm_lock we need to prevent ABBA deadlock
* with the state_mutex and mm_lock.
* In case the state_mutex was taken already we defer the cleanup work
* to the unlock flow of the other running context.
*/
spin_lock(&xe_vdev->reset_lock);
xe_vdev->deferred_reset = true;
if (!mutex_trylock(&xe_vdev->state_mutex)) {
spin_unlock(&xe_vdev->reset_lock);
return;
}
spin_unlock(&xe_vdev->reset_lock);
xe_vfio_pci_state_mutex_unlock(xe_vdev);
xe_vfio_pci_reset(xe_vdev);
}
static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
.reset_done = xe_vfio_pci_reset_done,
.error_detected = vfio_pci_core_aer_err_detected,
};
static int xe_vfio_pci_open_device(struct vfio_device *core_vdev)
{
struct xe_vfio_pci_core_device *xe_vdev =
container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
struct vfio_pci_core_device *vdev = &xe_vdev->core_device;
int ret;
ret = vfio_pci_core_enable(vdev);
if (ret)
return ret;
xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
vfio_pci_core_finish_enable(vdev);
return 0;
}
static void xe_vfio_pci_close_device(struct vfio_device *core_vdev)
{
struct xe_vfio_pci_core_device *xe_vdev =
container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
xe_vfio_pci_state_mutex_lock(xe_vdev);
xe_vfio_pci_reset(xe_vdev);
xe_vfio_pci_state_mutex_unlock(xe_vdev);
vfio_pci_core_close_device(core_vdev);
}
static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp)
{
struct xe_vfio_pci_migration_file *migf = filp->private_data;
mutex_destroy(&migf->lock);
kfree(migf);
return 0;
}
static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
{
struct xe_vfio_pci_migration_file *migf = filp->private_data;
ssize_t ret;
if (pos)
return -ESPIPE;
mutex_lock(&migf->lock);
if (migf->disabled) {
mutex_unlock(&migf->lock);
return -ENODEV;
}
ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
mutex_unlock(&migf->lock);
return ret;
}
static const struct file_operations xe_vfio_pci_save_fops = {
.owner = THIS_MODULE,
.read = xe_vfio_pci_save_read,
.release = xe_vfio_pci_release_file,
.llseek = noop_llseek,
};
static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct xe_vfio_pci_migration_file *migf = filp->private_data;
ssize_t ret;
if (pos)
return -ESPIPE;
mutex_lock(&migf->lock);
if (migf->disabled) {
mutex_unlock(&migf->lock);
return -ENODEV;
}
ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
mutex_unlock(&migf->lock);
return ret;
}
static const struct file_operations xe_vfio_pci_resume_fops = {
.owner = THIS_MODULE,
.write = xe_vfio_pci_resume_write,
.release = xe_vfio_pci_release_file,
.llseek = noop_llseek,
};
static const char *vfio_dev_state_str(u32 state)
{
switch (state) {
case VFIO_DEVICE_STATE_RUNNING: return "running";
case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p";
case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy";
case VFIO_DEVICE_STATE_STOP: return "stop";
case VFIO_DEVICE_STATE_RESUMING: return "resuming";
case VFIO_DEVICE_STATE_ERROR: return "error";
default: return "";
}
}
enum xe_vfio_pci_file_type {
XE_VFIO_FILE_SAVE = 0,
XE_VFIO_FILE_RESUME,
};
static struct xe_vfio_pci_migration_file *
xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev,
enum xe_vfio_pci_file_type type)
{
struct xe_vfio_pci_migration_file *migf;
const struct file_operations *fops;
int flags;
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
if (!migf)
return ERR_PTR(-ENOMEM);
fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops;
flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY;
migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags);
if (IS_ERR(migf->filp)) {
struct file *filp = migf->filp;
kfree(migf);
return ERR_CAST(filp);
}
mutex_init(&migf->lock);
migf->xe_vdev = xe_vdev;
xe_vdev->migf = migf;
stream_open(migf->filp->f_inode, migf->filp);
return migf;
}
static struct file *
xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new)
{
u32 cur = xe_vdev->mig_state;
int ret;
dev_dbg(xe_vdev_to_dev(xe_vdev),
"state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new));
/*
* "STOP" handling is reused for "RUNNING_P2P", as the device doesn't
* have the capability to selectively block outgoing p2p DMA transfers.
* While the device is allowing BAR accesses when the VF is stopped, it
* is not processing any new workload requests, effectively stopping
* any outgoing DMA transfers (not just p2p).
* Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and
* will be migrated to target VF during stop-copy.
*/
if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid);
if (ret)
goto err;
return NULL;
}
if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
(cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
return NULL;
if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid);
if (ret)
goto err;
return NULL;
}
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
struct xe_vfio_pci_migration_file *migf;
migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE);
if (IS_ERR(migf)) {
ret = PTR_ERR(migf);
goto err;
}
get_file(migf->filp);
ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid);
if (ret) {
fput(migf->filp);
goto err;
}
return migf->filp;
}
if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
if (xe_vdev->migf)
xe_vfio_pci_put_file(xe_vdev);
ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid);
if (ret)
goto err;
return NULL;
}
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
struct xe_vfio_pci_migration_file *migf;
migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME);
if (IS_ERR(migf)) {
ret = PTR_ERR(migf);
goto err;
}
get_file(migf->filp);
ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid);
if (ret) {
fput(migf->filp);
goto err;
}
return migf->filp;
}
if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
if (xe_vdev->migf)
xe_vfio_pci_put_file(xe_vdev);
ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid);
if (ret)
goto err;
return NULL;
}
WARN(true, "Unknown state transition %d->%d", cur, new);
return ERR_PTR(-EINVAL);
err:
dev_dbg(xe_vdev_to_dev(xe_vdev),
"Failed to transition state: %s->%s err=%d\n",
vfio_dev_state_str(cur), vfio_dev_state_str(new), ret);
return ERR_PTR(ret);
}
static struct file *
xe_vfio_pci_set_device_state(struct vfio_device *core_vdev,
enum vfio_device_mig_state new_state)
{
struct xe_vfio_pci_core_device *xe_vdev =
container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
enum vfio_device_mig_state next_state;
struct file *f = NULL;
int ret;
xe_vfio_pci_state_mutex_lock(xe_vdev);
while (new_state != xe_vdev->mig_state) {
ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state,
new_state, &next_state);
if (ret) {
xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid);
f = ERR_PTR(ret);
break;
}
f = xe_vfio_set_state(xe_vdev, next_state);
if (IS_ERR(f))
break;
xe_vdev->mig_state = next_state;
/* Multiple state transitions with non-NULL file in the middle */
if (f && new_state != xe_vdev->mig_state) {
fput(f);
f = ERR_PTR(-EINVAL);
break;
}
}
xe_vfio_pci_state_mutex_unlock(xe_vdev);
return f;
}
static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev,
enum vfio_device_mig_state *curr_state)
{
struct xe_vfio_pci_core_device *xe_vdev =
container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
xe_vfio_pci_state_mutex_lock(xe_vdev);
*curr_state = xe_vdev->mig_state;
xe_vfio_pci_state_mutex_unlock(xe_vdev);
return 0;
}
static int xe_vfio_pci_get_data_size(struct vfio_device *vdev,
unsigned long *stop_copy_length)
{
struct xe_vfio_pci_core_device *xe_vdev =
container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev);
xe_vfio_pci_state_mutex_lock(xe_vdev);
*stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid);
xe_vfio_pci_state_mutex_unlock(xe_vdev);
return 0;
}
static const struct vfio_migration_ops xe_vfio_pci_migration_ops = {
.migration_set_state = xe_vfio_pci_set_device_state,
.migration_get_state = xe_vfio_pci_get_device_state,
.migration_get_data_size = xe_vfio_pci_get_data_size,
};
static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev)
{
struct vfio_device *core_vdev = &xe_vdev->core_device.vdev;
struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
struct xe_device *xe = xe_sriov_vfio_get_pf(pdev);
if (!xe)
return;
if (!xe_sriov_vfio_migration_supported(xe))
return;
mutex_init(&xe_vdev->state_mutex);
spin_lock_init(&xe_vdev->reset_lock);
/* PF internal control uses vfid index starting from 1 */
xe_vdev->vfid = pci_iov_vf_id(pdev) + 1;
xe_vdev->xe = xe;
core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
core_vdev->mig_ops = &xe_vfio_pci_migration_ops;
}
static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev)
{
if (!xe_vdev->vfid)
return;
mutex_destroy(&xe_vdev->state_mutex);
}
static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev)
{
struct xe_vfio_pci_core_device *xe_vdev =
container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
xe_vfio_pci_migration_init(xe_vdev);
return vfio_pci_core_init_dev(core_vdev);
}
static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev)
{
struct xe_vfio_pci_core_device *xe_vdev =
container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
xe_vfio_pci_migration_fini(xe_vdev);
}
static const struct vfio_device_ops xe_vfio_pci_ops = {
.name = "xe-vfio-pci",
.init = xe_vfio_pci_init_dev,
.release = xe_vfio_pci_release_dev,
.open_device = xe_vfio_pci_open_device,
.close_device = xe_vfio_pci_close_device,
.ioctl = vfio_pci_core_ioctl,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
.match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct xe_vfio_pci_core_device *xe_vdev;
int ret;
xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev,
&xe_vfio_pci_ops);
if (IS_ERR(xe_vdev))
return PTR_ERR(xe_vdev);
dev_set_drvdata(&pdev->dev, &xe_vdev->core_device);
ret = vfio_pci_core_register_device(&xe_vdev->core_device);
if (ret) {
vfio_put_device(&xe_vdev->core_device.vdev);
return ret;
}
return 0;
}
static void xe_vfio_pci_remove(struct pci_dev *pdev)
{
struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
vfio_pci_core_unregister_device(&xe_vdev->core_device);
vfio_put_device(&xe_vdev->core_device.vdev);
}
#define INTEL_PCI_VFIO_DEVICE(_id) { \
PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \
}
static const struct pci_device_id xe_vfio_pci_table[] = {
INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE),
INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE),
INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE),
{}
};
MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table);
static struct pci_driver xe_vfio_pci_driver = {
.name = "xe-vfio-pci",
.id_table = xe_vfio_pci_table,
.probe = xe_vfio_pci_probe,
.remove = xe_vfio_pci_remove,
.err_handler = &xe_vfio_pci_err_handlers,
.driver_managed_dma = true,
};
module_pci_driver(xe_vfio_pci_driver);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>");
MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics");

include/drm/intel/xe_sriov_vfio.h (new file)

@@ -0,0 +1,143 @@
/* SPDX-License-Identifier: MIT */
/*
* Copyright © 2025 Intel Corporation
*/
#ifndef _XE_SRIOV_VFIO_H_
#define _XE_SRIOV_VFIO_H_
#include <linux/types.h>
struct pci_dev;
struct xe_device;
/**
* xe_sriov_vfio_get_pf() - Get PF &xe_device.
* @pdev: the VF &pci_dev device
*
* Return: pointer to PF &xe_device, NULL otherwise.
*/
struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev);
/**
* xe_sriov_vfio_migration_supported() - Check if migration is supported.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
*
* Return: true if migration is supported, false otherwise.
*/
bool xe_sriov_vfio_migration_supported(struct xe_device *xe);
/**
* xe_sriov_vfio_wait_flr_done() - Wait for VF FLR completion.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* This function will wait until VF FLR is processed by PF on all tiles (or
* until timeout occurs).
*
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_vfio_wait_flr_done(struct xe_device *xe, unsigned int vfid);
/**
* xe_sriov_vfio_suspend_device() - Suspend VF.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* This function will pause VF on all tiles/GTs.
*
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_vfio_suspend_device(struct xe_device *xe, unsigned int vfid);
/**
* xe_sriov_vfio_resume_device() - Resume VF.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* This function will resume VF on all tiles.
*
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_vfio_resume_device(struct xe_device *xe, unsigned int vfid);
/**
* xe_sriov_vfio_stop_copy_enter() - Initiate a VF device migration data save.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_vfio_stop_copy_enter(struct xe_device *xe, unsigned int vfid);
/**
* xe_sriov_vfio_stop_copy_exit() - Finish a VF device migration data save.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_vfio_stop_copy_exit(struct xe_device *xe, unsigned int vfid);
/**
* xe_sriov_vfio_resume_data_enter() - Initiate a VF device migration data restore.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_vfio_resume_data_enter(struct xe_device *xe, unsigned int vfid);
/**
* xe_sriov_vfio_resume_data_exit() - Finish a VF device migration data restore.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_vfio_resume_data_exit(struct xe_device *xe, unsigned int vfid);
/**
* xe_sriov_vfio_error() - Move VF device to error state.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* Reset is needed to move it out of error state.
*
* Return: 0 on success or a negative error code on failure.
*/
int xe_sriov_vfio_error(struct xe_device *xe, unsigned int vfid);
/**
* xe_sriov_vfio_data_read() - Read migration data from the VF device.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
* @buf: start address of userspace buffer
* @len: requested read size from userspace
*
* Return: number of bytes that have been successfully read,
* 0 if no more migration data is available, -errno on failure.
*/
ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid,
char __user *buf, size_t len);
/**
* xe_sriov_vfio_data_write() - Write migration data to the VF device.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
* @buf: start address of userspace buffer
* @len: requested write size from userspace
*
* Return: number of bytes that have been successfully written, -errno on failure.
*/
ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid,
const char __user *buf, size_t len);
/**
* xe_sriov_vfio_stop_copy_size() - Get a size estimate of VF device migration data.
* @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
* @vfid: the VF identifier (can't be 0)
*
* Return: migration data size in bytes or a negative error code on failure.
*/
ssize_t xe_sriov_vfio_stop_copy_size(struct xe_device *xe, unsigned int vfid);
#endif
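
To make the intended calling order concrete, here is an illustrative save-side sequence (not part of the patch; error handling is trimmed and the destination buffer handling is hypothetical), mirroring how drivers/vfio/pci/xe/main.c drives this API through the VFIO migration state machine:

/* Illustrative sketch only: pause the VF, then drain its migration data. */
#include <drm/intel/xe_sriov_vfio.h>

static int example_save_vf(struct xe_device *xe, unsigned int vfid,
			   char __user *buf, size_t len)
{
	ssize_t n;
	int ret;

	ret = xe_sriov_vfio_suspend_device(xe, vfid);	/* stop new VF work */
	if (ret)
		return ret;

	ret = xe_sriov_vfio_stop_copy_enter(xe, vfid);	/* start producing data */
	if (ret)
		return ret;

	do {	/* a real caller would advance buf between reads */
		n = xe_sriov_vfio_data_read(xe, vfid, buf, len);
	} while (n > 0);

	return n < 0 ? n : xe_sriov_vfio_stop_copy_exit(xe, vfid);
}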