iommu/pages: Add support for incoherent IOMMU page table walkers

Some IOMMU HW cannot snoop the CPU cache when it walks the IO page tables.
The CPU is required to flush the cache to make changes visible to the HW.

Provide some helpers from iommu-pages to manage this. The helpers combine
both the ARM and x86 (used in Intel VT-d) versions of the cache flushing
under a single API.

The ARM version uses the DMA API to perform the cache flushing, on the
assumption that the IOMMU is using a direct mapping and is already marked
incoherent. The helpers do the DMA API calls to set things up and keep
track of DMA-mapped folios using a flag in the ioptdesc so that unmapping
on error paths is cleaner.

The Intel version calls the arch cache flush directly and has no need to
clean up prior to destruction.
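
As a rough usage sketch, an incoherent page table walker would do
something like the below. The my_*() functions, the u64 PTE format and
the SZ_4K table size are illustrative only; the iommu_pages_*(),
iommu_alloc_pages_sz() and iommu_free_pages() calls are the real API:

#include <linux/device.h>
#include <linux/sizes.h>
#include "iommu-pages.h"

/* Illustrative walker code, not part of this patch */
static u64 *my_alloc_table(struct device *iommu_dev)
{
	u64 *table;
	int ret;

	table = iommu_alloc_pages_sz(GFP_KERNEL, SZ_4K);
	if (!table)
		return NULL;

	/* DMA map (ARM) or clflush (x86) the zeroed table */
	ret = iommu_pages_start_incoherent(table, iommu_dev);
	if (ret) {
		iommu_free_pages(table);
		return NULL;
	}
	return table;
}

static void my_install_pte(struct device *iommu_dev, u64 *table,
			   unsigned int idx, u64 pte)
{
	WRITE_ONCE(table[idx], pte);
	/* Flush only the bytes covering the updated PTE */
	iommu_pages_flush_incoherent(iommu_dev, table, idx * sizeof(*table),
				     sizeof(*table));
}

static void my_free_table(struct device *iommu_dev, u64 *table)
{
	/* Undoes the DMA mapping, if any, then frees the page */
	iommu_pages_free_incoherent(table, iommu_dev);
}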

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

drivers/iommu/iommu-pages.c

@@ -4,6 +4,7 @@
* Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#include "iommu-pages.h"
#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>
@@ -22,6 +23,11 @@ IOPTDESC_MATCH(memcg_data, memcg_data);
#undef IOPTDESC_MATCH
static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
static inline size_t ioptdesc_mem_size(struct ioptdesc *desc)
{
return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT);
}
/**
* iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from a
* specific NUMA node
@@ -36,6 +42,7 @@ static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
*/
void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
{
struct ioptdesc *iopt;
unsigned long pgcnt;
struct folio *folio;
unsigned int order;
@@ -60,6 +67,9 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
if (unlikely(!folio))
return NULL;
iopt = folio_ioptdesc(folio);
iopt->incoherent = false;
/*
* All page allocations that should be reported as "iommu-pagetables"
* to userspace must use one of the functions below. This includes
@@ -82,6 +92,9 @@ static void __iommu_free_desc(struct ioptdesc *iopt)
struct folio *folio = ioptdesc_folio(iopt);
const unsigned long pgcnt = folio_nr_pages(folio);
if (IOMMU_PAGES_USE_DMA_API)
WARN_ON_ONCE(iopt->incoherent);
mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt);
lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt);
folio_put(folio);
@@ -117,3 +130,124 @@ void iommu_put_pages_list(struct iommu_pages_list *list)
__iommu_free_desc(iopt);
}
EXPORT_SYMBOL_GPL(iommu_put_pages_list);
/**
* iommu_pages_start_incoherent - Set up the page for cache-incoherent operation
* @virt: The page to setup
* @dma_dev: The iommu device
*
* For incoherent memory this will use the DMA API to manage the cache flushing
* on some arches. This is a lot of complexity compared to just calling
* arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers
* have been doing. The DMA API requires keeping track of the DMA map and
* freeing it when required. This keeps track of the dma map inside the ioptdesc
* so that error paths are simple for the caller.
*/
int iommu_pages_start_incoherent(void *virt, struct device *dma_dev)
{
struct ioptdesc *iopt = virt_to_ioptdesc(virt);
dma_addr_t dma;
if (WARN_ON(iopt->incoherent))
return -EINVAL;
if (!IOMMU_PAGES_USE_DMA_API) {
iommu_pages_flush_incoherent(dma_dev, virt, 0,
ioptdesc_mem_size(iopt));
} else {
dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt),
DMA_TO_DEVICE);
if (dma_mapping_error(dma_dev, dma))
return -EINVAL;
/*
* The DMA API is not allowed to do anything other than DMA
* direct. It would be nice to also check
* dev_is_dma_coherent(dma_dev).
*/
if (WARN_ON(dma != virt_to_phys(virt))) {
dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt),
DMA_TO_DEVICE);
return -EOPNOTSUPP;
}
}
iopt->incoherent = 1;
return 0;
}
EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent);
/**
* iommu_pages_start_incoherent_list - Make a list of pages incoherent
* @list: The list of pages to setup
* @dma_dev: The iommu device
*
* Perform iommu_pages_start_incoherent() across all of the list.
*
* If this fails the caller must call iommu_pages_stop_incoherent_list().
*/
int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
struct device *dma_dev)
{
struct ioptdesc *cur;
int ret;
list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
if (WARN_ON(cur->incoherent))
continue;
ret = iommu_pages_start_incoherent(
folio_address(ioptdesc_folio(cur)), dma_dev);
if (ret)
return ret;
}
return 0;
}
EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list);
/**
* iommu_pages_stop_incoherent_list - Undo incoherence across a list
* @list: The list of pages to release
* @dma_dev: The iommu device
*
* Revert iommu_pages_start_incoherent() across all of the list. Pages for
* which iommu_pages_start_incoherent() was not called, or did not succeed,
* are ignored.
*/
#if IOMMU_PAGES_USE_DMA_API
void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
struct device *dma_dev)
{
struct ioptdesc *cur;
list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
struct folio *folio = ioptdesc_folio(cur);
if (!cur->incoherent)
continue;
dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)),
ioptdesc_mem_size(cur), DMA_TO_DEVICE);
cur->incoherent = 0;
}
}
EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list);
/**
* iommu_pages_free_incoherent - Free an incoherent page
* @virt: virtual address of the page to be freed.
* @dma_dev: The iommu device
*
* If the page is incoherent it is made coherent again and then freed.
*/
void iommu_pages_free_incoherent(void *virt, struct device *dma_dev)
{
struct ioptdesc *iopt = virt_to_ioptdesc(virt);
if (iopt->incoherent) {
dma_unmap_single(dma_dev, virt_to_phys(virt),
ioptdesc_mem_size(iopt), DMA_TO_DEVICE);
iopt->incoherent = 0;
}
__iommu_free_desc(iopt);
}
EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent);
#endif

drivers/iommu/iommu-pages.h

@@ -21,7 +21,10 @@ struct ioptdesc {
struct list_head iopt_freelist_elm;
unsigned long __page_mapping;
pgoff_t __index;
union {
u8 incoherent;
pgoff_t __index;
};
void *_private;
unsigned int __page_type;
@@ -98,4 +101,48 @@ static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size)
return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size);
}
#endif /* __IOMMU_PAGES_H */
int iommu_pages_start_incoherent(void *virt, struct device *dma_dev);
int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
struct device *dma_dev);
#ifdef CONFIG_X86
#define IOMMU_PAGES_USE_DMA_API 0
#include <linux/cacheflush.h>
static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
void *virt, size_t offset,
size_t len)
{
clflush_cache_range(virt + offset, len);
}
static inline void
iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
struct device *dma_dev)
{
/*
* For performance, leave the incoherent flag alone, which turns this into
* a NOP. For x86 the rest of the stop/free flow ignores the flag.
*/
}
static inline void iommu_pages_free_incoherent(void *virt,
struct device *dma_dev)
{
iommu_free_pages(virt);
}
#else
#define IOMMU_PAGES_USE_DMA_API 1
#include <linux/dma-mapping.h>
static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
void *virt, size_t offset,
size_t len)
{
dma_sync_single_for_device(dma_dev, virt_to_phys(virt) + offset, len,
DMA_TO_DEVICE);
}
void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
struct device *dma_dev);
void iommu_pages_free_incoherent(void *virt, struct device *dma_dev);
#endif
#endif /* __IOMMU_PAGES_H */
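
A minimal sketch of the list error-unwind described in the
iommu_pages_start_incoherent_list() kernel-doc. The my_start_tables()
wrapper is hypothetical and how the list is populated is left to the
driver; neither is part of this patch:

#include <linux/device.h>
#include "iommu-pages.h"

/* Illustrative caller, not part of this patch */
static int my_start_tables(struct device *iommu_dev,
			   struct iommu_pages_list *new_tables)
{
	int ret;

	ret = iommu_pages_start_incoherent_list(new_tables, iommu_dev);
	if (ret) {
		/*
		 * A partial failure can leave some pages DMA mapped; the
		 * stop helper unmaps only the pages that were marked
		 * incoherent, and is an empty inline on the x86 build.
		 */
		iommu_pages_stop_incoherent_list(new_tables, iommu_dev);
		return ret;
	}
	return 0;
}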