iommu/vt-d: Use the generic iommu page table
Replace the VT-d iommu_domain implementation of the second stage and
first stage page tables with the iommupt VTDSS and x86_64 page
tables. The x86_64 format is shared with the AMD driver.
There are a couple of notable things in VT-d:
- Like AMD, the second stage format is not sign extended; unlike AMD, it
  cannot decode a full 64 bits. The first stage format is a normal sign
  extended x86 page table.
- The HW caps can indicate how many levels, how many address bits and what
  leaf page sizes are supported in HW. As before, the highest number of
  levels that can translate the entire supported address width is used.
  The supported page sizes are adjusted directly from the dedicated
  first/second stage cap bits.
- VT-d requires flushing 'write buffers'. This logic is left unchanged:
  the write buffer flushes on any gather flush or through iotlb_sync_map.
- Like ARM, VT-d has an optional non-coherent page table walker that
  requires cache flushing. This is supported through PT_FEAT_DMA_INCOHERENT,
  the same as ARM; however, x86 can't use the DMA API for the flush, it
  must call the arch function clflush_cache_range() (see the sketch after
  this list).
- PT_FEAT_DYNAMIC_TOP can probably be supported on VT-d someday for the
  second stage when it uses 128 bit atomic stores for the HW context
  structures.
- PT_FEAT_VTDSS_FORCE_WRITEABLE is used to work around ERRATA_772415_SPR17.
- A kernel command line parameter "sp_off" disables all page sizes except
  4k.
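To make the non-coherent walker point concrete, here is a minimal sketch
of the kind of flushing involved. The helper name and calling convention
are hypothetical, not the iommupt API; clflush_cache_range() is the real
x86 arch function:

  #include <asm/cacheflush.h>

  /*
   * Hypothetical example, not the iommupt code: after writing a run of
   * PTEs, force them out of the CPU caches so a non-coherent HW walker
   * reads the current values from memory. The DMA API cannot be used
   * for this on x86, so the arch function is called directly.
   */
  static void example_flush_new_ptes(u64 *ptes, unsigned int nptes)
  {
          clflush_cache_range(ptes, nptes * sizeof(*ptes));
  }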
Remove all the unused iommu_domain page table code. The debugfs paths have
their own independent page table walker that is left alone for now.
This corrects a race with the non-coherent walker that the ARM
implementations have fixed:
    CPU 0                                CPU 1
  pfn_to_dma_pte()                     pfn_to_dma_pte()
    pte = &parent[offset];
    if (!dma_pte_present(pte)) {
         try_cmpxchg64(&pte->val)
                                         pte = &parent[offset];
                                         .. dma_pte_present(pte) ..
                                         [...]
                                         // iommu_map() completes
                                         // Device does DMA
    domain_flush_cache(pte)
The CPU 1 mapping operation shares a page table level with the CPU 0
mapping operation. CPU 0 installed a new page table level but has not
flushed it yet. CPU 1 returns from iommu_map() and the device does DMA.
The non-coherent walker fails to see the new table level installed by
CPU 0 and fails the DMA with non-present.
The iommupt PT_FEAT_DMA_INCOHERENT implementation uses the ARM design of
storing a flag when CPU 0 completes the flush. If the flag is not set,
CPU 1 will also flush to ensure the HW can fully walk to the PTE being
installed.
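As a minimal sketch of that flag scheme (all names here are illustrative,
not the actual iommupt code), assume a software-available bit in the table
PTE that the HW ignores:

  #define EX_PTE_SW_FLUSHED BIT_ULL(58)   /* SW-available bit, example only */

  /* CPU 0: install a new table level, then publish that it was flushed */
  static void example_install_table(u64 *ptep, u64 new_table)
  {
          u64 old = 0;

          /* new_table's page is assumed to be flushed already */
          if (!try_cmpxchg64(ptep, &old, new_table))
                  return;                 /* another CPU won the race */
          clflush_cache_range(ptep, sizeof(*ptep));
          WRITE_ONCE(*ptep, new_table | EX_PTE_SW_FLUSHED);
          clflush_cache_range(ptep, sizeof(*ptep));
  }

  /* CPU 1: before relying on a present entry, make sure it was flushed */
  static void example_ensure_flushed(u64 *ptep)
  {
          if (!(READ_ONCE(*ptep) & EX_PTE_SW_FLUSHED))
                  clflush_cache_range(ptep, sizeof(*ptep));
  }

With this, CPU 1 either sees the flag and knows the new level is visible
to the HW, or performs the flush itself; either way the device cannot
observe a stale non-present entry.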
Cc: Tina Zhang <tina.zhang@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
parent ef7bfe5bbf
commit d373449d8e
@@ -13,6 +13,10 @@ config INTEL_IOMMU
 	bool "Support for Intel IOMMU using DMA Remapping Devices"
 	depends on PCI_MSI && ACPI && X86
 	select IOMMU_API
+	select GENERIC_PT
+	select IOMMU_PT
+	select IOMMU_PT_X86_64
+	select IOMMU_PT_VTDSS
 	select IOMMU_IOVA
 	select IOMMU_IOPF
 	select IOMMUFD_DRIVER if IOMMUFD
[File diff suppressed because it is too large]
@@ -23,8 +23,8 @@
 #include <linux/xarray.h>
 #include <linux/perf_event.h>
 #include <linux/pci.h>
+#include <linux/generic_pt/iommu.h>
 
-#include <asm/cacheflush.h>
 #include <asm/iommu.h>
 #include <uapi/linux/iommufd.h>
 
@@ -595,22 +595,20 @@ struct qi_batch {
 };
 
 struct dmar_domain {
 	int nid;			/* node id */
+	union {
+		struct iommu_domain domain;
+		struct pt_iommu iommu;
+		/* First stage page table */
+		struct pt_iommu_x86_64 fspt;
+		/* Second stage page table */
+		struct pt_iommu_vtdss sspt;
+	};
+
 	struct xarray iommu_array;	/* Attached IOMMU array */
-
-	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
-	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
-	u8 set_pte_snp:1;
-	u8 use_first_level:1;		/* DMA translation for the domain goes
-					 * through the first level page table,
-					 * otherwise, goes through the second
-					 * level.
-					 */
+	u8 force_snooping:1;		/* Create PASID entry with snoop control */
 	u8 dirty_tracking:1;		/* Dirty tracking is enabled */
 	u8 nested_parent:1;		/* Has other domains nested on it */
 	u8 has_mappings:1;		/* Has mappings configured through
 					 * iommu_map() interface.
 					 */
-	u8 iotlb_sync_map:1;		/* Need to flush IOTLB cache or write
-					 * buffer when creating mappings.
-					 */
@@ -623,26 +621,9 @@
 	struct list_head cache_tags;	/* Cache tag list */
 	struct qi_batch *qi_batch;	/* Batched QI descriptors */
 
-	int iommu_superpage;		/* Level of superpages supported:
-					   0 == 4KiB (no superpages), 1 == 2MiB,
-					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 	union {
 		/* DMA remapping domain */
 		struct {
-			/* virtual address */
-			struct dma_pte *pgd;
-			/* max guest address width */
-			int gaw;
-			/*
-			 * adjusted guest address width:
-			 *   0: level 2 30-bit
-			 *   1: level 3 39-bit
-			 *   2: level 4 48-bit
-			 *   3: level 5 57-bit
-			 */
-			int agaw;
-			/* maximum mapped address */
-			u64 max_addr;
 			/* Protect the s1_domains list */
 			spinlock_t s1_lock;
 			/* Track s1_domains nested on this domain */
@@ -664,10 +645,10 @@
 			struct mmu_notifier notifier;
 		};
 	};
-
-	struct iommu_domain domain;	/* generic domain data structure for
-					   iommu core */
 };
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain);
 
 /*
  * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
@@ -866,11 +847,6 @@ struct dma_pte {
 	u64 val;
 };
 
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
-	pte->val = 0;
-}
-
 static inline u64 dma_pte_addr(struct dma_pte *pte)
 {
 #ifdef CONFIG_64BIT
@@ -886,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte)
 	return (pte->val & 3) != 0;
 }
 
-static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
-						   unsigned long flags)
-{
-	if (flags & IOMMU_DIRTY_NO_CLEAR)
-		return (pte->val & DMA_SL_PTE_DIRTY) != 0;
-
-	return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
-				  (unsigned long *)&pte->val);
-}
-
 static inline bool dma_pte_superpage(struct dma_pte *pte)
 {
 	return (pte->val & DMA_PTE_LARGE_PAGE);
 }
 
-static inline bool first_pte_in_page(struct dma_pte *pte)
-{
-	return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
-}
-
-static inline int nr_pte_to_next_page(struct dma_pte *pte)
-{
-	return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
-		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
-}
-
 static inline bool context_present(struct context_entry *context)
 {
 	return (context->lo & 1);
@@ -927,11 +882,6 @@ static inline int agaw_to_level(int agaw)
 	return agaw + 2;
 }
 
-static inline int agaw_to_width(int agaw)
-{
-	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
-}
-
 static inline int width_to_agaw(int width)
 {
 	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
@@ -947,25 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level)
 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 }
 
-static inline u64 level_mask(int level)
-{
-	return -1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 level_size(int level)
-{
-	return 1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 align_to_level(u64 pfn, int level)
-{
-	return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
-	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
-}
 
 static inline void context_set_present(struct context_entry *context)
 {
@@ -29,11 +29,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
 
 	device_block_translation(dev);
 
-	if (iommu->agaw < dmar_domain->s2_domain->agaw) {
-		dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
-		return -ENODEV;
-	}
-
 	/*
 	 * Stage-1 domain cannot work alone, it is nested on a s2_domain.
 	 * The s2_domain will be used in nested translation, hence needs
@@ -483,11 +483,12 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
 				   struct dmar_domain *domain,
 				   struct device *dev, u32 pasid)
 {
+	struct pt_iommu_vtdss_hw_info pt_info;
 	struct pasid_entry *pte;
-	struct dma_pte *pgd;
-	u64 pgd_val;
 	u16 did;
 
+	pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+
 	/*
 	 * If hardware advertises no support for second level
 	 * translation, return directly.
@@ -498,8 +499,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
 		return -EINVAL;
 	}
 
-	pgd = domain->pgd;
-	pgd_val = virt_to_phys(pgd);
 	did = domain_id_iommu(domain, iommu);
 
 	spin_lock(&iommu->lock);
@@ -514,7 +513,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
 		return -EBUSY;
 	}
 
-	pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw,
+	pasid_pte_config_second_level(iommu, pte, pt_info.ssptptr, pt_info.aw,
 				      did, domain->dirty_tracking);
 	spin_unlock(&iommu->lock);
 
@@ -528,11 +527,12 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
 				     struct device *dev, u16 old_did,
 				     u32 pasid)
 {
+	struct pt_iommu_vtdss_hw_info pt_info;
 	struct pasid_entry *pte, new_pte;
-	struct dma_pte *pgd;
-	u64 pgd_val;
 	u16 did;
 
+	pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+
 	/*
 	 * If hardware advertises no support for second level
 	 * translation, return directly.
@@ -543,13 +543,10 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
 		return -EINVAL;
 	}
 
-	pgd = domain->pgd;
-	pgd_val = virt_to_phys(pgd);
 	did = domain_id_iommu(domain, iommu);
 
-	pasid_pte_config_second_level(iommu, &new_pte, pgd_val,
-				      domain->agaw, did,
-				      domain->dirty_tracking);
+	pasid_pte_config_second_level(iommu, &new_pte, pt_info.ssptptr,
+				      pt_info.aw, did, domain->dirty_tracking);
 
 	spin_lock(&iommu->lock);
 	pte = intel_pasid_get_entry(dev, pasid);
@@ -747,10 +744,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
 				   struct dmar_domain *s2_domain,
 				   u16 did)
 {
-	struct dma_pte *pgd = s2_domain->pgd;
+	struct pt_iommu_vtdss_hw_info pt_info;
 
 	lockdep_assert_held(&iommu->lock);
 
+	pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info);
+
 	pasid_clear_entry(pte);
 
 	if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
|
@ -770,10 +769,10 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
|
|||
if (s2_domain->force_snooping)
|
||||
pasid_set_pgsnp(pte);
|
||||
|
||||
pasid_set_slptr(pte, virt_to_phys(pgd));
|
||||
pasid_set_slptr(pte, pt_info.ssptptr);
|
||||
pasid_set_fault_enable(pte);
|
||||
pasid_set_domain_id(pte, did);
|
||||
pasid_set_address_width(pte, s2_domain->agaw);
|
||||
pasid_set_address_width(pte, pt_info.aw);
|
||||
pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
|
||||
if (s2_domain->dirty_tracking)
|
||||
pasid_set_ssade(pte);
|
||||
|
|
|
|||