iommupt: Add the Intel VT-d second stage page table format

The VT-d second stage format is almost the same as the x86 PAE format,
except the bit encodings in the PTE are different and a few new PTE
features, like force coherency are present.

Among all the formats it is unique in not having a designated present bit.

Comparing the performance of several operations to the existing version:

iommu_map()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     53,66    ,      50,64      ,  21.21
     2^21,     59,70    ,      56,67      ,  16.16
     2^30,     54,66    ,      52,63      ,  17.17
 256*2^12,    384,524   ,     337,516     ,  34.34
 256*2^21,    387,632   ,     336,626     ,  46.46
 256*2^30,    376,629   ,     323,623     ,  48.48

iommu_unmap()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     67,86    ,      63,84      ,  25.25
     2^21,     64,84    ,      59,80      ,  26.26
     2^30,     59,78    ,      56,74      ,  24.24
 256*2^12,    216,335   ,     198,317     ,  37.37
 256*2^21,    245,350   ,     232,344     ,  32.32
 256*2^30,    248,345   ,     226,339     ,  33.33

Cc: Tina Zhang <tina.zhang@intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
This commit is contained in:
Jason Gunthorpe 2025-10-23 15:22:33 -03:00 committed by Joerg Roedel
parent efa03dab7c
commit 5448c1558f
8 changed files with 366 additions and 0 deletions

View File

@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y
CONFIG_DEBUG_GENERIC_PT=y
CONFIG_IOMMU_PT=y
CONFIG_IOMMU_PT_AMDV1=y
CONFIG_IOMMU_PT_VTDSS=y
CONFIG_IOMMU_PT_X86_64=y
CONFIG_IOMMU_PT_KUNIT_TEST=y

View File

@ -42,6 +42,16 @@ config IOMMU_PT_AMDV1
Selected automatically by an IOMMU driver that uses this format.
config IOMMU_PT_VTDSS
tristate "IOMMU page table for Intel VT-d Second Stage"
depends on !GENERIC_ATOMIC64 # for cmpxchg64
help
iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5
level Second Stage page table. It is similar to the X86_64 format with
4K/2M/1G page sizes.
Selected automatically by an IOMMU driver that uses this format.
config IOMMU_PT_X86_64
tristate "IOMMU page table for x86 64-bit, 4/5 levels"
depends on !GENERIC_ATOMIC64 # for cmpxchg64
@ -57,6 +67,7 @@ config IOMMU_PT_KUNIT_TEST
depends on KUNIT
depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS
default KUNIT_ALL_TESTS
help
Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the

View File

@ -3,6 +3,8 @@
iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss
iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
IOMMU_PT_KUNIT_TEST :=

View File

@ -0,0 +1,21 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
*
*/
#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
#define __GENERIC_PT_FMT_DEFS_VTDSS_H
#include <linux/generic_pt/common.h>
#include <linux/types.h>
/* IOVA and output addresses are always 64 bit, regardless of CPU word size */
typedef u64 pt_vaddr_t;
typedef u64 pt_oaddr_t;

/*
 * Precomputed state for writing PTEs, built once per map operation by
 * vtdss_pt_iommu_set_prot().
 */
struct vtdss_pt_write_attrs {
	/* Permission bits (R/W/SNP) OR'd into every installed leaf entry */
	u64 descriptor_bits;
	/* Allocation flags used when new table memory is needed */
	gfp_t gfp;
};
#define pt_write_attrs vtdss_pt_write_attrs
#endif

View File

@ -0,0 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
*/
/* Instantiate the generic page table implementation for the VT-d SS format */
#define PT_FMT vtdss
/* Features a driver may enable on this format; meanings are in common.h */
#define PT_SUPPORTED_FEATURES \
	(BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \
	 BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))

#include "iommu_template.h"

View File

@ -0,0 +1,292 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
*
* Intel VT-d Second Stage 5/4 level page table
*
* This is described in
* Section "3.7 Second-Stage Translation"
* Section "9.8 Second-Stage Paging Entries"
*
* Of the "Intel Virtualization Technology for Directed I/O Architecture
* Specification".
*
* The named levels in the spec map to the pts->level as:
* Table/SS-PTE - 0
* Directory/SS-PDE - 1
* Directory Ptr/SS-PDPTE - 2
* PML4/SS-PML4E - 3
* PML5/SS-PML5E - 4
*/
#ifndef __GENERIC_PT_FMT_VTDSS_H
#define __GENERIC_PT_FMT_VTDSS_H
#include "defs_vtdss.h"
#include "../pt_defs.h"
#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/log2.h>
enum {
	/* HW output addresses are at most 52 bits (bits 51:12 in the PTE) */
	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
	/* Largest IOVA space: 57 bits with a 5 level table */
	PT_MAX_VA_ADDRESS_LG2 = 57,
	PT_ITEM_WORD_SIZE = sizeof(u64),
	/* Level 4 is the SS-PML5E table (see the mapping in the header) */
	PT_MAX_TOP_LEVEL = 4,
	/* Leaf granule and table size are both 4K */
	PT_GRANULE_LG2SZ = 12,
	PT_TABLEMEM_LG2SZ = 12,
	/* SSPTPTR is 4k aligned and limited by HAW */
	PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
};

/* Shared descriptor bits, present in every entry kind */
enum {
	VTDSS_FMT_R = BIT(0),
	VTDSS_FMT_W = BIT(1),
	VTDSS_FMT_A = BIT(8),
	VTDSS_FMT_D = BIT(9),
	VTDSS_FMT_SNP = BIT(11),
	/* Output address / next table pointer, bits 51:12 */
	VTDSS_FMT_OA = GENMASK_ULL(51, 12),
};

/* PDPTE/PDE only: set when the entry is a large page, not a table pointer */
enum {
	VTDSS_FMT_PS = BIT(7),
};
#define common_to_vtdss_pt(common_ptr) \
container_of_const(common_ptr, struct pt_vtdss, common)
#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)
/*
 * Physical address of the next-level table this entry points at: the OA
 * field scaled up by the 4K table size.
 */
static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
{
	pt_oaddr_t pfn = FIELD_GET(VTDSS_FMT_OA, pts->entry);

	return oalog2_mul(pfn, PT_TABLEMEM_LG2SZ);
}
#define pt_table_pa vtdss_pt_table_pa
/*
 * Output address of a leaf entry: the OA field scaled up by the 4K
 * granule size.
 */
static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
{
	pt_oaddr_t pfn = FIELD_GET(VTDSS_FMT_OA, pts->entry);

	return oalog2_mul(pfn, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa vtdss_pt_entry_oa
/*
 * Only the bottom three levels (SS-PTE/SS-PDE/SS-PDPTE) can hold a page:
 * 4K, 2M and 1G respectively. PML4/PML5 entries are always table pointers.
 */
static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
{
	return pts->level < 3;
}
#define pt_can_have_leaf vtdss_pt_can_have_leaf
static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
{
return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 vtdss_pt_num_items_lg2
/*
 * Classify the entry at pts->index, caching its raw value in pts->entry.
 *
 * This format has no dedicated present bit: an all-zero entry (R=0, W=0)
 * is non-present. At level 0 every non-zero entry is a page; above that
 * the PS bit distinguishes a large page from a next-level table pointer.
 */
static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
{
	const u64 *tablep = pt_cur_table(pts, u64);
	u64 entry;

	/* Single READ_ONCE so classification and pts->entry agree */
	pts->entry = entry = READ_ONCE(tablep[pts->index]);
	if (!entry)
		return PT_ENTRY_EMPTY;
	if (pts->level == 0 ||
	    (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
		return PT_ENTRY_OA;
	return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw vtdss_pt_load_entry_raw
/*
 * Write a leaf (page) entry mapping @oa with the permissions prepared in
 * @attrs. Above level 0 the PS bit is set to mark a 2M/1G page. The entry
 * is published with a single WRITE_ONCE so a concurrent walker never sees
 * a half-written value.
 */
static inline void
vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			    unsigned int oasz_lg2,
			    const struct pt_write_attrs *attrs)
{
	u64 *tablep = pt_cur_table(pts, u64);
	u64 entry;

	/* Debug-mode validation of alignment/size; no-op on bad args */
	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;

	if (pts->level != 0)
		entry |= VTDSS_FMT_PS;

	WRITE_ONCE(tablep[pts->index], entry);
	pts->entry = entry;
}
#define pt_install_leaf_entry vtdss_pt_install_leaf_entry
/*
 * Install @table_pa as a next-level table pointer. Table entries are
 * always R|W; access control is applied only at the leaves (see
 * vtdss_pt_install_leaf_entry()). @attrs is unused here but required by
 * the pt_install_table signature. Returns false if pt_table_install64()
 * found the slot already populated - presumably by a racing thread;
 * confirm the exact contract in pt_defs.h.
 */
static inline bool vtdss_pt_install_table(struct pt_state *pts,
					  pt_oaddr_t table_pa,
					  const struct pt_write_attrs *attrs)
{
	u64 entry;

	entry = VTDSS_FMT_R | VTDSS_FMT_W |
		FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
	return pt_table_install64(pts, entry);
}
#define pt_install_table vtdss_pt_install_table
static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
struct pt_write_attrs *attrs)
{
attrs->descriptor_bits = pts->entry &
(VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
}
#define pt_attr_from_entry vtdss_pt_attr_from_entry
/*
 * Test the Dirty bit. Re-reads the PTE from memory (not the cached
 * pts->entry) so a D bit set after the entry was loaded is still seen.
 */
static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;

	return READ_ONCE(*tablep) & VTDSS_FMT_D;
}
#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty
/*
 * Clear the Dirty bit. NOTE(review): this is a non-atomic read-modify-
 * write, so it presumably relies on the generic layer to serialize
 * against concurrent PTE updates - confirm against the pt_defs.h
 * contract.
 */
static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;

	WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
}
#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean
/*
 * Atomically set the Dirty bit. Returns true on success; on failure the
 * entry changed under us and try_cmpxchg64() has refreshed pts->entry
 * with the current value so the caller can retry.
 */
static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 new = pts->entry | VTDSS_FMT_D;

	return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty
/*
 * Highest supported software bit number; the mapping from bit numbers
 * 0..10 to PTE bits is in vtdss_pt_sw_bit().
 */
static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
{
	return 10;
}
#define pt_max_sw_bit vtdss_pt_max_sw_bit
/*
 * Map software bit number @bitnr to a PTE bit that the specification
 * marks Ignored in every entry kind:
 *   bitnr 0     -> bit 10
 *   bitnr 1..9  -> bits 52..60
 *   bitnr 10    -> bit 63
 * An out-of-range constant @bitnr fails the build; a runtime value only
 * warns and returns 0.
 */
static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
{
	/* Bits marked Ignored in the specification */
	switch (bitnr) {
	case 0:
		return BIT(10);
	case 1 ... 9:
		return BIT_ULL((bitnr - 1) + 52);
	case 10:
		return BIT_ULL(63);
	/* Some of bits 9-3 are Ignored only in certain entry kinds, so
	 * they cannot be used as generic SW bits.
	 */
	default:
		if (__builtin_constant_p(bitnr))
			BUILD_BUG();
		else
			PT_WARN_ON(true);
		return 0;
	}
}
#define pt_sw_bit vtdss_pt_sw_bit
/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>
#define pt_iommu_table pt_iommu_vtdss
/*
 * The common struct is in the per-format common struct; these two helpers
 * convert between the generic pt_iommu handle and the format's pt_common.
 */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
	return &container_of(iommu_table, struct pt_iommu_table, iommu)
			->vtdss_pt.common;
}

/* Inverse of common_from_iommu() */
static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
	return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
			->iommu;
}
/*
 * Translate IOMMU_* protection flags into the descriptor bits written
 * into every leaf PTE. Returns -EINVAL for permission combinations this
 * format cannot express.
 */
static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
					  struct pt_write_attrs *attrs,
					  unsigned int iommu_prot)
{
	u64 pte = 0;

	/*
	 * VTDSS does not have a present bit, so we tell if any entry is present
	 * by checking for R or W.
	 */
	if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
		return -EINVAL;

	if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
	    !(iommu_prot & IOMMU_WRITE)) {
		pr_err_ratelimited(
			"Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	if (iommu_prot & IOMMU_READ)
		pte |= VTDSS_FMT_R;
	if (iommu_prot & IOMMU_WRITE)
		pte |= VTDSS_FMT_W;
	if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
		pte |= VTDSS_FMT_SNP;

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
/*
 * Choose the starting table depth from the HW's maximum IOVA width:
 * 31..39 bits -> 3 levels (top level 2), 40..48 -> 4 levels (3),
 * 49..57 -> 5 levels (4). Anything outside that range is unsupported.
 */
static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
					  const struct pt_iommu_vtdss_cfg *cfg)
{
	struct pt_vtdss *table = &iommu_table->vtdss_pt;
	unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2;
	unsigned int level;

	if (vasz_lg2 <= 30 || vasz_lg2 > PT_MAX_VA_ADDRESS_LG2)
		return -EOPNOTSUPP;

	if (vasz_lg2 > 48)
		level = 4;
	else if (vasz_lg2 > 39)
		level = 3;
	else
		level = 2;
	pt_top_set_level(&table->common, level);
	return 0;
}
#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
/*
 * Report the values the IOMMU driver needs to program this table into
 * HW: the physical address of the top table (SSPTPTR) and the address
 * width encoding.
 */
static inline void
vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
			   const struct pt_range *top_range,
			   struct pt_iommu_vtdss_hw_info *info)
{
	info->ssptptr = virt_to_phys(top_range->top_table);
	/* SSPTPTR must be 4K aligned and within the HW address width */
	PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);

	/*
	 * top_level = 2 = 3 level table aw=1
	 * top_level = 3 = 4 level table aw=2
	 * top_level = 4 = 5 level table aw=3
	 */
	info->aw = top_range->top_level - 1;
}
#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
#if defined(GENERIC_PT_KUNIT)
/* One kunit configuration per supported table depth: 3, 4 and 5 levels */
static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
	[0] = { .common.hw_max_vasz_lg2 = 39 },
	[1] = { .common.hw_max_vasz_lg2 = 48 },
	[2] = { .common.hw_max_vasz_lg2 = 57 },
};
#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
/* Also exercise the ERRATA_772415_SPR17 workaround path in the tests */
enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
#endif
#endif

View File

@ -157,6 +157,24 @@ enum {
PT_FEAT_AMDV1_FORCE_COHERENCE,
};
/* Intel VT-d Second Stage page table (format in fmt/vtdss.h) */
struct pt_vtdss {
	struct pt_common common;
};

enum {
	/*
	 * The PTEs are set to prevent cache incoherent traffic, such as PCI no
	 * snoop. This is set either at creation time or before the first map
	 * operation.
	 */
	PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START,
	/*
	 * Prevent creating read-only PTEs. Used to work around HW errata
	 * ERRATA_772415_SPR17.
	 */
	PT_FEAT_VTDSS_FORCE_WRITEABLE,
};

/* x86 64-bit page table */
struct pt_x86_64 {
	struct pt_common common;
};

View File

@ -262,6 +262,17 @@ IOMMU_FORMAT(amdv1, amdpt);
struct pt_iommu_amdv1_mock_hw_info;
IOMMU_PROTOTYPES(amdv1_mock);
/* VTDSS has no format-specific configuration beyond the common fields */
struct pt_iommu_vtdss_cfg {
	struct pt_iommu_cfg common;
};

/* Filled by the format's pt_iommu_fmt_hw_info() for the driver to program */
struct pt_iommu_vtdss_hw_info {
	/* Physical address of the top table, 4K aligned (SSPTPTR) */
	u64 ssptptr;
	/* Address width encoding: number of table levels minus 2 */
	u8 aw;
};
IOMMU_FORMAT(vtdss, vtdss_pt);

struct pt_iommu_x86_64_cfg {
	struct pt_iommu_cfg common;
};