diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig index 936c327f0661..2016c5e5ac0f 100644 --- a/drivers/iommu/generic_pt/.kunitconfig +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y CONFIG_DEBUG_GENERIC_PT=y CONFIG_IOMMU_PT=y CONFIG_IOMMU_PT_AMDV1=y +CONFIG_IOMMU_PT_X86_64=y CONFIG_IOMMU_PT_KUNIT_TEST=y CONFIG_IOMMUFD=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index 81652cd9c69f..6dcb771b3c58 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -42,10 +42,21 @@ config IOMMU_PT_AMDV1 Selected automatically by an IOMMU driver that uses this format. +config IOMMU_PT_X86_64 + tristate "IOMMU page table for x86 64-bit, 4/5 levels" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the x86 64-bit 4/5 level page table. + It supports 4K/2M/1G page sizes and can decode a sign-extended + portion of the 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + config IOMMU_PT_KUNIT_TEST tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS depends on KUNIT depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 + depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 default KUNIT_ALL_TESTS help Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index f0c22cf5f7be..5a3379107999 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -3,6 +3,8 @@ iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock +iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 + IOMMU_PT_KUNIT_TEST := define create_format obj-$(2) += iommu_$(1).o diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h new file mode 100644 index 000000000000..6f589e1f55d3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H +#define __GENERIC_PT_FMT_DEFS_X86_64_H + +#include +#include + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct x86_64_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs x86_64_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c new file mode 100644 index 000000000000..5c5960d871a3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT x86_64 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ + BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h new file mode 100644 index 000000000000..18d736d14b2d --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/x86_64.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * x86 page table. Supports the 4 and 5 level variations. + * + * The 4 and 5 level version is described in: + * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software + * Developer's Manual Volume 3 + * + * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization + * Technology for Directed I/O Architecture Specification" + * + * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O + * Virtualization Technology (IOMMU) Specification" + * + * It is used by x86 CPUs, AMD and VT-d IOMMU HW. + * + * Note the 3 level format is very similar and almost implemented here. The + * reserved/ignored layout is different and there are functional bit + * differences. + * + * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower + * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign + * extended addressing with this page table format. + * + * The named levels in the spec map to the pts->level as: + * Table/PTE - 0 + * Directory/PDE - 1 + * Directory Ptr/PDPTE - 2 + * PML4/PML4E - 3 + * PML5/PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_X86_64_H +#define __GENERIC_PT_FMT_X86_64_H + +#include "defs_x86_64.h" +#include "../pt_defs.h" + +#include +#include +#include +#include + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* + * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k + * aligned and is limited by the architected HAW + */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* Shared descriptor bits */ +enum { + X86_64_FMT_P = BIT(0), + X86_64_FMT_RW = BIT(1), + X86_64_FMT_U = BIT(2), + X86_64_FMT_A = BIT(5), + X86_64_FMT_D = BIT(6), + X86_64_FMT_OA = GENMASK_ULL(51, 12), + X86_64_FMT_XD = BIT_ULL(63), +}; + +/* PDPTE/PDE */ +enum { + X86_64_FMT_PS = BIT(7), +}; + +static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa x86_64_pt_table_pa + +static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa x86_64_pt_entry_oa + +static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf x86_64_pt_can_have_leaf + +static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 x86_64_pt_num_items_lg2 + +static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!(entry & X86_64_FMT_P)) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw x86_64_pt_load_entry_raw + +static inline void +x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = X86_64_FMT_P | + FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= X86_64_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry x86_64_pt_install_leaf_entry + +static inline bool x86_64_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table x86_64_pt_install_table + +static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + X86_64_FMT_D | X86_64_FMT_XD); +} +#define pt_attr_from_entry x86_64_pt_attr_from_entry + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_x86_64 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->x86_64_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, x86_64_pt.common) + ->iommu; +} + +static inline int x86_64_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte; + + pte = X86_64_FMT_U | X86_64_FMT_A | X86_64_FMT_D; + if (iommu_prot & IOMMU_WRITE) + pte |= X86_64_FMT_RW; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. + */ + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot x86_64_pt_iommu_set_prot + +static inline int +x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table, + const struct pt_iommu_x86_64_cfg *cfg) +{ + struct pt_x86_64 *table = &iommu_table->x86_64_pt; + + if (cfg->common.hw_max_vasz_lg2 < 31 || + cfg->common.hw_max_vasz_lg2 > 57) + return -EINVAL; + + /* Top of 2, 3, 4 */ + pt_top_set_level(&table->common, + (cfg->common.hw_max_vasz_lg2 - 31) / 9 + 2); + + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + return 0; +} +#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init + +static inline void +x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table, + const struct pt_range *top_range, + struct pt_iommu_x86_64_hw_info *info) +{ + info->gcr3_pt = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK); + info->levels = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = { + [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 48 }, + [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 57 }, + /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */ + [2] = { .common.hw_max_vasz_lg2 = 47 }, + [3] = { .common.hw_max_vasz_lg2 = 56 }, +}; +#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)}; +#endif +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 21e33489cbf2..96f8a6a7d60e 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -151,4 +151,17 @@ enum { PT_FEAT_AMDV1_FORCE_COHERENCE, }; +struct pt_x86_64 { + struct pt_common common; +}; + +enum { + /* + * The memory backing the tables is encrypted. Use __sme_set() to adjust + * the page table pointers in the tree. This only works with + * CONFIG_AMD_MEM_ENCRYPT. + */ + PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START, +}; + #endif diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index f2a763aba088..fde7ccf007c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -255,6 +255,17 @@ IOMMU_FORMAT(amdv1, amdpt); struct pt_iommu_amdv1_mock_hw_info; IOMMU_PROTOTYPES(amdv1_mock); +struct pt_iommu_x86_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_x86_64_hw_info { + u64 gcr3_pt; + u8 levels; +}; + +IOMMU_FORMAT(x86_64, x86_64_pt); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif