x86/mm/64: Make 5-level paging support unconditional

Both Intel and AMD CPUs support 5-level paging, which is expected to
become more widely adopted in the future. All major x86 Linux
distributions have the feature enabled.

Remove CONFIG_X86_5LEVEL and the related #ifdeffery to make the code
more readable.
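
With the option gone, the choice is made entirely at boot time: the
decompressor probes CPUID leaf 7 and honors the "no5lvl" command line
switch (see the configure_5level_paging() hunk below). As a minimal
illustration, here is a user-space sketch of the same CPUID probe (not
kernel code; it uses the GCC/clang <cpuid.h> helper):

/*
 * Standalone sketch of the LA57 probe: CPUID.(EAX=7,ECX=0):ECX[16].
 * Build with: cc -O2 -o la57-probe la57-probe.c
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 7 must exist and the LA57 bit (ECX bit 16) must be set. */
        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
            (ecx & (1u << 16)))
                printf("LA57 (5-level paging) is supported\n");
        else
                printf("LA57 not supported; the kernel folds the p4d level\n");
        return 0;
}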

Suggested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20250516123306.3812286-4-kirill.shutemov@linux.intel.com
commit 7212b58d6d
parent cba5d9b3e9
Kirill A. Shutemov, 2025-05-16 15:33:05 +03:00, committed by Ingo Molnar
19 changed files with 10 additions and 102 deletions


@@ -173,10 +173,10 @@ For example, when an old kernel is running on new hardware.
 The kernel disabled support for it at compile-time
 --------------------------------------------------
 
-For example, if 5-level-paging is not enabled when building (i.e.,
-CONFIG_X86_5LEVEL is not selected) the flag "la57" will not show up [#f1]_.
+For example, if Linear Address Masking (LAM) is not enabled when building (i.e.,
+CONFIG_ADDRESS_MASKING is not selected) the flag "lam" will not show up.
 Even though the feature will still be detected via CPUID, the kernel disables
-it by clearing via setup_clear_cpu_cap(X86_FEATURE_LA57).
+it by clearing via setup_clear_cpu_cap(X86_FEATURE_LAM).
 
 The feature is disabled at boot-time
 ------------------------------------
@@ -200,5 +200,3 @@ missing at runtime. For example, AVX flags will not show up if XSAVE feature
 is disabled since they depend on XSAVE feature. Another example would be broken
 CPUs and them missing microcode patches. Due to that, the kernel decides not to
 enable a feature.
-
-.. [#f1] 5-level paging uses linear address of 57 bits.
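
With CONFIG_X86_5LEVEL gone, whether "la57" appears in /proc/cpuinfo is
governed only by the hardware and the no5lvl switch. A user-space sketch
for checking the flag (illustrative only; assumes the usual "flags"-line
format of /proc/cpuinfo):

/* Check /proc/cpuinfo for the la57 flag. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/proc/cpuinfo", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "flags", 5) && strstr(line, " la57")) {
                        puts("la57 present: 5-level paging usable");
                        break;
                }
        }
        fclose(f);
        return 0;
}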


@@ -22,15 +22,6 @@ QEMU 2.9 and later support 5-level paging.
 Virtual memory layout for 5-level paging is described in
 Documentation/arch/x86/x86_64/mm.rst
 
-Enabling 5-level paging
-=======================
-
-CONFIG_X86_5LEVEL=y enables the feature.
-
-Kernel with CONFIG_X86_5LEVEL=y still able to boot on 4-level hardware.
-In this case additional page table level -- p4d -- will be folded at
-runtime.
-
 User-space and large virtual address space
 ==========================================
 On x86, 5-level paging enables 56-bit userspace virtual address space.


@@ -427,8 +427,7 @@ config DYNAMIC_PHYSICAL_MASK
 
 config PGTABLE_LEVELS
         int
-        default 5 if X86_5LEVEL
-        default 4 if X86_64
+        default 5 if X86_64
         default 3 if X86_PAE
         default 2
 
@@ -1464,25 +1463,6 @@ config X86_PAE
           has the cost of more pagetable lookup overhead, and also
           consumes more pagetable space per process.
 
-config X86_5LEVEL
-        bool "Enable 5-level page tables support"
-        default y
-        depends on X86_64
-        help
-          5-level paging enables access to larger address space:
-          up to 128 PiB of virtual address space and 4 PiB of
-          physical address space.
-
-          It will be supported by future Intel CPUs.
-
-          A kernel with the option enabled can be booted on machines that
-          support 4- or 5-level paging.
-
-          See Documentation/arch/x86/x86_64/5level-paging.rst for more
-          information.
-
-          Say N if unsure.
-
 config X86_DIRECT_GBPAGES
         def_bool y
         depends on X86_64


@@ -132,10 +132,6 @@ config X86_DISABLED_FEATURE_OSPKE
         def_bool y
         depends on !X86_INTEL_MEMORY_PROTECTION_KEYS
 
-config X86_DISABLED_FEATURE_LA57
-        def_bool y
-        depends on !X86_5LEVEL
-
 config X86_DISABLED_FEATURE_PTI
         def_bool y
         depends on !MITIGATION_PAGE_TABLE_ISOLATION


@@ -10,12 +10,10 @@
 #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
 #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
 
-#ifdef CONFIG_X86_5LEVEL
 /* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */
 unsigned int __section(".data") __pgtable_l5_enabled;
 unsigned int __section(".data") pgdir_shift = 39;
 unsigned int __section(".data") ptrs_per_p4d = 1;
-#endif
 
 /* Buffer to preserve trampoline memory */
 static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
@@ -114,18 +112,13 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
          * Check if LA57 is desired and supported.
          *
          * There are several parts to the check:
-         *   - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
          *   - if user asked to disable 5-level paging: no5lvl in cmdline
          *   - if the machine supports 5-level paging:
          *     + CPUID leaf 7 is supported
          *     + the leaf has the feature bit set
-         *
-         * That's substitute for boot_cpu_has() in early boot code.
          */
-        if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
-            !cmdline_find_option_bool("no5lvl") &&
-            native_cpuid_eax(0) >= 7 &&
-            (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
+        if (!cmdline_find_option_bool("no5lvl") &&
+            native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & BIT(16))) {
                 l5_required = true;
 
                 /* Initialize variables for 5-level paging */


@@ -361,12 +361,8 @@ xloadflags:
 #endif
 
 #ifdef CONFIG_X86_64
-#ifdef CONFIG_X86_5LEVEL
 #define XLF56 (XLF_5LEVEL|XLF_5LEVEL_ENABLED)
 #else
-#define XLF56 XLF_5LEVEL
-#endif
-#else
 #define XLF56 0
 #endif
 


@@ -16,9 +16,6 @@ extern unsigned int next_early_pgt;
 
 static inline bool check_la57_support(void)
 {
-        if (!IS_ENABLED(CONFIG_X86_5LEVEL))
-                return false;
-
         /*
          * 5-level paging is detected and enabled at kernel decompression
          * stage. Only check if it has been enabled there.
@@ -129,7 +126,7 @@ unsigned long __head __startup_64(unsigned long p2v_offset,
         pgd = rip_rel_ptr(early_top_pgt);
         pgd[pgd_index(__START_KERNEL_map)] += load_delta;
 
-        if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
+        if (la57) {
                 p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
                 p4d[MAX_PTRS_PER_P4D - 1] += load_delta;


@@ -341,9 +341,7 @@ void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
         pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
         set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
         p4d = p4d_offset(pgd, VSYSCALL_ADDR);
-#if CONFIG_PGTABLE_LEVELS >= 5
         set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
-#endif
         pud = pud_offset(p4d, VSYSCALL_ADDR);
         set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
         pmd = pmd_offset(pud, VSYSCALL_ADDR);


@@ -62,7 +62,6 @@ static inline void clear_page(void *page)
 void copy_page(void *to, void *from);
 KCFI_REFERENCE(copy_page);
 
-#ifdef CONFIG_X86_5LEVEL
 /*
  * User space process size. This is the first address outside the user range.
  * There are a few constraints that determine this:
@@ -93,7 +92,6 @@ static __always_inline unsigned long task_size_max(void)
         return ret;
 }
 
-#endif /* CONFIG_X86_5LEVEL */
 #endif /* !__ASSEMBLER__ */


@@ -48,14 +48,7 @@
 /* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */
 #define __PHYSICAL_MASK_SHIFT 52
 
-#ifdef CONFIG_X86_5LEVEL
 #define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
-/* See task_size_max() in <asm/page_64.h> */
-#else
-#define __VIRTUAL_MASK_SHIFT 47
-#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
-#endif
 
 #define TASK_SIZE_MAX task_size_max()
 #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
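
As a sanity check on the constants above: with 4 KiB pages the 47-bit
limit yields a 128 TiB user range and the 56-bit limit a 64 PiB one,
while DEFAULT_MAP_WINDOW stays at the 47-bit boundary either way. A
standalone arithmetic sketch (illustrative only, not kernel code):

/* Recompute the user address space limits from this hunk. */
#include <stdio.h>

int main(void)
{
        unsigned long page_size = 1UL << 12;    /* assume 4 KiB pages */

        printf("4-level TASK_SIZE_MAX: 0x%lx\n", (1UL << 47) - page_size);
        printf("5-level TASK_SIZE_MAX: 0x%lx\n", (1UL << 56) - page_size);
        printf("DEFAULT_MAP_WINDOW:    0x%lx\n", (1UL << 47) - page_size);
        return 0;
}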


@@ -41,11 +41,9 @@ static inline void sync_initial_page_table(void) { }
         pr_err("%s:%d: bad pud %p(%016lx)\n", \
                __FILE__, __LINE__, &(e), pud_val(e))
 
-#if CONFIG_PGTABLE_LEVELS >= 5
 #define p4d_ERROR(e) \
         pr_err("%s:%d: bad p4d %p(%016lx)\n", \
                __FILE__, __LINE__, &(e), p4d_val(e))
-#endif
 
 #define pgd_ERROR(e) \
         pr_err("%s:%d: bad pgd %p(%016lx)\n", \


@@ -23,7 +23,6 @@ typedef struct { pmdval_t pmd; } pmd_t;
 extern unsigned int __pgtable_l5_enabled;
 
-#ifdef CONFIG_X86_5LEVEL
 #ifdef USE_EARLY_PGTABLE_L5
 /*
  * cpu_feature_enabled() is not available in early boot code.
@@ -37,17 +36,11 @@ static inline bool pgtable_l5_enabled(void)
 #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
 #endif /* USE_EARLY_PGTABLE_L5 */
 
-#else
-#define pgtable_l5_enabled() 0
-#endif /* CONFIG_X86_5LEVEL */
-
 extern unsigned int pgdir_shift;
 extern unsigned int ptrs_per_p4d;
 
 #endif /* !__ASSEMBLER__ */
 
-#ifdef CONFIG_X86_5LEVEL
-
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
@@ -65,17 +58,6 @@ extern unsigned int ptrs_per_p4d;
 #define MAX_POSSIBLE_PHYSMEM_BITS 52
 
-#else /* CONFIG_X86_5LEVEL */
-
-/*
- * PGDIR_SHIFT determines what a top-level page table entry can map
- */
-#define PGDIR_SHIFT 39
-#define PTRS_PER_PGD 512
-#define MAX_PTRS_PER_P4D 1
-
-#endif /* CONFIG_X86_5LEVEL */
-
 /*
  * 3rd level page
  */
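
These definitions depend on runtime folding of the p4d level: pgdir_shift
and ptrs_per_p4d start at the 4-level values (39 and 1) and, when LA57 is
enabled, the early boot code raises them (to 48 and 512 in the kernel's
5-level setup). A standalone sketch of the index arithmetic, with
hypothetical helpers modeled on pgd_index()/p4d_index():

/*
 * Illustration only: with pgdir_shift == 39 and ptrs_per_p4d == 1,
 * demo_p4d_index() is always 0, so the p4d level disappears on
 * 4-level hardware without any #ifdefs.
 */
#include <stdio.h>

static unsigned int pgdir_shift = 39;   /* 48 when LA57 is enabled */
static unsigned int ptrs_per_p4d = 1;   /* 512 when LA57 is enabled */

static unsigned long demo_pgd_index(unsigned long addr)
{
        return (addr >> pgdir_shift) & 511;     /* PTRS_PER_PGD - 1 */
}

static unsigned long demo_p4d_index(unsigned long addr)
{
        return (addr >> 39) & (ptrs_per_p4d - 1);
}

int main(void)
{
        unsigned long addr = 0x7f0000000000UL;

        printf("pgd index %lu, p4d index %lu\n",
               demo_pgd_index(addr), demo_p4d_index(addr));
        return 0;
}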


@@ -590,7 +590,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
         DPRINTK(ALT, "alt table %px, -> %px", start, end);
 
         /*
-         * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
+         * KASAN_SHADOW_START is defined using
          * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
          * During the process, KASAN becomes confused seeing partial LA57
          * conversion and triggers a false-positive out-of-bound report.


@@ -51,13 +51,11 @@ unsigned int __initdata next_early_pgt;
 SYM_PIC_ALIAS(next_early_pgt);
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
-#ifdef CONFIG_X86_5LEVEL
 unsigned int __pgtable_l5_enabled __ro_after_init;
 unsigned int pgdir_shift __ro_after_init = 39;
 EXPORT_SYMBOL(pgdir_shift);
 unsigned int ptrs_per_p4d __ro_after_init = 1;
 EXPORT_SYMBOL(ptrs_per_p4d);
-#endif
 
 unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
 EXPORT_SYMBOL(page_offset_base);


@@ -649,13 +649,11 @@ SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
 SYM_DATA_END(init_top_pgt)
 #endif
 
-#ifdef CONFIG_X86_5LEVEL
 SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
         .fill 511,8,0
         .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 SYM_DATA_END(level4_kernel_pgt)
 SYM_PIC_ALIAS(level4_kernel_pgt)
-#endif
 
 SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
         .fill L3_START_KERNEL,8,0


@@ -174,11 +174,7 @@ __ref void *alloc_low_pages(unsigned int num)
  * randomization is enabled.
  */
-#ifndef CONFIG_X86_5LEVEL
-#define INIT_PGD_PAGE_TABLES 3
-#else
 #define INIT_PGD_PAGE_TABLES 4
-#endif
 
 #ifndef CONFIG_RANDOMIZE_MEMORY
 #define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)


@@ -592,7 +592,7 @@ void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
 }
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-#ifdef CONFIG_X86_5LEVEL
+#if CONFIG_PGTABLE_LEVELS > 4
 /**
  * p4d_set_huge - Set up kernel P4D mapping
  * @p4d: Pointer to the P4D entry


@@ -578,7 +578,6 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
         xen_mc_issue(XEN_LAZY_MMU);
 }
 
-#if CONFIG_PGTABLE_LEVELS >= 5
 __visible p4dval_t xen_p4d_val(p4d_t p4d)
 {
         return pte_mfn_to_pfn(p4d.p4d);
@@ -592,7 +591,6 @@ __visible p4d_t xen_make_p4d(p4dval_t p4d)
         return native_make_p4d(p4d);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
-#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
 
 static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
                          void (*func)(struct mm_struct *mm, struct page *,
@@ -2222,10 +2220,8 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = {
         .alloc_pud = xen_alloc_pmd_init,
         .release_pud = xen_release_pmd_init,
 
-#if CONFIG_PGTABLE_LEVELS >= 5
         .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
         .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
-#endif
 
         .enter_mmap = xen_enter_mmap,
         .exit_mmap = xen_exit_mmap,


@@ -62,7 +62,7 @@ efi_status_t efi_setup_5level_paging(void)
 
 void efi_5level_switch(void)
 {
-        bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
+        bool want_la57 = !efi_no5lvl;
         bool have_la57 = native_read_cr4() & X86_CR4_LA57;
         bool need_toggle = want_la57 ^ have_la57;
         u64 *pgt = (void *)la57_toggle + PAGE_SIZE;