From 22cc5ca5de52bbfc36a7d4a55323f91fb4492264 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 28 Feb 2025 01:44:14 +0000
Subject: [PATCH 1/3] x86/paravirt: Move halt paravirt calls under
 CONFIG_PARAVIRT

CONFIG_PARAVIRT_XXL is mainly defined/used by Xen PV guests. For other
VM guest types, the features supported under CONFIG_PARAVIRT are
self-sufficient. CONFIG_PARAVIRT mainly provides support for TLB flush
operations and time-related operations.

For a TDX guest as well, the paravirt calls under CONFIG_PARAVIRT meet
most of its requirements, except for the HLT and SAFE_HLT paravirt
calls, which are currently defined under CONFIG_PARAVIRT_XXL.

Since enabling CONFIG_PARAVIRT_XXL is too bloated for platforms like
TDX guests, move the HLT and SAFE_HLT paravirt calls under
CONFIG_PARAVIRT.

Moving the HLT and SAFE_HLT paravirt calls is not fatal and should not
break any functionality for current users of CONFIG_PARAVIRT.

Fixes: bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
Co-developed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Tested-by: Ryan Afranji <afranji@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: stable@kernel.org
Link: https://lore.kernel.org/r/20250228014416.3925664-2-vannapurve@google.com
---
 arch/x86/include/asm/irqflags.h       | 40 +++++++++++++++------------
 arch/x86/include/asm/paravirt.h       | 20 +++++++-------
 arch/x86/include/asm/paravirt_types.h |  3 +-
 arch/x86/kernel/paravirt.c            | 14 ++++++----
 4 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index abb8374c9ff7..9a9b21b78905 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -76,6 +76,28 @@ static __always_inline void native_local_irq_restore(unsigned long flags)
 
 #endif
 
+#ifndef CONFIG_PARAVIRT
+#ifndef __ASSEMBLY__
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static __always_inline void arch_safe_halt(void)
+{
+	native_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static __always_inline void halt(void)
+{
+	native_halt();
+}
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/paravirt.h>
 #else
@@ -97,24 +119,6 @@ static __always_inline void arch_local_irq_enable(void)
 	native_irq_enable();
 }
 
-/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-static __always_inline void arch_safe_halt(void)
-{
-	native_safe_halt();
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-static __always_inline void halt(void)
-{
-	native_halt();
-}
-
 /*
  * For spinlocks, etc:
  */
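For reference, the native helpers that the moved inlines call live in the
same header. At the time of this patch they look roughly like the sketch
below (quoted from memory; details such as the MDS buffer clearing may
differ by kernel version). The back-to-back "sti; hlt" pairing is the
point: STI takes effect only after the following instruction, so a pending
interrupt cannot fire between enabling IRQs and halting - it wakes the HLT
instead. This STI-shadow property is what patch 2 relies on.

	static __always_inline void native_safe_halt(void)
	{
		mds_idle_clear_cpu_buffers();
		asm volatile("sti; hlt" : : : "memory");
	}

	static __always_inline void native_halt(void)
	{
		mds_idle_clear_cpu_buffers();
		asm volatile("hlt" : : : "memory");
	}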
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index bed346bfac89..c4c23190925c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -102,6 +102,16 @@ static inline void notify_page_enc_status_changed(unsigned long pfn,
 	PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc);
 }
 
+static __always_inline void arch_safe_halt(void)
+{
+	PVOP_VCALL0(irq.safe_halt);
+}
+
+static inline void halt(void)
+{
+	PVOP_VCALL0(irq.halt);
+}
+
 #ifdef CONFIG_PARAVIRT_XXL
 static inline void load_sp0(unsigned long sp0)
 {
@@ -165,16 +175,6 @@ static inline void __write_cr4(unsigned long x)
 	PVOP_VCALL1(cpu.write_cr4, x);
 }
 
-static __always_inline void arch_safe_halt(void)
-{
-	PVOP_VCALL0(irq.safe_halt);
-}
-
-static inline void halt(void)
-{
-	PVOP_VCALL0(irq.halt);
-}
-
 static inline u64 paravirt_read_msr(unsigned msr)
 {
 	return PVOP_CALL1(u64, cpu.read_msr, msr);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 62912023b46f..631c306ce1ff 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -120,10 +120,9 @@ struct pv_irq_ops {
 	struct paravirt_callee_save save_fl;
 	struct paravirt_callee_save irq_disable;
 	struct paravirt_callee_save irq_enable;
-
+#endif
 	void (*safe_halt)(void);
 	void (*halt)(void);
-#endif
 } __no_randomize_layout;
 
 struct pv_mmu_ops {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 97925632c28e..1ccd05d8999f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -75,6 +75,11 @@ void paravirt_set_sched_clock(u64 (*func)(void))
 	static_call_update(pv_sched_clock, func);
 }
 
+static noinstr void pv_native_safe_halt(void)
+{
+	native_safe_halt();
+}
+
 #ifdef CONFIG_PARAVIRT_XXL
 static noinstr void pv_native_write_cr2(unsigned long val)
 {
@@ -100,11 +105,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
 {
 	native_set_debugreg(regno, val);
 }
-
-static noinstr void pv_native_safe_halt(void)
-{
-	native_safe_halt();
-}
 #endif
 
 struct pv_info pv_info = {
@@ -161,9 +161,11 @@ struct paravirt_patch_template pv_ops = {
 	.irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
 	.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
 	.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
+#endif /* CONFIG_PARAVIRT_XXL */
+
+	/* Irq HLT ops. */
 	.irq.safe_halt = pv_native_safe_halt,
 	.irq.halt = native_halt,
-#endif /* CONFIG_PARAVIRT_XXL */
 
 	/* Mmu ops. */
 	.mmu.flush_tlb_user = native_flush_tlb_local,
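Taken together, the hunks above make arch_safe_halt()/halt() dispatch
through pv_ops.irq on every CONFIG_PARAVIRT kernel, with the native
implementations as the defaults. The stand-alone user-space sketch below
is illustrative only - the kernel's real mechanism uses PVOP_VCALL0() and
runtime patching, not a plain function-pointer table - but it shows the
shape of the pattern and the boot-time override the next patch performs:

	/* Build: cc -o pv pv.c && ./pv */
	#include <stdio.h>

	struct pv_irq_ops {
		void (*safe_halt)(void);
		void (*halt)(void);
	};

	static void native_safe_halt(void) { puts("sti; hlt"); }
	static void native_halt(void)      { puts("hlt"); }

	/* Default table, analogous to pv_ops.irq above. */
	static struct pv_irq_ops pv_irq = {
		.safe_halt = native_safe_halt,
		.halt      = native_halt,
	};

	/* Indirect dispatch, analogous to PVOP_VCALL0(irq.safe_halt). */
	static void arch_safe_halt(void) { pv_irq.safe_halt(); }

	/* Hypothetical guest override, as tdx_early_init() does in patch 2. */
	static void tdx_safe_halt(void) { puts("tdcall(HLT); sti"); }

	int main(void)
	{
		arch_safe_halt();		/* native: sti; hlt */
		pv_irq.safe_halt = tdx_safe_halt;
		arch_safe_halt();		/* overridden: tdcall first */
		return 0;
	}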
From 9f98a4f4e7216dbe366010b4cdcab6b220f229c4 Mon Sep 17 00:00:00 2001
From: Vishal Annapurve <vannapurve@google.com>
Date: Fri, 28 Feb 2025 01:44:15 +0000
Subject: [PATCH 2/3] x86/tdx: Fix arch_safe_halt() execution for TDX VMs

Direct HLT instruction execution causes #VEs for TDX VMs, which are
routed to the hypervisor via TDCALL. If HLT is executed in the STI
shadow, the resulting #VE handler will enable interrupts before the
TDCALL is routed to the hypervisor, leading to missed wakeup events,
as the current TDX spec doesn't expose interruptibility state
information that would allow the #VE handler to selectively enable
interrupts.

Commit bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
prevented the idle routines from executing the HLT instruction in the
STI shadow. But it missed the paravirt routine, which can be reached
via this path, for example:

	kvm_wait() =>
	  safe_halt() =>
	    raw_safe_halt() =>
	      arch_safe_halt() =>
	        irq.safe_halt() =>
	          pv_native_safe_halt()

To reliably handle arch_safe_halt() for TDX VMs, introduce an explicit
dependency on CONFIG_PARAVIRT and override the paravirt
halt()/safe_halt() routines with TDX-safe versions that execute a
direct TDCALL and the needed interrupt flag updates. Executing a
direct TDCALL brings the additional benefit of avoiding HLT-related
#VEs altogether.

As tested by Ryan Afranji:

  "Tested with the specjbb2015 benchmark. It has heavy lock contention
   which leads to many halt calls. TDX VMs suffered a poor score before
   this patchset.

   Verified the major performance improvement with this patchset
   applied."

Fixes: bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Ryan Afranji <afranji@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250228014416.3925664-3-vannapurve@google.com
---
 arch/x86/Kconfig           |  1 +
 arch/x86/coco/tdx/tdx.c    | 26 +++++++++++++++++++++++++-
 arch/x86/include/asm/tdx.h |  4 ++--
 arch/x86/kernel/process.c  |  2 +-
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 05b4eca156cf..f614c0522a0b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -878,6 +878,7 @@ config INTEL_TDX_GUEST
 	depends on X86_64 && CPU_SUP_INTEL
 	depends on X86_X2APIC
 	depends on EFI_STUB
+	depends on PARAVIRT
 	select ARCH_HAS_CC_PLATFORM
 	select X86_MEM_ENCRYPT
 	select X86_MCE
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 7772b01ab738..aa0eb4057226 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -14,6 +14,7 @@
 #include <asm/ia32.h>
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
 #include <asm/pgtable.h>
 #include <asm/set_memory.h>
 #include <asm/traps.h>
@@ -398,7 +399,7 @@ static int handle_halt(struct ve_info *ve)
 	return ve_instr_len(ve);
 }
 
-void __cpuidle tdx_safe_halt(void)
+void __cpuidle tdx_halt(void)
 {
 	const bool irq_disabled = false;
 
@@ -409,6 +410,16 @@ void __cpuidle tdx_safe_halt(void)
 	WARN_ONCE(1, "HLT instruction emulation failed\n");
 }
 
+static void __cpuidle tdx_safe_halt(void)
+{
+	tdx_halt();
+	/*
+	 * "__cpuidle" section doesn't support instrumentation, so stick
+	 * with raw_* variant that avoids tracing hooks.
+	 */
+	raw_local_irq_enable();
+}
+
 static int read_msr(struct pt_regs *regs, struct ve_info *ve)
 {
 	struct tdx_module_args args = {
@@ -1109,6 +1120,19 @@ void __init tdx_early_init(void)
 	x86_platform.guest.enc_kexec_begin  = tdx_kexec_begin;
 	x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
 
+	/*
+	 * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
+	 * will enable interrupts before HLT TDCALL invocation if executed
+	 * in STI-shadow, possibly resulting in missed wakeup events.
+	 *
+	 * Modify all possible HLT execution paths to use TDX specific routines
+	 * that directly execute TDCALL and toggle the interrupt state as
+	 * needed after TDCALL completion. This also reduces HLT related #VEs
+	 * in addition to having a reliable halt logic execution.
+	 */
+	pv_ops.irq.safe_halt = tdx_safe_halt;
+	pv_ops.irq.halt = tdx_halt;
+
 	/*
 	 * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
 	 * bringup low level code. That raises #VE which cannot be handled
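The ordering constraint described in the comment above can be illustrated
with a small stand-alone model (toy user-space C, not kernel code; "IRQ
delivery" is reduced to a boolean flag). A wake event that is already
pending must still be pending when HLT emulation runs; enabling IRQs
first consumes it, and the vCPU then blocks with nothing left to wake it:

	#include <stdbool.h>
	#include <stdio.h>

	static bool irq_pending;

	static void enable_irqs(void)	/* models "sti": delivers pending IRQ */
	{
		if (irq_pending) {
			irq_pending = false;
			puts("  IRQ handler consumed the wake event");
		}
	}

	static void hlt_emulation(void)	/* models the HLT TDCALL */
	{
		if (irq_pending)
			puts("  halt woken immediately by pending IRQ");
		else
			puts("  halt blocks: no wake event left");
	}

	int main(void)
	{
		irq_pending = true;
		puts("#VE path (sti before TDCALL) - broken:");
		enable_irqs();
		hlt_emulation();

		irq_pending = true;
		puts("tdx_safe_halt() (TDCALL before sti) - correct:");
		hlt_emulation();
		enable_irqs();
		return 0;
	}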
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 65394aa9b49f..4a1922ec80cf 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -58,7 +58,7 @@ void tdx_get_ve_info(struct ve_info *ve);
 
 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
 
-void tdx_safe_halt(void);
+void tdx_halt(void);
 
 bool tdx_early_handle_ve(struct pt_regs *regs);
 
@@ -72,7 +72,7 @@ void __init tdx_dump_td_ctls(u64 td_ctls);
 #else
 
 static inline void tdx_early_init(void) { };
-static inline void tdx_safe_halt(void) { };
+static inline void tdx_halt(void) { };
 
 static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 91f6ff618852..962c3ce39323 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -939,7 +939,7 @@ void __init select_idle_routine(void)
 		static_call_update(x86_idle, mwait_idle);
 	} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
 		pr_info("using TDX aware idle routine\n");
-		static_call_update(x86_idle, tdx_safe_halt);
+		static_call_update(x86_idle, tdx_halt);
 	} else {
 		static_call_update(x86_idle, default_idle);
 	}

From e8f45927ee5d99fa52f14205a2c7ac3820c64457 Mon Sep 17 00:00:00 2001
From: Vishal Annapurve <vannapurve@google.com>
Date: Fri, 28 Feb 2025 01:44:16 +0000
Subject: [PATCH 3/3] x86/tdx: Emit warning if IRQs are enabled during HLT
 #VE handling

Direct HLT instruction execution causes #VEs for TDX VMs, which are
routed to the hypervisor via TDCALL. The safe_halt() routines execute
HLT in the STI shadow, so IRQs need to remain disabled until the
TDCALL to ensure that pending IRQs are correctly treated as wake
events.

Emit a warning and fail the emulation if IRQs are enabled during HLT
#VE handling, to avoid running into scenarios where IRQ wake events
are lost, resulting in indefinite HLT execution times.

Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Ryan Afranji <afranji@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://lore.kernel.org/r/20250228014416.3925664-4-vannapurve@google.com
---
 arch/x86/coco/tdx/tdx.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index aa0eb4057226..edab6d6049be 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -393,6 +393,14 @@ static int handle_halt(struct ve_info *ve)
 {
 	const bool irq_disabled = irqs_disabled();
 
+	/*
+	 * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a
+	 * wake event may be consumed before requesting HLT emulation, leaving
+	 * the vCPU blocking indefinitely.
+	 */
+	if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled"))
+		return -EIO;
+
 	if (__halt(irq_disabled))
 		return -EIO;
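The guard added above follows the kernel's WARN_ONCE() pattern: the
condition is evaluated and fails the emulation on every call, but the
warning itself is printed only once. The toy user-space model below
(illustrative only; the kernel's real WARN_ONCE() is considerably more
involved) shows that behavior:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define WARN_ONCE(cond, msg) ({				\
		static bool __warned;				\
		bool __c = (cond);				\
		if (__c && !__warned) {				\
			__warned = true;			\
			fprintf(stderr, "WARNING: %s\n", msg);	\
		}						\
		__c;						\
	})

	static int handle_halt(bool irq_disabled)
	{
		if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled"))
			return -EIO;
		return 0;	/* emulation would proceed via __halt() here */
	}

	int main(void)
	{
		printf("%d\n", handle_halt(false));	/* -5, warns */
		printf("%d\n", handle_halt(false));	/* -5, silent */
		printf("%d\n", handle_halt(true));	/*  0 */
		return 0;
	}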