* Avoid direct HLT instruction execution in TDX guests

Merge tag 'x86_tdx_for_6.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 TDX updates from Dave Hansen:
 "Avoid direct HLT instruction execution in TDX guests.

  TDX guests aren't expected to use the HLT instruction directly. It
  causes a virtualization exception (#VE). While the #VE _can_ be
  handled, the current handling is slow and buggy and the easiest thing
  is just to avoid HLT in the first place. Plus, the kernel already has
  paravirt infrastructure that makes it relatively painless.

  Make TDX guests require paravirt and add some TDX-specific paravirt
  handlers which avoid HLT in the normal halt routines. Also add a
  warning in case another HLT sneaks in.

  There was a report that this change yields a "major performance
  improvement" on SPECjbb2015, presumably because it eliminates the
  extra #VE overhead and the missed wakeups caused by the buggy HLT
  handling"

* tag 'x86_tdx_for_6.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/tdx: Emit warning if IRQs are enabled during HLT #VE handling
  x86/tdx: Fix arch_safe_halt() execution for TDX VMs
  x86/paravirt: Move halt paravirt calls under CONFIG_PARAVIRT
Merged by Linus Torvalds on 2025-04-02 11:33:20 -07:00 in commit 6cb094583a.

8 changed files with 78 additions and 40 deletions
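
The pull message describes routing all halt paths through the kernel's
paravirt infrastructure. As a rough illustration, here is a minimal,
self-contained C sketch (a userspace mock, not the kernel's actual
definitions; all *_sketch names are ours) of the pv_ops function-pointer
indirection involved: halt operations are reached through pointers that
default to the native routines, and TDX early-init code repoints them at
TDCALL-based handlers so a raw HLT, and hence a #VE, is never issued:

/*
 * Minimal userspace mock (not kernel code; all *_sketch names are
 * illustrative) of the pv_ops indirection described above: halt
 * operations go through function pointers with native defaults, and a
 * TDX guest repoints them at TDCALL-based handlers during early init.
 */
#include <stdio.h>

struct pv_irq_ops_sketch {
        void (*safe_halt)(void);        /* enable IRQs, then halt */
        void (*halt)(void);             /* halt with IRQs left as-is */
};

static void native_safe_halt_sketch(void) { puts("sti; hlt"); }
static void native_halt_sketch(void)      { puts("hlt"); }

/* TDX replacements: ask the TDX module to halt, never execute raw HLT. */
static void tdx_safe_halt_sketch(void)    { puts("TDCALL(HLT), then enable IRQs"); }
static void tdx_halt_sketch(void)         { puts("TDCALL(HLT), IRQs stay off"); }

static struct pv_irq_ops_sketch pv_irq = {
        .safe_halt = native_safe_halt_sketch,
        .halt      = native_halt_sketch,
};

int main(void)
{
        /* Roughly what tdx_early_init() does when running under TDX: */
        pv_irq.safe_halt = tdx_safe_halt_sketch;
        pv_irq.halt      = tdx_halt_sketch;

        pv_irq.safe_halt();     /* idle-loop path */
        pv_irq.halt();          /* IRQs-off / shutdown path */
        return 0;
}

The real override is the pv_ops.irq.safe_halt/pv_ops.irq.halt assignment
in tdx_early_init(), visible in the arch/x86/coco/tdx/tdx.c hunk below.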

arch/x86/Kconfig

@@ -889,6 +889,7 @@ config INTEL_TDX_GUEST
 	depends on X86_64 && CPU_SUP_INTEL
 	depends on X86_X2APIC
 	depends on EFI_STUB
+	depends on PARAVIRT
 	select ARCH_HAS_CC_PLATFORM
 	select X86_MEM_ENCRYPT
 	select X86_MCE

arch/x86/coco/tdx/tdx.c

@@ -14,6 +14,7 @@
 #include <asm/ia32.h>
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
 #include <asm/pgtable.h>
 #include <asm/set_memory.h>
 #include <asm/traps.h>
@@ -392,13 +393,21 @@ static int handle_halt(struct ve_info *ve)
 {
 	const bool irq_disabled = irqs_disabled();
 
+	/*
+	 * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a
+	 * wake event may be consumed before requesting HLT emulation, leaving
+	 * the vCPU blocking indefinitely.
+	 */
+	if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled"))
+		return -EIO;
+
 	if (__halt(irq_disabled))
 		return -EIO;
 
 	return ve_instr_len(ve);
 }
 
-void __cpuidle tdx_safe_halt(void)
+void __cpuidle tdx_halt(void)
 {
 	const bool irq_disabled = false;
 
@@ -409,6 +418,16 @@ void __cpuidle tdx_safe_halt(void)
 		WARN_ONCE(1, "HLT instruction emulation failed\n");
 }
 
+static void __cpuidle tdx_safe_halt(void)
+{
+	tdx_halt();
+	/*
+	 * "__cpuidle" section doesn't support instrumentation, so stick
+	 * with raw_* variant that avoids tracing hooks.
+	 */
+	raw_local_irq_enable();
+}
+
 static int read_msr(struct pt_regs *regs, struct ve_info *ve)
 {
 	struct tdx_module_args args = {
@@ -1109,6 +1128,19 @@ void __init tdx_early_init(void)
 	x86_platform.guest.enc_kexec_begin  = tdx_kexec_begin;
 	x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
 
+	/*
+	 * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
+	 * will enable interrupts before HLT TDCALL invocation if executed
+	 * in STI-shadow, possibly resulting in missed wakeup events.
+	 *
+	 * Modify all possible HLT execution paths to use TDX specific routines
+	 * that directly execute TDCALL and toggle the interrupt state as
+	 * needed after TDCALL completion. This also reduces HLT related #VEs
+	 * in addition to having a reliable halt logic execution.
+	 */
+	pv_ops.irq.safe_halt = tdx_safe_halt;
+	pv_ops.irq.halt = tdx_halt;
+
 	/*
 	 * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
 	 * bringup low level code. That raises #VE which cannot be handled
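
The comment added to tdx_early_init() above compresses a subtle race. For
reference, here is a sketch of the native path being bypassed (it mirrors
native_safe_halt() in arch/x86/include/asm/irqflags.h; the _sketch name
and the commentary are ours, not the kernel's):

/*
 * Native "safe halt": the STI shadow defers interrupt delivery for
 * exactly one instruction, so HLT is always reached before any wake
 * interrupt is taken, and that interrupt then terminates the HLT.
 */
static __always_inline void native_safe_halt_sketch(void)
{
        asm volatile("sti; hlt" ::: "memory");
}

/*
 * In a TDX guest the HLT above does not halt; it raises a #VE. Delivering
 * the exception ends the STI shadow, so an interrupt can be taken before
 * the #VE handler issues the HLT TDCALL. If that interrupt was the wake
 * event, the vCPU afterwards blocks in the TDCALL with nothing left to
 * wake it. tdx_safe_halt() avoids the #VE entirely: it performs the
 * TDCALL first and enables interrupts only after it returns.
 */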

arch/x86/include/asm/irqflags.h

@@ -76,6 +76,28 @@ static __always_inline void native_local_irq_restore(unsigned long flags)
 
 #endif
 
+#ifndef CONFIG_PARAVIRT
+#ifndef __ASSEMBLY__
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static __always_inline void arch_safe_halt(void)
+{
+	native_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static __always_inline void halt(void)
+{
+	native_halt();
+}
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/paravirt.h>
 #else
@@ -97,24 +119,6 @@ static __always_inline void arch_local_irq_enable(void)
 	native_irq_enable();
 }
 
-/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-static __always_inline void arch_safe_halt(void)
-{
-	native_safe_halt();
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-static __always_inline void halt(void)
-{
-	native_halt();
-}
-
 /*
  * For spinlocks, etc:
  */

arch/x86/include/asm/paravirt.h

@@ -102,6 +102,16 @@ static inline void notify_page_enc_status_changed(unsigned long pfn,
 	PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc);
 }
 
+static __always_inline void arch_safe_halt(void)
+{
+	PVOP_VCALL0(irq.safe_halt);
+}
+
+static inline void halt(void)
+{
+	PVOP_VCALL0(irq.halt);
+}
+
 #ifdef CONFIG_PARAVIRT_XXL
 static inline void load_sp0(unsigned long sp0)
 {
@@ -165,16 +175,6 @@ static inline void __write_cr4(unsigned long x)
 	PVOP_VCALL1(cpu.write_cr4, x);
 }
 
-static __always_inline void arch_safe_halt(void)
-{
-	PVOP_VCALL0(irq.safe_halt);
-}
-
-static inline void halt(void)
-{
-	PVOP_VCALL0(irq.halt);
-}
-
 static inline u64 paravirt_read_msr(unsigned msr)
 {
 	return PVOP_CALL1(u64, cpu.read_msr, msr);

arch/x86/include/asm/paravirt_types.h

@@ -120,10 +120,9 @@ struct pv_irq_ops {
 	struct paravirt_callee_save save_fl;
 	struct paravirt_callee_save irq_disable;
 	struct paravirt_callee_save irq_enable;
-
+#endif
 	void (*safe_halt)(void);
 	void (*halt)(void);
-#endif
 } __no_randomize_layout;
 
 struct pv_mmu_ops {

arch/x86/include/asm/tdx.h

@@ -58,7 +58,7 @@ void tdx_get_ve_info(struct ve_info *ve);
 
 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
 
-void tdx_safe_halt(void);
+void tdx_halt(void);
 
 bool tdx_early_handle_ve(struct pt_regs *regs);
@@ -72,7 +72,7 @@ void __init tdx_dump_td_ctls(u64 td_ctls);
 #else
 static inline void tdx_early_init(void) { };
-static inline void tdx_safe_halt(void) { };
+static inline void tdx_halt(void) { };
 
 static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }

arch/x86/kernel/paravirt.c

@@ -75,6 +75,11 @@ void paravirt_set_sched_clock(u64 (*func)(void))
 	static_call_update(pv_sched_clock, func);
 }
 
+static noinstr void pv_native_safe_halt(void)
+{
+	native_safe_halt();
+}
+
 #ifdef CONFIG_PARAVIRT_XXL
 static noinstr void pv_native_write_cr2(unsigned long val)
 {
@@ -100,11 +105,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
 {
 	native_set_debugreg(regno, val);
 }
 
-static noinstr void pv_native_safe_halt(void)
-{
-	native_safe_halt();
-}
-
 #endif
 
 struct pv_info pv_info = {
@@ -161,9 +161,11 @@ struct paravirt_patch_template pv_ops = {
 	.irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
 	.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
 	.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
+#endif /* CONFIG_PARAVIRT_XXL */
+
+	/* Irq HLT ops. */
 	.irq.safe_halt = pv_native_safe_halt,
 	.irq.halt = native_halt,
-#endif /* CONFIG_PARAVIRT_XXL */
 
 	/* Mmu ops. */
 	.mmu.flush_tlb_user = native_flush_tlb_local,

arch/x86/kernel/process.c

@@ -939,7 +939,7 @@ void __init select_idle_routine(void)
 		static_call_update(x86_idle, mwait_idle);
 	} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
 		pr_info("using TDX aware idle routine\n");
-		static_call_update(x86_idle, tdx_safe_halt);
+		static_call_update(x86_idle, tdx_halt);
 	} else {
 		static_call_update(x86_idle, default_idle);
 	}
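
For context on the select_idle_routine() hunk above: the x86 idle routine
is dispatched through a static call that is retargeted once at boot. Below
is a simplified, kernel-style sketch of that pattern using the generic
static_call API from include/linux/static_call.h (the *_sketch names are
ours, the kernel's actual x86_idle wiring differs in detail, and this is
not buildable outside the kernel tree):

#include <linux/static_call.h>
#include <linux/types.h>

static void default_idle_sketch(void)
{
        /* native safe-halt on bare metal */
}

static void tdx_halt_sketch(void)
{
        /* TDCALL-based halt, no raw HLT */
}

DEFINE_STATIC_CALL(x86_idle_sketch, default_idle_sketch);

static void select_idle_routine_sketch(bool tdx_guest)
{
        /* One-time retarget at boot, as in select_idle_routine() above. */
        if (tdx_guest)
                static_call_update(x86_idle_sketch, tdx_halt_sketch);
}

static void arch_cpu_idle_sketch(void)
{
        /* Hot path: a direct call patched in at update time. */
        static_call(x86_idle_sketch)();
}

static_call_update() patches the call site in place, so after boot the
idle loop performs a direct call with no indirect-branch overhead.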