KVM: arm64: nv: Honor SError exception routing / masking

To date KVM has used HCR_EL2.VSE to track the state of a pending SError
for the guest. With this bit set, hardware respects the EL1 exception
routing / masking rules and injects the vSError when appropriate.

This isn't correct for NV guests as hardware is oblivious to vEL2's
intentions for SErrors. Better yet, with FEAT_NV2 the guest can change
the routing behind our back as HCR_EL2 is redirected to memory. Cope
with this mess by:

 - Using a flag (instead of HCR_EL2.VSE) to track the pending SError
   state when SErrors are unconditionally masked for the current context

 - Resampling the routing / masking of a pending SError on every guest
   entry/exit

 - Emulating exception entry when SError routing implies a translation
   regime change

Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250708172532.1699409-7-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Author: Oliver Upton <oliver.upton@linux.dev>
Date:   2025-07-08 10:25:11 -07:00
parent 9aba641b9e
commit 77ee70a073
11 changed files with 144 additions and 38 deletions
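Before the per-file hunks, a condensed, annotated sketch of the routing decision this commit arrives at (names taken from the inject_fault.c hunk below; the comments are editorial, not from the commit):

int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr)
{
	/*
	 * Nested guest (vEL1/vEL0) and the guest hypervisor has set
	 * HCR_EL2.AMO: the SError routes to vEL2, so emulate the entry.
	 */
	if (is_nested_ctxt(vcpu) && kvm_serror_target_is_el2(vcpu))
		return kvm_inject_nested_serror(vcpu, esr);

	/*
	 * Running at vEL2 with TGE and AMO both clear: SErrors are masked
	 * there, so latch the state in software and resample later.
	 */
	if (vcpu_is_el2(vcpu) && kvm_serror_undeliverable_at_el2(vcpu)) {
		vcpu_set_vsesr(vcpu, esr);
		vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
		return 1;
	}

	/* Everything else: let hardware pend the vSError via HCR_EL2.VSE. */
	vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
	*vcpu_hcr(vcpu) |= HCR_VSE;
	return 1;
}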

View File

@ -45,7 +45,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
void kvm_skip_instr32(struct kvm_vcpu *vcpu);
void kvm_inject_undefined(struct kvm_vcpu *vcpu);
void kvm_inject_vabt(struct kvm_vcpu *vcpu);
int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr);
int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr);
void kvm_inject_size_fault(struct kvm_vcpu *vcpu);
@ -59,12 +59,25 @@ static inline int kvm_inject_sea_iabt(struct kvm_vcpu *vcpu, u64 addr)
return kvm_inject_sea(vcpu, true, addr);
}
static inline int kvm_inject_serror(struct kvm_vcpu *vcpu)
{
/*
* ESR_ELx.ISV (later renamed to IDS) indicates whether or not
* ESR_ELx.ISS contains IMPLEMENTATION DEFINED syndrome information.
*
* Set the bit when injecting an SError w/o an ESR to indicate ISS
* does not follow the architected format.
*/
return kvm_inject_serror_esr(vcpu, ESR_ELx_ISV);
}
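The split between the two helpers mirrors the call sites later in this commit: callers that have an architected syndrome pass it through, callers without one take the IMPDEF-syndrome variant. The guest.c hunk below, for instance, ends up with roughly:

	if (has_esr)
		ret = kvm_inject_serror_esr(vcpu, esr);	/* userspace-provided ISS */
	else
		ret = kvm_inject_serror(vcpu);		/* no syndrome: ISV/IDS set, ISS zero */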
void kvm_vcpu_wfi(struct kvm_vcpu *vcpu);
void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr);
int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr);
static inline void kvm_inject_nested_sve_trap(struct kvm_vcpu *vcpu)
{
@ -205,6 +218,11 @@ static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu)
return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_TGE;
}
static inline bool vcpu_el2_amo_is_set(const struct kvm_vcpu *vcpu)
{
return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_AMO;
}
static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu)
{
bool e2h, tge;

View File

@ -817,7 +817,7 @@ struct kvm_vcpu_arch {
u8 iflags;
/* State flags for kernel bookkeeping, unused by the hypervisor code */
u8 sflags;
u16 sflags;
/*
* Don't run the guest (internal implementation need).
@ -953,9 +953,21 @@ struct kvm_vcpu_arch {
__vcpu_flags_preempt_enable(); \
} while (0)
#define __vcpu_test_and_clear_flag(v, flagset, f, m) \
({ \
typeof(v->arch.flagset) set; \
\
set = __vcpu_get_flag(v, flagset, f, m); \
__vcpu_clear_flag(v, flagset, f, m); \
\
set; \
})
#define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__)
#define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__)
#define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__)
#define vcpu_test_and_clear_flag(v, ...) \
__vcpu_test_and_clear_flag((v), __VA_ARGS__)
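A usage sketch of the new wrapper, mirroring the nested flush/sync helpers added at the end of this commit:

	if (vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING))
		kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));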
/* KVM_ARM_VCPU_INIT completed */
#define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(0))
@ -1015,6 +1027,8 @@ struct kvm_vcpu_arch {
#define IN_WFI __vcpu_single_flag(sflags, BIT(6))
/* KVM is currently emulating a nested ERET */
#define IN_NESTED_ERET __vcpu_single_flag(sflags, BIT(7))
/* SError pending for nested guest */
#define NESTED_SERROR_PENDING __vcpu_single_flag(sflags, BIT(8))
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
@ -1387,8 +1401,6 @@ static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
return (vcpu_arch->steal.base != INVALID_GPA);
}
void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);

View File

@ -80,6 +80,8 @@ extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);
extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu);
extern void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu);
extern void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu);
struct kvm_s2_trans {
phys_addr_t output;

View File

@ -1188,6 +1188,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
preempt_disable();
kvm_nested_flush_hwstate(vcpu);
if (kvm_vcpu_has_pmu(vcpu))
kvm_pmu_flush_hwstate(vcpu);
@ -1287,6 +1289,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
/* Exit types that need handling before we can be preempted */
handle_exit_early(vcpu, ret);
kvm_nested_sync_hwstate(vcpu);
preempt_enable();
/*

View File

@ -2714,6 +2714,9 @@ static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
case except_type_irq:
kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ);
break;
case except_type_serror:
kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR);
break;
default:
WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type);
}
@ -2821,3 +2824,14 @@ int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
vcpu_write_sys_reg(vcpu, FAR_EL2, addr);
return kvm_inject_nested_sync(vcpu, esr);
}
int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr)
{
/*
* Hardware sets up the EC field when propagating ESR as a result of
* vSError injection. Manually populate EC for an emulated SError
* exception.
*/
esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR);
return kvm_inject_nested(vcpu, esr, except_type_serror);
}
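To make the comment concrete: EC occupies ESR_ELx[31:26] and ESR_ELx_EC_SERROR is 0x2f, so an emulated SError injected with an all-zero syndrome ends up with only the EC field populated (a worked value, not code from the commit):

	esr = 0;
	esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR);
	/* esr == 0x2f << 26 == 0xbc000000 */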

View File

@ -818,8 +818,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
events->exception.serror_pending = !!(vcpu->arch.hcr_el2 & HCR_VSE);
events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
events->exception.serror_pending = (vcpu->arch.hcr_el2 & HCR_VSE) ||
vcpu_get_flag(vcpu, NESTED_SERROR_PENDING);
if (events->exception.serror_pending && events->exception.serror_has_esr)
events->exception.serror_esr = vcpu_get_vsesr(vcpu);
@ -839,23 +840,29 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
bool ext_dabt_pending = events->exception.ext_dabt_pending;
u64 esr = events->exception.serror_esr;
int ret = 0;
if (serror_pending && has_esr) {
if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
return -EINVAL;
if (!((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK))
kvm_set_sei_esr(vcpu, events->exception.serror_esr);
else
return -EINVAL;
} else if (serror_pending) {
kvm_inject_vabt(vcpu);
}
if (ext_dabt_pending)
ret = kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
if (ret < 0)
return ret;
if (!serror_pending)
return 0;
if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && has_esr)
return -EINVAL;
if (has_esr && (esr & ~ESR_ELx_ISS_MASK))
return -EINVAL;
if (has_esr)
ret = kvm_inject_serror_esr(vcpu, esr);
else
ret = kvm_inject_serror(vcpu);
return (ret < 0) ? ret : 0;
}
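Seen from userspace, the rewritten path above corresponds to the usual KVM_SET_VCPU_EVENTS flow. A hedged sketch (vcpu_fd is assumed to be an open vCPU file descriptor; needs <sys/ioctl.h> and <linux/kvm.h>):

	struct kvm_vcpu_events events = {};

	events.exception.serror_pending = 1;
	events.exception.serror_has_esr = 1;	/* only accepted when the host has FEAT_RAS */
	events.exception.serror_esr = 0;	/* ISS bits only; anything outside ISS is -EINVAL */

	if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0)
		perror("KVM_SET_VCPU_EVENTS");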

View File

@ -32,7 +32,7 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *);
static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
{
if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr))
kvm_inject_vabt(vcpu);
kvm_inject_serror(vcpu);
}
static int handle_hvc(struct kvm_vcpu *vcpu)
@ -490,7 +490,7 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, disr_to_esr(disr));
} else {
kvm_inject_vabt(vcpu);
kvm_inject_serror(vcpu);
}
return;

View File

@ -347,9 +347,13 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq);
break;
case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR):
enter_exception64(vcpu, PSR_MODE_EL2h, except_type_serror);
break;
default:
/*
* Only EL1_SYNC and EL2_{SYNC,IRQ} makes
* Only EL1_SYNC and EL2_{SYNC,IRQ,SERR} makes
* sense so far. Everything else gets silently
* ignored.
*/

View File

@ -219,25 +219,30 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
inject_undef64(vcpu);
}
void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr)
static bool kvm_serror_target_is_el2(struct kvm_vcpu *vcpu)
{
vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
*vcpu_hcr(vcpu) |= HCR_VSE;
return is_hyp_ctxt(vcpu) || vcpu_el2_amo_is_set(vcpu);
}
/**
* kvm_inject_vabt - inject an async abort / SError into the guest
* @vcpu: The VCPU to receive the exception
*
* It is assumed that this code is called from the VCPU thread and that the
* VCPU therefore is not currently executing guest code.
*
* Systems with the RAS Extensions specify an imp-def ESR (ISV/IDS = 1) with
* the remaining ISS all-zeros so that this error is not interpreted as an
* uncategorized RAS error. Without the RAS Extensions we can't specify an ESR
* value, so the CPU generates an imp-def value.
*/
void kvm_inject_vabt(struct kvm_vcpu *vcpu)
static bool kvm_serror_undeliverable_at_el2(struct kvm_vcpu *vcpu)
{
kvm_set_sei_esr(vcpu, ESR_ELx_ISV);
return !(vcpu_el2_tge_is_set(vcpu) || vcpu_el2_amo_is_set(vcpu));
}
int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr)
{
lockdep_assert_held(&vcpu->mutex);
if (is_nested_ctxt(vcpu) && kvm_serror_target_is_el2(vcpu))
return kvm_inject_nested_serror(vcpu, esr);
if (vcpu_is_el2(vcpu) && kvm_serror_undeliverable_at_el2(vcpu)) {
vcpu_set_vsesr(vcpu, esr);
vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
return 1;
}
vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
*vcpu_hcr(vcpu) |= HCR_VSE;
return 1;
}
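A summary of the cases handled above for a vCPU with NV enabled (an editorial reading of the code, not text from the commit):

	/*
	 *   vCPU context         guest HCR_EL2         outcome
	 *   ------------------   -------------------   ------------------------------
	 *   nested (vEL1/vEL0)   AMO == 1              emulate SError entry to vEL2
	 *   vEL2                 TGE == 0 && AMO == 0  latch NESTED_SERROR_PENDING
	 *   anything else        -                     pend via HCR_EL2.VSE as before
	 */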

View File

@ -1808,7 +1808,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* There is no need to pass the error into the guest.
*/
if (kvm_handle_guest_sea())
kvm_inject_vabt(vcpu);
return kvm_inject_serror(vcpu);
return 1;
}

View File

@ -1782,3 +1782,43 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
kvm_inject_nested_irq(vcpu);
}
/*
* One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor
* can write to HCR_EL2 behind our back, potentially changing the exception
* routing / masking for even the host context.
*
* What follows is some slop to (1) react to exception routing / masking and (2)
* preserve the pending SError state across translation regimes.
*/
void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu)
{
if (!vcpu_has_nv(vcpu))
return;
if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
}
void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu)
{
unsigned long *hcr = vcpu_hcr(vcpu);
if (!vcpu_has_nv(vcpu))
return;
/*
* We previously decided that an SError was deliverable to the guest.
* Reap the pending state from HCR_EL2 and...
*/
if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr)))
vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
/*
* Re-attempt SError injection in case the deliverability has changed,
* which is necessary to faithfully emulate WFI in the case of a pending
* SError being a wakeup condition.
*/
if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
}
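Tying this back to the arm.c hunk earlier in the diff: the two hooks bracket every guest run so the routing / masking decision is re-evaluated on each entry and exit, roughly:

	preempt_disable();
	kvm_nested_flush_hwstate(vcpu);		/* re-inject a latched SError if now deliverable */

	/* ... enter the guest, run until the next exit ... */

	handle_exit_early(vcpu, ret);
	kvm_nested_sync_hwstate(vcpu);		/* reap HCR_EL2.VSE, re-evaluate on the next entry */
	preempt_enable();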