From b850841a53c56665c1f623edd429b3fc1578e9a4 Mon Sep 17 00:00:00 2001 From: Dmytro Maluka Date: Fri, 26 Sep 2025 15:51:39 +0200 Subject: [PATCH 001/260] KVM: x86/mmu: Skip MMIO SPTE invalidation if enable_mmio_caching=0 If MMIO caching is disabled, there are no MMIO SPTEs to invalidate, so the costly zapping of all pages is unnecessary even in the unlikely case when the MMIO generation number has wrapped. Signed-off-by: Dmytro Maluka Link: https://lore.kernel.org/r/20250926135139.1597781-1-dmaluka@chromium.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 667d66cf76d5..18d69d48bc55 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7364,6 +7364,9 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + if (!enable_mmio_caching) + return; + gen &= MMIO_SPTE_GEN_MASK; /* From 0152e049bd764942c41bc79b339ba6281c15dee9 Mon Sep 17 00:00:00 2001 From: Dmytro Maluka Date: Fri, 26 Sep 2025 17:57:24 +0200 Subject: [PATCH 002/260] KVM: VMX: Remove stale vmx_set_dr6() declaration Remove leftover after commit 80c64c7afea1 ("KVM: x86: Drop kvm_x86_ops.set_dr6() in favor of a new KVM_RUN flag") which removed vmx_set_dr6(). Signed-off-by: Dmytro Maluka Link: https://lore.kernel.org/r/20250926155724.1619716-1-dmaluka@chromium.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/x86_ops.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 9697368d65b3..77613a44cebf 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -73,7 +73,6 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); -void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); From f505c7b16fbeea25f82a7e8d578e128178dd6706 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 24 Sep 2025 07:54:21 -0700 Subject: [PATCH 003/260] KVM: nVMX: Use vcpu instead of vmx->vcpu when vcpu is available Prefer using vcpu directly when available, instead of accessing it through vmx->vcpu. 
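For context, struct vcpu_vmx embeds struct kvm_vcpu, and to_vmx() is the usual container_of() conversion; roughly, per vmx.h:

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

I.e. &vmx->vcpu and vcpu name the same object, so going through vmx is a needless round trip.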
Signed-off-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250924145421.2046822-1-xin@zytor.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 76271962cb70..3fca63a261f5 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -761,7 +761,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } @@ -780,7 +780,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } @@ -2749,7 +2749,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat); } vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( @@ -3880,7 +3880,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ - vmx->vcpu.arch.l1tf_flush_l1d = true; + vcpu->arch.l1tf_flush_l1d = true; /* * Must happen outside of nested_vmx_enter_non_root_mode() as it will From 9259607ec7100118cc5c608d97c9d406501e861e Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:11:39 +0200 Subject: [PATCH 004/260] KVM: Explicitly allocate/setup irqfd cleanup as per-CPU workqueue Explicitly request the use of per-CPU queues for the irqfd cleanup workqueue in preparation for changing the default behavior of alloc_workqueue() from per-CPU to unbound, which will in turn allow for the removal of WQ_UNBOUND. See commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") for details. No functional change intended. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/r/20250905091139.110677-2-marco.crivellari@suse.com [sean: rewrite changelog to tailor it to the KVM] Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a7794ffdb976..0e8b5277be3b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -707,7 +707,7 @@ bool kvm_notify_irqfd_resampler(struct kvm *kvm, */ int kvm_irqfd_init(void) { - irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); + irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", WQ_PERCPU, 0); if (!irqfd_cleanup_wq) return -ENOMEM; From 0bd0a4a1428baaf4447e95f0832492d9e3d64961 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 16 Sep 2025 23:31:29 +0200 Subject: [PATCH 005/260] KVM: TDX: Replace kmalloc + copy_from_user with memdup_user in tdx_td_init() Use get_user() to retrieve the number of entries instead of allocating memory for 'init_vm' with the maximum size, copying 'cmd->data' to it, only to then read the actual entry count 'cpuid.nent' from the copy. Use memdup_user() to allocate just enough memory to fit all entries and to copy 'cmd->data' from userspace. 
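As a general illustration of the idiom (hypothetical struct and helper, not the TDX code): struct_size(p, member, n) computes sizeof(*p) + n * sizeof(p->member[0]) with overflow checking, and memdup_user() allocates and copies exactly that many bytes from userspace in one step:

struct msg {
	__u32 nent;
	struct entry entries[];	/* flexible array, sized by nent */
};

static struct msg *dup_msg(struct msg __user *umsg, u32 nent)
{
	/* Copies the header plus exactly nent trailing entries. */
	return memdup_user(umsg, struct_size(umsg, entries, nent));
}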
Use struct_size() instead of manually calculating the number of bytes to allocate and copy. No functional changes intended. Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20250916213129.2535597-2-thorsten.blum@linux.dev [sean: s/user_init_vm/user_data] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 0a49c863c811..326db9b9c567 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2749,9 +2749,11 @@ static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) { + struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data); struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct kvm_tdx_init_vm *init_vm; struct td_params *td_params = NULL; + u32 nr_user_entries; int ret; BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); @@ -2763,28 +2765,16 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) if (cmd->flags) return -EINVAL; - init_vm = kmalloc(sizeof(*init_vm) + - sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, - GFP_KERNEL); - if (!init_vm) - return -ENOMEM; + if (get_user(nr_user_entries, &user_data->cpuid.nent)) + return -EFAULT; - if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { - ret = -EFAULT; - goto out; - } + if (nr_user_entries > KVM_MAX_CPUID_ENTRIES) + return -E2BIG; - if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { - ret = -E2BIG; - goto out; - } - - if (copy_from_user(init_vm->cpuid.entries, - u64_to_user_ptr(cmd->data) + sizeof(*init_vm), - flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { - ret = -EFAULT; - goto out; - } + init_vm = memdup_user(user_data, + struct_size(user_data, cpuid.entries, nr_user_entries)); + if (IS_ERR(init_vm)) + return PTR_ERR(init_vm); if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { ret = -EINVAL; From 93c9e107386dbe1243287a5b14ceca894de372b9 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 22 Sep 2025 09:29:22 -0700 Subject: [PATCH 006/260] KVM: SVM: Mark VMCB_PERM_MAP as dirty on nested VMRUN Mark the VMCB_PERM_MAP bit as dirty in nested_vmcb02_prepare_control() on every nested VMRUN. If L1 changes MSR interception (INTERCEPT_MSR_PROT) between two VMRUN instructions on the same L1 vCPU, the msrpm_base_pa in the associated vmcb02 will change, and the VMCB_PERM_MAP clean bit should be cleared. 
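For reference, VMCB clean bits tell the CPU which cached VMCB state it may reuse on VMRUN; marking a field dirty just clears the corresponding clean bit. The helper is roughly (per svm.h):

static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
{
	vmcb->control.clean &= ~(1 << bit);
}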
Fixes: 4bb170a5430b ("KVM: nSVM: do not mark all VMCB02 fields dirty on nested vmexit") Reported-by: Matteo Rizzo Cc: stable@vger.kernel.org Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20250922162935.621409-2-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a6443feab252..35cea27862c6 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -752,6 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP); /* * Stash vmcb02's counter if the guest hasn't moved past the guilty From 7c8b465a1c91f674655ea9cec5083744ec5f796a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 22 Sep 2025 09:29:23 -0700 Subject: [PATCH 007/260] KVM: SVM: Mark VMCB_NPT as dirty on nested VMRUN Mark the VMCB_NPT bit as dirty in nested_vmcb02_prepare_save() on every nested VMRUN. If L1 changes the PAT MSR between two VMRUN instructions on the same L1 vCPU, the g_pat field in the associated vmcb02 will change, and the VMCB_NPT clean bit should be cleared. Fixes: 4bb170a5430b ("KVM: nSVM: do not mark all VMCB02 fields dirty on nested vmexit") Cc: stable@vger.kernel.org Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20250922162935.621409-3-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 35cea27862c6..83de3456df70 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -613,6 +613,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 struct kvm_vcpu *vcpu = &svm->vcpu; nested_vmcb02_compute_g_pat(svm); + vmcb_mark_dirty(vmcb02, VMCB_NPT); /* Load the nested guest state */ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { From 04fd067b770d19fee39759d994c4bfa2fb332d9f Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 14 Oct 2025 16:28:02 +0100 Subject: [PATCH 008/260] KVM: Fix VM exit code for full dirty ring in API documentation While reading the documentation, I saw an exit code I could not grep for; it turns out the code has a slightly different name. Fix the name in the documentation so that it points to the right exit code. Signed-off-by: Leonardo Bras Link: https://lore.kernel.org/r/20251014152802.13563-1-leo.bras@arm.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6ae24c5ca559..3382adefc772 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8510,7 +8510,7 @@ Therefore, the ioctl must be called *before* reading the content of the dirty pages. The dirty ring can get full. When it happens, the KVM_RUN of the -vcpu will return with exit reason KVM_EXIT_DIRTY_LOG_FULL. +vcpu will return with exit reason KVM_EXIT_DIRTY_RING_FULL. 
The dirty ring interface has a major difference comparing to the KVM_GET_DIRTY_LOG interface in that, when reading the dirty ring from From 4793f990ea1523309a58d8fb5237b3a815e6f537 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 30 Sep 2025 17:14:07 -0700 Subject: [PATCH 009/260] KVM: x86: Advertise EferLmsleUnsupported to userspace CPUID.80000008H:EBX.EferLmsleUnsupported[bit 20] is a defeature bit. When this bit is clear, EFER.LMSLE is supported. When this bit is set, EFER.LMSLE is unsupported. KVM has never _emulated_ EFER.LMSLE, so KVM cannot truly support a 0-setting of this bit. However, KVM has allowed the guest to enable EFER.LMSLE in hardware since commit eec4b140c924 ("KVM: SVM: Allow EFER.LMSLE to be set with nested svm"), i.e. KVM partially virtualizes long-mode segment limits _if_ they are supported by the underlying hardware. Pass through the bit in KVM_GET_SUPPORTED_CPUID to advertise the unavailability of EFER.LMSLE to userspace based on the raw underlying hardware. Attempting to enable EFER.LMSLE on such CPUs simply doesn't work, e.g. immediately crashes on VMRUN. Signed-off-by: Jim Mattson Reviewed-by: Nikunj A Dadhania Reviewed-by: Yosry Ahmed Link: https://lore.kernel.org/r/20251001001529.1119031-2-jmattson@google.com [sean: add context about partial virtualization, use PASSTHROUGH_F] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4091a776e37a..6bdf868c8f8e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -338,6 +338,7 @@ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ #define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ +#define X86_FEATURE_EFER_LMSLE_MBZ (13*32+20) /* EFER.LMSLE must be zero */ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 52524e0ca97f..d563a948318b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1135,6 +1135,7 @@ void kvm_set_cpu_caps(void) F(AMD_STIBP), F(AMD_STIBP_ALWAYS_ON), F(AMD_IBRS_SAME_MODE), + PASSTHROUGH_F(EFER_LMSLE_MBZ), F(AMD_PSFD), F(AMD_IBPB_RET), ); From c53c632592a427bc01266a8ce7e2f17555a3c247 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 30 Sep 2025 17:14:08 -0700 Subject: [PATCH 010/260] KVM: SVM: Disallow EFER.LMSLE when not supported by hardware Modern AMD CPUs do not support segment limit checks in 64-bit mode (i.e. EFER.LMSLE must be zero). Do not allow a guest to set EFER.LMSLE on a CPU that requires the bit to be zero. For backwards compatibility, allow EFER.LMSLE to be set on CPUs that support segment limit checks in 64-bit mode, even though KVM's implementation of the feature is incomplete (e.g. KVM's emulator does not enforce segment limits in 64-bit mode). 
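For context, kvm_enable_efer_bits() simply removes bits from the set that KVM rejects in guest EFER writes; roughly (per x86.c):

void kvm_enable_efer_bits(u64 mask)
{
	efer_reserved_bits &= ~mask;
}

so the fix below amounts to not removing EFER_LMSLE from the reserved set when hardware requires the bit to be zero.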
Fixes: eec4b140c924 ("KVM: SVM: Allow EFER.LMSLE to be set with nested svm") Signed-off-by: Jim Mattson Reviewed-by: Nikunj A Dadhania Reviewed-by: Yosry Ahmed Link: https://lore.kernel.org/r/20251001001529.1119031-3-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 153c12dbf3eb..dadb562bd4b9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5320,7 +5320,9 @@ static __init int svm_hardware_setup(void) if (nested) { pr_info("Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + kvm_enable_efer_bits(EFER_SVME); + if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ)) + kvm_enable_efer_bits(EFER_LMSLE); r = nested_svm_init_msrpm_merge_offsets(); if (r) From f48888bb8ad109c772c9dcdfabbf749ab5ac5502 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:47 -0700 Subject: [PATCH 011/260] KVM: VMX: Hoist construct_eptp() "up" in vmx.c Move construct_eptp() further up in vmx.c so that it's above vmx_flush_tlb_current(), its "first" user in vmx.c. This will allow a future patch to opportunistically make construct_eptp() local to vmx.c. No functional change intended. Link: https://lore.kernel.org/r/20250919005955.1366256-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f87c216d976d..98506c723291 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3219,6 +3219,20 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->vpid; } +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) +{ + u64 eptp = VMX_EPTP_MT_WB; + + eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + + if (enable_ept_ad_bits && + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) + eptp |= VMX_EPTP_AD_ENABLE_BIT; + eptp |= root_hpa; + + return eptp; +} + void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -3396,20 +3410,6 @@ static int vmx_get_max_ept_level(void) return 4; } -u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) -{ - u64 eptp = VMX_EPTP_MT_WB; - - eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; - - if (enable_ept_ad_bits && - (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) - eptp |= VMX_EPTP_AD_ENABLE_BIT; - eptp |= root_hpa; - - return eptp; -} - void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct kvm *kvm = vcpu->kvm; From a8749281e4c63e582574ae4409be7641763e58ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:48 -0700 Subject: [PATCH 012/260] KVM: nVMX: Hardcode dummy EPTP used for early nested consistency checks Hardcode the dummy EPTP used for "early" consistency checks as there's no need to use 5-level EPT based on the guest.MAXPHYADDR (the EPTP just needs to be valid, it's never truly consumed). This will allow breaking construct_eptp()'s dependency on having access to the vCPU, which in turn will (much further in the future) allow for eliding per-root TLB flushes when a vCPU is migrated between pCPUs (a flush is needed if and only if that particular pCPU hasn't already flushed the vCPU's roots). 
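For reference, the EPTP encoding the dummy value must satisfy (field layout per the SDM, constants as in asm/vmx.h):

/* EPTP: bits 2:0 = EPT paging-structure memory type (6 = write-back),
 * bits 5:3 = page-walk length minus one, bit 6 = enable accessed/dirty
 * flags, bits MAXPHYADDR-1:12 = physical address of the root table. */
#define VMX_EPTP_MT_WB			0x6ull
#define VMX_EPTP_PWL_4			0x18ull	/* (4 - 1) << 3 */
#define VMX_EPTP_PWL_5			0x20ull	/* (5 - 1) << 3 */
#define VMX_EPTP_AD_ENABLE_BIT		(1ull << 6)

i.e. VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 with an all-zero address is structurally legal, which is all the early-check path needs.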
Link: https://lore.kernel.org/r/20250919005955.1366256-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 8 +++----- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3fca63a261f5..0a4b4e790f9f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2297,13 +2297,11 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmx->nested.vmcs02_initialized = true; /* - * We don't care what the EPTP value is we just need to guarantee - * it's valid so we don't get a false positive when doing early - * consistency checks. + * If early consistency checks are enabled, stuff the EPT Pointer with + * a dummy *legal* value to avoid false positives on bad control state. */ if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, - construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); + vmcs_write64(EPT_POINTER, VMX_EPTP_MT_WB | VMX_EPTP_PWL_4); if (vmx->ve_info) vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 98506c723291..6a1377015a2a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3219,7 +3219,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->vpid; } -u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) +static u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { u64 eptp = VMX_EPTP_MT_WB; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ea93121029f9..6cb04a6afeef 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -369,7 +369,6 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); From a10f5cc3ac9b05c764e87ae13de9a716ff519903 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:49 -0700 Subject: [PATCH 013/260] KVM: x86/mmu: Move "dummy root" helpers to spte.h Move the helpers to get/query a dummy root from mmu_internal.h to spte.h so that VMX can detect and handle dummy roots when constructing EPTPs. This will allow using the root's role to build the EPTP instead of pulling equivalent information out of the vCPU structure. No functional change intended. 
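For context, the consumer added in the next patch resolves a root HPA back to its kvm_mmu_page and must bail on the dummy root first; a rough sketch of root_to_sp() (per spte.h):

static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
{
	if (kvm_mmu_is_dummy_root(root))
		return NULL;

	/* A root may be a special (e.g. PAE) entry; treat it as a SPTE
	 * so that any non-PA bits are stripped before the lookup. */
	return spte_to_child_sp(root);
}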
Link: https://lore.kernel.org/r/20250919005955.1366256-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu_internal.h | 10 ---------- arch/x86/kvm/mmu/spte.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ed5c01df21ba..73cdcbccc89e 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -39,16 +39,6 @@ #define INVALID_PAE_ROOT 0 #define IS_VALID_PAE_ROOT(x) (!!(x)) -static inline hpa_t kvm_mmu_get_dummy_root(void) -{ - return my_zero_pfn(0) << PAGE_SHIFT; -} - -static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) -{ - return is_zero_pfn(shadow_page >> PAGE_SHIFT); -} - typedef u64 __rcu *tdp_ptep_t; struct kvm_mmu_page { diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 3133f066927e..91ce29fd6f1b 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -246,6 +246,16 @@ static inline int spte_index(u64 *sptep) */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +static inline hpa_t kvm_mmu_get_dummy_root(void) +{ + return my_zero_pfn(0) << PAGE_SHIFT; +} + +static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) +{ + return is_zero_pfn(shadow_page >> PAGE_SHIFT); +} + static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); From 2f723a86342355fee85574352a165e8bf6fa5372 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:50 -0700 Subject: [PATCH 014/260] KVM: VMX: Use kvm_mmu_page role to construct EPTP, not current vCPU state Use the role for the to-be-loaded/invalidated EPT root to compute the root's level and A/D enablement instead of pulling the information from the vCPU (e.g. by passing in the root level and querying vmcs12). Not making unnecessary assumptions about the root will allow invalidating arbitrary EPT roots (which sadly requires a full EPTP) at any given time. No functional change intended (the end result should be the same). Link: https://lore.kernel.org/r/20250919005955.1366256-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6a1377015a2a..1021d3b65ea0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3219,20 +3219,40 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->vpid; } -static u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) +static u64 construct_eptp(hpa_t root_hpa) { - u64 eptp = VMX_EPTP_MT_WB; + u64 eptp = root_hpa | VMX_EPTP_MT_WB; + struct kvm_mmu_page *root; - eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + if (kvm_mmu_is_dummy_root(root_hpa)) + return eptp | VMX_EPTP_PWL_4; - if (enable_ept_ad_bits && - (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) + /* + * EPT roots should always have an associated MMU page. Return a "bad" + * EPTP to induce VM-Fail instead of continuing on in an unknown state. + */ + root = root_to_sp(root_hpa); + if (WARN_ON_ONCE(!root)) + return INVALID_PAGE; + + eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + + if (enable_ept_ad_bits && !root->role.ad_disabled) eptp |= VMX_EPTP_AD_ENABLE_BIT; return eptp; } +static void vmx_flush_tlb_ept_root(hpa_t root_hpa) +{ + u64 eptp = construct_eptp(root_hpa); + + if (VALID_PAGE(eptp)) + ept_sync_context(eptp); + else + ept_sync_global(); +} + void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -3243,8 +3263,7 @@ void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) return; if (enable_ept) - ept_sync_context(construct_eptp(vcpu, root_hpa, - mmu->root_role.level)); + vmx_flush_tlb_ept_root(root_hpa); else vpid_sync_context(vmx_get_current_vpid(vcpu)); } @@ -3415,11 +3434,11 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; unsigned long guest_cr3; - u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, root_hpa, root_level); - vmcs_write64(EPT_POINTER, eptp); + KVM_MMU_WARN_ON(root_to_sp(root_hpa) && + root_level != root_to_sp(root_hpa)->role.level); + vmcs_write64(EPT_POINTER, construct_eptp(root_hpa)); hv_track_root_tdp(vcpu, root_hpa); From 15fe455dd1a011bbc8f9e512c6dc324cfca028c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:51 -0700 Subject: [PATCH 015/260] KVM: nVMX: Add consistency check for TPR_THRESHOLD[31:4]!=0 without VID Add a missing consistency check on the TPR Threshold. Per the SDM: If the "use TPR shadow" VM-execution control is 1 and the "virtual-interrupt delivery" VM-execution control is 0, bits 31:4 of the TPR threshold VM-execution control field must be 0. Note, nested_vmx_check_tpr_shadow_controls() bails early if "use TPR shadow" is 0. Link: https://lore.kernel.org/r/20250919005955.1366256-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0a4b4e790f9f..ffd2628b9c1e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -555,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) return -EINVAL; + if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4)) + return -EINVAL; + return 0; } From ae8e6ad84177456d8810a8ff3a5cf3f477fb0721 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:52 -0700 Subject: [PATCH 016/260] KVM: nVMX: Add consistency check for TSC_MULTIPLIER=0 Add a missing consistency check on the TSC Multiplier being '0'. Per the SDM: If the "use TSC scaling" VM-execution control is 1, the TSC-multiplier must not be zero. 
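For reference, CC() expands to KVM_NESTED_VMENTER_CONSISTENCY_CHECK(), which evaluates the condition and traces the stringified expression on failure, so each one-liner check is self-identifying in tracing; roughly, per KVM's x86 headers:

#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)		\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})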
Fixes: d041b5ea9335 ("KVM: nVMX: Enable nested TSC scaling") Link: https://lore.kernel.org/r/20250919005955.1366256-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ffd2628b9c1e..77b5f75cc2bb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2962,6 +2962,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, } } + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && + CC(!vmcs12->tsc_multiplier)) + return -EINVAL; + return 0; } From f91699d5692ddd0ee92b9487014fc477179ab3a7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:53 -0700 Subject: [PATCH 017/260] KVM: nVMX: Stuff vmcs02.TSC_MULTIPLIER early on for nested early checks If KVM is doing "early" nested VM-Enter consistency checks and TSC scaling is supported, stuff vmcs02's TSC Multiplier early on to avoid getting a false positive VM-Fail due to trying to do VM-Enter with TSC_MULTIPLIER=0. To minimize complexity around L1 vs. L2 TSC, KVM sets the actual TSC Multiplier rather late during VM-Entry, i.e. may have '0' at the time of early consistency checks. If vmcs12 has TSC Scaling enabled, use the multiplier from vmcs12 so that nested early checks actually check vmcs12 state, otherwise throw in an arbitrary value of '1' (anything non-zero is legal). Fixes: d041b5ea9335 ("KVM: nVMX: Enable nested TSC scaling") Link: https://lore.kernel.org/r/20250919005955.1366256-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 77b5f75cc2bb..efca276c67f0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2371,6 +2371,13 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, else vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); } + + if (kvm_caps.has_tsc_control && nested_early_check) { + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + vmcs_write64(TSC_MULTIPLIER, vmcs12->tsc_multiplier); + else + vmcs_write64(TSC_MULTIPLIER, 1); + } } static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, From a175da6d430ef7f8e24153e44c59ab6903e20f97 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:54 -0700 Subject: [PATCH 018/260] KVM: nVMX: Remove support for "early" consistency checks via hardware Remove nested_early_check and all associated code, as it's quite obviously not being used or tested (it's been broken for 4+ years without a single bug report). More importantly, KVM's software-based consistency checks have matured since the option to do hardware-based checks was added; KVM appears to be missing only _one_ consistency check, on vTPR. And even *more* importantly, that consistency check can't be prevented by an early hardware check due to L1 being able to modify the virtual APIC at any time, i.e. there's an inherent TOCTOU flaw that could cause KVM to "miss" a consistency check VM-Fail, regardless of whether the check is performed by software or by hardware. In other words, KVM _must_ be able to unwind from a late VM-Fail (which was a big motivation for doing early checks). I.e. now that KVM provides (almost) all necessary consistency checks, what's really needed is a way to detect missing checks in KVM, not a way to avoid having to unwind from a late VM-Fail. 
And that can be done much more simply, e.g. by a simple module param to guard a WARN (which, sadly, must be off-by-default to avoid splats due to the aforementioned TOCTOU issue). For all intents and purposes, this reverts commit 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W"). Link: https://lore.kernel.org/r/20250919005955.1366256-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 130 ++++---------------------------------- 1 file changed, 12 insertions(+), 118 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index efca276c67f0..24a2d0fa1660 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -23,9 +23,6 @@ static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); -static bool __read_mostly nested_early_check = 0; -module_param(nested_early_check, bool, S_IRUGO); - #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK /* @@ -2299,13 +2296,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) return; vmx->nested.vmcs02_initialized = true; - /* - * If early consistency checks are enabled, stuff the EPT Pointer with - * a dummy *legal* value to avoid false positives on bad control state. - */ - if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, VMX_EPTP_MT_WB | VMX_EPTP_PWL_4); - if (vmx->ve_info) vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); @@ -2371,13 +2361,6 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, else vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); } - - if (kvm_caps.has_tsc_control && nested_early_check) { - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) - vmcs_write64(TSC_MULTIPLIER, vmcs12->tsc_multiplier); - else - vmcs_write64(TSC_MULTIPLIER, 1); - } } static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, @@ -3345,84 +3328,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; - bool vm_fail; - - if (!nested_early_check) - return 0; - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - - preempt_disable(); - - vmx_prepare_switch_to_guest(vcpu); - - /* - * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, - * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to - * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. - * there is no need to preserve other bits or save/restore the field. 
- */ - vmcs_writel(GUEST_RFLAGS, 0); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - - cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { - vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->host_state.cr4 = cr4; - } - - vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - __vmx_vcpu_run_flags(vmx)); - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - - if (vm_fail) { - u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); - - preempt_enable(); - - trace_kvm_nested_vmenter_failed( - "early hardware check VM-instruction error: ", error); - WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } - - /* - * VMExit clears RFLAGS.IF and DR7, even on a consistency check. - */ - if (hw_breakpoint_active()) - set_debugreg(__this_cpu_read(cpu_dr7), 7); - local_irq_enable(); - preempt_enable(); - - /* - * A non-failing VMEntry means we somehow entered guest mode with - * an illegal RIP, and that's just the tip of the iceberg. There - * is no telling what memory has been modified or what state has - * been exposed to unknown code. Hitting this all but guarantees - * a (very critical) hardware issue. - */ - WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & - VMX_EXIT_REASONS_FAILED_VMENTRY)); - - return 0; -} - #ifdef CONFIG_KVM_HYPERV static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { @@ -3679,22 +3584,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, &vmx->nested.pre_vmenter_ssp_tbl); /* - * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* - * nested early checks are disabled. In the event of a "late" VM-Fail, - * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its - * software model to the pre-VMEntry host state. When EPT is disabled, - * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes - * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing - * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to - * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested - * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is - * guaranteed to be overwritten with a shadow CR3 prior to re-entering - * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as - * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks - * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail - * path would need to manually save/restore vmcs01.GUEST_CR3. + * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the + * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but + * not KVM, KVM must unwind its software model to the pre-VM-Entry host + * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not + * L1's "real" CR3, which causes nested_vmx_restore_host_state() to + * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the + * unwind naturally setting arch.cr3 to the correct value. Smashing + * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind, + * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be + * overwritten with a shadow CR3 prior to re-entering L1. 
*/ - if (!enable_ept && !nested_early_check) + if (!enable_ept) vmcs_writel(GUEST_CR3, vcpu->arch.cr3); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); @@ -3707,11 +3608,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_KVM_INTERNAL_ERROR; } - if (nested_vmx_check_vmentry_hw(vcpu)) { - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return NVMX_VMENTRY_VMFAIL; - } - if (nested_vmx_check_guest_state(vcpu, vmcs12, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; @@ -5176,12 +5072,10 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* * The only expected VM-instruction error is "VM entry with * invalid control field(s)." Anything else indicates a - * problem with L0. And we should never get here with a - * VMFail of any type if early consistency checks are enabled. + * problem with L0. */ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - WARN_ON_ONCE(nested_early_check); } /* From 1100e4910ad207bc00aedc8dfdb228dd1b81f310 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:59:55 -0700 Subject: [PATCH 019/260] KVM: nVMX: Add an off-by-default module param to WARN on missed consistency checks Add an off-by-default param, "warn_on_missed_cc", to have KVM WARN on a missed VMX Consistency Check on nested VM-Enter, specifically so that KVM developers and maintainers can more easily detect missing checks. KVM's goal/intent is that KVM detect *all* VM-Fail conditions in software, as relying on hardware leads to false passes when KVM's nested support is a subset of hardware support, e.g. see commit 095686e6fcb4 ("KVM: nVMX: Check vmcs12->guest_ia32_debugctl on nested VM-Enter"). With one notable exception, KVM now detects all VM-Fail scenarios for which there is known test coverage, i.e. KVM developers can enable the param and expect a clean run, and thus can use the param to detect missed checks, e.g. when enabling new features, when writing new tests, etc. The one exception is an unfortunate consistency check on vTPR. Because the vTPR for L2 comes from the virtual APIC page provided by L1, L2's vTPR is fully writable at all times, i.e. is inherently subject to TOCTOU issues with respect to checks in software versus consumption in hardware. Further complicating matters is KVM's deferred handling of vmcs12 pages when loading nested state; KVM flat out cannot check vTPR during KVM_SET_NESTED_STATE without breaking setups that do on-demand paging, e.g. for live migration and/or live update. To fudge around the vTPR issue, add a "late" controls check for vTPR and also treat an invalid virtual APIC as VM-Fail, but gate the check on warn_on_missed_cc being enabled to avoid unwanted false positives, i.e. to avoid breaking KVM in production. 
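Usage note, assuming kvm_intel is built as a module: developers run with the param enabled and treat any resulting WARN as a missed software check, e.g.:

modprobe kvm_intel warn_on_missed_cc=1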
Cc: Jim Mattson Link: https://lore.kernel.org/r/20250919005955.1366256-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 24a2d0fa1660..b0cd745518b4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -23,6 +23,9 @@ static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); +static bool __ro_after_init warn_on_missed_cc; +module_param(warn_on_missed_cc, bool, 0444); + #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK /* @@ -3073,6 +3076,38 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva; + u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0; + + /* + * Don't bother with the consistency checks if KVM isn't configured to + * WARN on missed consistency checks, as KVM needs to rely on hardware + * to fully detect an illegal vTPR vs. TPR Threshold combination due to + * the vTPR being writable by L1 at all times (it's an in-memory value, + * not a VMCS field). I.e. even if the check passes now, it might fail + * at the actual VM-Enter. + * + * Keying off the module param also allows treating an invalid vAPIC + * mapping as a consistency check failure without increasing the risk + * of breaking a "real" VM. + */ + if (!warn_on_missed_cc) + return 0; + + if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + (CC(!vapic) || + CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0))))) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -3608,6 +3643,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_KVM_INTERNAL_ERROR; } + if (nested_vmx_check_controls_late(vcpu, vmcs12)) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + return NVMX_VMENTRY_VMFAIL; + } + if (nested_vmx_check_guest_state(vcpu, vmcs12, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; @@ -5076,6 +5116,9 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, */ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + /* VM-Fail at VM-Entry means KVM missed a consistency check. */ + WARN_ON_ONCE(warn_on_missed_cc); } /* From 574ef752d4aea04134bc121294d717f4422c2755 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:01 +0530 Subject: [PATCH 020/260] KVM: SVM: Limit AVIC physical max index based on configured max_vcpu_ids KVM allows VMMs to specify the maximum possible APIC ID for a virtual machine through the KVM_CAP_MAX_VCPU_ID capability so as to limit data structures related to APIC/x2APIC. Utilize the same to set the AVIC physical max index in the VMCB, similar to VMX. This helps hardware limit the number of entries to be scanned in the physical APIC ID table, speeding up IPI broadcasts for virtual machines with a smaller number of vCPUs. 
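For context, a userspace sketch (hypothetical values) of capping the APIC ID space before creating vCPUs; note that KVM_CAP_MAX_VCPU_ID holds the maximum APIC ID plus one:

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MAX_VCPU_ID,
		.args = { 16 },		/* APIC IDs 0..15 */
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		err(1, "KVM_ENABLE_CAP(KVM_CAP_MAX_VCPU_ID)");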
Unlike VMX, SVM AVIC requires a single page to be allocated for the Physical APIC ID table and the Logical APIC ID table, so retain the existing approach of allocating those during VM init. Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/adb07ccdb3394cd79cb372ba6bcc69a4e4d4ef54.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f286b5706d7c..1a0573f5ff7c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -161,6 +161,7 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; + struct kvm *kvm = svm->vcpu.kvm; vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; @@ -176,7 +177,8 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) */ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { vmcb->control.int_ctl |= X2APIC_MODE_MASK; - vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + vmcb->control.avic_physical_id |= min(kvm->arch.max_vcpu_ids - 1, + X2AVIC_MAX_PHYSICAL_ID); /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { @@ -187,7 +189,8 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); /* For xAVIC and hybrid-xAVIC modes */ - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + vmcb->control.avic_physical_id |= min(kvm->arch.max_vcpu_ids - 1, + AVIC_MAX_PHYSICAL_ID); /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } From f2f6e67a56dc88fea7e9b10c4e79bb01d97386b7 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:02 +0530 Subject: [PATCH 021/260] KVM: SVM: Add a helper to look up the max physical ID for AVIC To help with a future change, add a helper to look up the maximum physical ID depending on the vCPU AVIC mode. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/0ab9bf5e20a3463a4aa3a5ea9bbbac66beedf1d1.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1a0573f5ff7c..96bad58ee3b4 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -158,13 +158,31 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, svm->x2avic_msrs_intercepted = intercept; } +static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) +{ + u32 arch_max; + + if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) + arch_max = X2AVIC_MAX_PHYSICAL_ID; + else + arch_max = AVIC_MAX_PHYSICAL_ID; + + /* + * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID + * plus one, so the max possible APIC ID is one less than that. 
+ */ + return min(vcpu->kvm->arch.max_vcpu_ids - 1, arch_max); +} + static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; - struct kvm *kvm = svm->vcpu.kvm; + struct kvm_vcpu *vcpu = &svm->vcpu; vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu); vmcb->control.int_ctl |= AVIC_ENABLE_MASK; @@ -177,8 +195,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) */ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { vmcb->control.int_ctl |= X2APIC_MODE_MASK; - vmcb->control.avic_physical_id |= min(kvm->arch.max_vcpu_ids - 1, - X2AVIC_MAX_PHYSICAL_ID); + /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { @@ -188,9 +205,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) */ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); - /* For xAVIC and hybrid-xAVIC modes */ - vmcb->control.avic_physical_id |= min(kvm->arch.max_vcpu_ids - 1, - AVIC_MAX_PHYSICAL_ID); /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } From 83f3cbcd3a9f62adfa52257e3b7ae6bf8af54baa Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:03 +0530 Subject: [PATCH 022/260] KVM: SVM: Replace hard-coded value 0x1FF with the corresponding macro The lower 9-bit field in EXITINFO2 represents an index into the AVIC Physical/Logical APIC ID table for a AVIC_INCOMPLETE_IPI #VMEXIT. Since the index into the Logical APIC ID table is just 8 bits, this field is actually bound by the bit-width of the index into the AVIC Physical ID table which is represented by AVIC_PHYSICAL_MAX_INDEX_MASK. So, use that macro to mask EXITINFO2.Index instead of hard coding 0x1FF in avic_incomplete_ipi_interception(). Co-developed-by: Suravee Suthikulpanit Signed-off-by: Suravee Suthikulpanit Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/95795f449c68bffcb3e1789ee2b0b7393711d37d.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 96bad58ee3b4..b31be39a118f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -579,7 +579,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0x1FF; + u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK; struct kvm_lapic *apic = vcpu->arch.apic; trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); From ca11d9d35e958d2b4020d1a360ddc277be3ee86c Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:04 +0530 Subject: [PATCH 023/260] KVM: SVM: Expand AVIC_PHYSICAL_MAX_INDEX_MASK to be a 12-bit field In the latest APM describing AVIC support for 4k vCPUs, VMCB AVIC_PHYSICAL_MAX_INDEX (Offset 0xF8) and EXITINFO2.Index are both updated from 9-bit wide to 12-bit wide fields unconditionally (i.e., regardless of AVIC support for 4k vCPUs). Expand AVIC_PHYSICAL_MAX_INDEX_MASK accordingly. 
While AVIC_PHYSICAL_MAX_INDEX_MASK is updated to a 12-bit field, KVM will limit the max vCPU/APIC ID based on the maximum supported on a specific processor and enforce that limit during vCPU creation. I.e., KVM doesn't need to rely on the mask to ensure that the max APIC ID being programmed in the VMCB is in range. The additional bits (11:9) were previously marked reserved and were never set/read by older processors. Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/a24ae953cea716bf9c56c136f7ca4bf5e97b1080.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 17f6c3fedeee..d227e710c6b4 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -279,7 +279,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; -#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) +#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(11, 0) /* * For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as From 54ffe74cc4ab2e7c6dd0a37a2298fffb642acba7 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:05 +0530 Subject: [PATCH 024/260] KVM: SVM: Move AVIC Physical ID table allocation to vcpu_precreate() With support for 4k vCPUs in x2AVIC, the size of the AVIC Physical ID table is expanded from a single 4k page to a maximum of 8 contiguous 4k pages. The actual number of pages allocated depends on the maximum possible APIC ID in the guest, which is only known by the time the first vCPU is created. In preparation for supporting a dynamic AVIC Physical ID table size, move its allocation to vcpu_precreate(). Suggested-by: Sean Christopherson Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/7dc764e0af7f01440bbac3d9215ed174027c2384.1757009416.git.naveen@kernel.org [sean: drop enable_apicv check from svm_vcpu_precreate()] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 21 +++++++++++++++++---- arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b31be39a118f..9866ef73501e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -264,6 +264,23 @@ int avic_ga_log_notifier(u32 ga_tag) return 0; } +int avic_alloc_physical_id_table(struct kvm *kvm) +{ + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_apicv) + return 0; + + if (kvm_svm->avic_physical_id_table) + return 0; + + kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_physical_id_table) + return -ENOMEM; + + return 0; +} + void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; @@ -291,10 +308,6 @@ int avic_vm_init(struct kvm *kvm) if (!enable_apicv) return 0; - kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!kvm_svm->avic_physical_id_table) - goto free_avic; - kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!kvm_svm->avic_logical_id_table) goto free_avic; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dadb562bd4b9..f14709a511aa 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1207,6 +1207,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) svm->vmcb = target_vmcb->ptr; } +static int svm_vcpu_precreate(struct kvm *kvm) +{ + return avic_alloc_physical_id_table(kvm); +} + static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; @@ -5016,6 +5021,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, + .vcpu_precreate = svm_vcpu_precreate, .vcpu_create = svm_vcpu_create, .vcpu_free = svm_vcpu_free, .vcpu_reset = svm_vcpu_reset, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e4b04f435b3d..6765a5e433ce 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -806,6 +806,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; bool __init avic_hardware_setup(void); int avic_ga_log_notifier(u32 ga_tag); +int avic_alloc_physical_id_table(struct kvm *kvm); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); From 5d0316e25defee47c8c7b1b6324244f630b6621f Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:06 +0530 Subject: [PATCH 025/260] x86/cpufeatures: Add X86_FEATURE_X2AVIC_EXT Add CPUID feature bit for x2AVIC extension that enables AMD SVM to support up to 4096 vCPUs in x2AVIC mode. The primary change is in the size of the AVIC Physical ID table, which can now go up to 8 contiguous 4k pages. The number of pages allocated for the table is determined by the maximum APIC ID for a guest. AVIC hardware is enhanced to look up Physical ID table entries for vCPUs > 512 for locating the target APIC backing page and the host APIC ID of the physical core on which the guest vCPU is running. Signed-off-by: Naveen N Rao (AMD) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/e5c9c471ab99a130bf9b728b77050ab308cf8624.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6bdf868c8f8e..7129eb44adad 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -500,6 +500,7 @@ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ +#define X86_FEATURE_X2AVIC_EXT (21*32+17) /* AMD SVM x2AVIC support for 4k vCPUs */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index caa4dc885c21..aa7f21f5f46b 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -49,6 +49,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 }, { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, From 940fc47cfb0d78b0f5db1f71dfce07c4711b9457 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:07 +0530 Subject: [PATCH 026/260] KVM: SVM: Add AVIC support for 4k vCPUs in x2AVIC mode 
4k vCPUs, the maximum supported physical ID in x2AVIC mode is 4095. Since this is no longer fixed, introduce a variable (x2avic_max_physical_id) to capture the maximum supported physical ID on the current platform and use that in place of the existing macro (X2AVIC_MAX_PHYSICAL_ID). With AVIC support for 4k vCPUs, the AVIC Physical ID table is no longer a single page and can occupy up to 8 contiguous 4k pages. Since AVIC hardware accesses of the physical ID table are limited by the physical max index programmed in the VMCB, it is sufficient to allocate only as many pages as are required to have a physical table entry for the max guest APIC ID. Since the guest APIC mode is not available at this point, provision for the maximum possible x2AVIC ID. For this purpose, add a variant of avic_get_max_physical_id() that works with a NULL vCPU pointer and returns the max x2AVIC ID. Wrap this in a new helper for obtaining the allocation order. To make it easy to identify support for 4k vCPUs in x2AVIC mode, update the message printed to the kernel log to print the maximum number of vCPUs supported. Do this on all platforms supporting x2AVIC since it is useful to know what is supported on a specific platform. Co-developed-by: Suravee Suthikulpanit Signed-off-by: Suravee Suthikulpanit Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/7fc5962f6da028f7dd3c79dbbd5c574fa02c99dd.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 3 +++ arch/x86/kvm/svm/avic.c | 50 ++++++++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index d227e710c6b4..e69b6d0dedcf 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -289,11 +289,14 @@ enum avic_ipi_failure_cause { /* * For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511). + * With X86_FEATURE_X2AVIC_EXT, the max index is increased to 0xfff (4095). */ #define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL +#define X2AVIC_4K_MAX_PHYSICAL_ID 0xFFFUL static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID); static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); +static_assert((X2AVIC_4K_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_4K_MAX_PHYSICAL_ID); #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 9866ef73501e..9c17f4269a71 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -106,7 +106,7 @@ static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); static bool x2avic_enabled; - +static u32 x2avic_max_physical_id; static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) @@ -158,12 +158,16 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, svm->x2avic_msrs_intercepted = intercept; } -static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) +static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu) { u32 arch_max; - if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) - arch_max = X2AVIC_MAX_PHYSICAL_ID; + /* + * Return the largest size (x2APIC) when querying without a vCPU, e.g. + * to allocate the per-VM table.. 
+ */ + if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic))) + arch_max = x2avic_max_physical_id; else arch_max = AVIC_MAX_PHYSICAL_ID; @@ -171,7 +175,12 @@ static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID * plus one, so the max possible APIC ID is one less than that. */ - return min(vcpu->kvm->arch.max_vcpu_ids - 1, arch_max); + return min(kvm->arch.max_vcpu_ids - 1, arch_max); +} + +static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) +{ + return __avic_get_max_physical_id(vcpu->kvm, vcpu); } static void avic_activate_vmcb(struct vcpu_svm *svm) @@ -264,6 +273,12 @@ int avic_ga_log_notifier(u32 ga_tag) return 0; } +static int avic_get_physical_id_table_order(struct kvm *kvm) +{ + /* Provision for the maximum physical ID supported in x2avic mode */ + return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64)); +} + int avic_alloc_physical_id_table(struct kvm *kvm) { struct kvm_svm *kvm_svm = to_kvm_svm(kvm); @@ -274,7 +289,8 @@ int avic_alloc_physical_id_table(struct kvm *kvm) if (kvm_svm->avic_physical_id_table) return 0; - kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, + avic_get_physical_id_table_order(kvm)); if (!kvm_svm->avic_physical_id_table) return -ENOMEM; @@ -290,7 +306,8 @@ void avic_vm_destroy(struct kvm *kvm) return; free_page((unsigned long)kvm_svm->avic_logical_id_table); - free_page((unsigned long)kvm_svm->avic_physical_id_table); + free_pages((unsigned long)kvm_svm->avic_physical_id_table, + avic_get_physical_id_table_order(kvm)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_del(&kvm_svm->hnode); @@ -372,7 +389,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) * fully initialized AVIC. */ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || - (id > X2AVIC_MAX_PHYSICAL_ID)) { + (id > x2avic_max_physical_id)) { kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); vcpu->arch.apic->apicv_active = false; return 0; @@ -992,7 +1009,8 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; - if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= + PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) return; /* @@ -1054,7 +1072,8 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) lockdep_assert_preemption_disabled(); - if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= + PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) return; /* @@ -1256,10 +1275,15 @@ bool __init avic_hardware_setup(void) /* AVIC is a prerequisite for x2AVIC. 
*/
 	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
-	if (x2avic_enabled)
-		pr_info("x2AVIC enabled\n");
-	else
+	if (x2avic_enabled) {
+		if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
+			x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
+		else
+			x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
+		pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
+	} else {
 		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
+	}
 
 	/*
 	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)

From e72753ed12670bdf599d5a07066c861c62d40ae8 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss
Date: Thu, 17 Apr 2025 14:43:59 +0200
Subject: [PATCH 027/260] KVM: s390: Use ESCA instead of BSCA at VM init

All modern IBM Z and LinuxONE machines offer support for the Extended
System Control Area (ESCA). The ESCA has been available since the
z114/z196, released in 2010.

KVM needs to allocate and manage the SCA for guest VMs. Prior to this
change the SCA was set up as a Basic SCA, supporting a maximum of 64
vCPUs, when initializing the VM. With the addition of the 65th vCPU the
SCA then had to be converted to an ESCA.

Instead of allocating a BSCA and upgrading it for PV or when adding the
65th vCPU, we can always allocate the ESCA directly upon VM creation,
simplifying the code in multiple places as well as completely removing
the need to convert an existing SCA.

In cases where the ESCA is not supported (z10 and earlier), the use of
the SCA entries, and with that SIGP interpretation, is disabled for VMs.
This increases the number of exits from the VM in multiprocessor
scenarios and thus decreases performance. The same is true for VSIE,
where SIGP is currently disabled and thus no SCA entries are used.

The only downside of the change is that we will always allocate 4 pages
for a 248-vCPU ESCA instead of a single page for the BSCA per VM. In
return we can delete a bunch of checks and special handling depending on
the SCA type, as well as the whole BSCA to ESCA conversion.

With that behavior change we are no longer referencing a bsca_block in
kvm->arch.sca; it will always be an esca_block instead. By declaring the
sca as a struct esca_block * we can simplify access to the SCA and get
rid of some helpers while making the code clearer.

KVM_MAX_VCPUS is also moved to kvm_host_types to allow using this in
future type definitions.
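
For illustration, with the conversion gone, VM init can simply allocate
the full ESCA up front, along the lines of the following (a minimal
sketch of the pattern used below, not the verbatim final code):

  /* Always allocate a full, zeroed ESCA at VM creation. */
  kvm->arch.sca = alloc_pages_exact(sizeof(struct esca_block),
                                    GFP_KERNEL_ACCOUNT | __GFP_ZERO);
  if (!kvm->arch.sca)
          goto out_err;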
Reviewed-by: Janosch Frank Signed-off-by: Christoph Schlameuss Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 5 +- arch/s390/kvm/gaccess.c | 10 +- arch/s390/kvm/interrupt.c | 78 +++++--------- arch/s390/kvm/kvm-s390.c | 170 +++++++------------------------ arch/s390/kvm/kvm-s390.h | 9 +- 5 files changed, 67 insertions(+), 205 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index c2ba3d4398c5..3cf14dd75409 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -631,9 +631,8 @@ struct kvm_s390_pv { struct mmu_notifier mmu_notifier; }; -struct kvm_arch{ - void *sca; - int use_esca; +struct kvm_arch { + struct esca_block *sca; rwlock_t sca_lock; debug_info_t *dbf; struct kvm_s390_float_interrupt float_int; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 21c2e61fece4..3651ab682fd7 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -113,7 +113,7 @@ int ipte_lock_held(struct kvm *kvm) int rc; read_lock(&kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + rc = kvm->arch.sca->ipte_control.kh != 0; read_unlock(&kvm->arch.sca_lock); return rc; } @@ -130,7 +130,7 @@ static void ipte_lock_simple(struct kvm *kvm) goto out; retry: read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.k) { @@ -155,7 +155,7 @@ static void ipte_unlock_simple(struct kvm *kvm) if (kvm->arch.ipte_lock_count) goto out; read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; @@ -173,7 +173,7 @@ static void ipte_lock_siif(struct kvm *kvm) retry: read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.kg) { @@ -193,7 +193,7 @@ static void ipte_unlock_siif(struct kvm *kvm) union ipte_control old, new, *ic; read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c62a868cf2b6..36394ba897f5 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -45,6 +45,8 @@ static struct kvm_s390_gib *gib; /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { + union esca_sigp_ctrl sigp_ctrl; + struct esca_block *sca; int c, scn; if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND)) @@ -52,21 +54,11 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl sigp_ctrl = - sca->cpu[vcpu->vcpu_id].sigp_ctrl; + sca = vcpu->kvm->arch.sca; + sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl; - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl sigp_ctrl = - sca->cpu[vcpu->vcpu_id].sigp_ctrl; - - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - } + c = sigp_ctrl.c; + scn = sigp_ctrl.scn; read_unlock(&vcpu->kvm->arch.sca_lock); if (src_id) @@ -77,37 +69,23 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { + union esca_sigp_ctrl old_val, new_val = {0}; 
+ union esca_sigp_ctrl *sigp_ctrl; + struct esca_block *sca; int expect, rc; BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl new_val = {0}, old_val; + sca = vcpu->kvm->arch.sca; + sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; - old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; - old_val.c = 0; + old_val = READ_ONCE(*sigp_ctrl); + new_val.scn = src_id; + new_val.c = 1; + old_val.c = 0; - expect = old_val.value; - rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl new_val = {0}, old_val; - - old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; - old_val.c = 0; - - expect = old_val.value; - rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - } + expect = old_val.value; + rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); read_unlock(&vcpu->kvm->arch.sca_lock); if (rc != expect) { @@ -120,23 +98,17 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { + union esca_sigp_ctrl *sigp_ctrl; + struct esca_block *sca; + if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); + sca = vcpu->kvm->arch.sca; + sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; - WRITE_ONCE(sigp_ctrl->value, 0); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - - WRITE_ONCE(sigp_ctrl->value, 0); - } + WRITE_ONCE(sigp_ctrl->value, 0); read_unlock(&vcpu->kvm->arch.sca_lock); } @@ -1224,7 +1196,7 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - if (!sclp.has_sigpif) + if (!kvm_s390_use_sca_entries()) return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); return sca_ext_call_pending(vcpu, NULL); @@ -1549,7 +1521,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) + if (kvm_s390_use_sca_entries() && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..78468b96d250 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -272,7 +272,6 @@ debug_info_t *kvm_s390_dbf_uv; /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); -static int sca_switch_to_extended(struct kvm *kvm); static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { @@ -632,11 +631,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: - r = KVM_S390_BSCA_CPU_SLOTS; + /* + * Return the same value for KVM_CAP_MAX_VCPUS and + * KVM_CAP_MAX_VCPU_ID to conform with the KVM 
API. + */ + r = KVM_S390_ESCA_CPU_SLOTS; if (!kvm_s390_use_sca_entries()) r = KVM_MAX_VCPUS; - else if (sclp.has_esca && sclp.has_64bscao) - r = KVM_S390_ESCA_CPU_SLOTS; if (ext == KVM_CAP_NR_VCPUS) r = min_t(unsigned int, num_online_cpus(), r); break; @@ -1931,13 +1932,11 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) * Updates the Multiprocessor Topology-Change-Report bit to signal * the guest with a topology change. * This is only relevant if the topology facility is present. - * - * The SCA version, bsca or esca, doesn't matter as offset is the same. */ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) { union sca_utility new, old; - struct bsca_block *sca; + struct esca_block *sca; read_lock(&kvm->arch.sca_lock); sca = kvm->arch.sca; @@ -1968,7 +1967,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm, return -ENXIO; read_lock(&kvm->arch.sca_lock); - topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + topo = kvm->arch.sca->utility.mtcr; read_unlock(&kvm->arch.sca_lock); return put_user(topo, (u8 __user *)attr->addr); @@ -2667,14 +2666,6 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (kvm_s390_pv_is_protected(kvm)) break; - /* - * FMT 4 SIE needs esca. As we never switch back to bsca from - * esca, we need no cleanup in the error cases below - */ - r = sca_switch_to_extended(kvm); - if (r) - break; - mmap_write_lock(kvm->mm); r = gmap_helper_disable_cow_sharing(); mmap_write_unlock(kvm->mm); @@ -3317,10 +3308,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm) static void sca_dispose(struct kvm *kvm) { - if (kvm->arch.use_esca) - free_pages_exact(kvm->arch.sca, sizeof(struct esca_block)); - else - free_page((unsigned long)(kvm->arch.sca)); + free_pages_exact(kvm->arch.sca, sizeof(*kvm->arch.sca)); kvm->arch.sca = NULL; } @@ -3334,10 +3322,9 @@ void kvm_arch_free_vm(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; - int i, rc; + gfp_t alloc_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO; char debug_name[16]; - static unsigned long sca_offset; + int i, rc; rc = -EINVAL; #ifdef CONFIG_KVM_S390_UCONTROL @@ -3359,17 +3346,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!sclp.has_64bscao) alloc_flags |= GFP_DMA; rwlock_init(&kvm->arch.sca_lock); - /* start with basic SCA */ - kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); + mutex_lock(&kvm_lock); + + kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags); + mutex_unlock(&kvm_lock); if (!kvm->arch.sca) goto out_err; - mutex_lock(&kvm_lock); - sca_offset += 16; - if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) - sca_offset = 0; - kvm->arch.sca = (struct bsca_block *) - ((char *) kvm->arch.sca + sca_offset); - mutex_unlock(&kvm_lock); sprintf(debug_name, "kvm-%u", current->pid); @@ -3548,27 +3530,25 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) static void sca_del_vcpu(struct kvm_vcpu *vcpu) { + struct esca_block *sca; + if (!kvm_s390_use_sca_entries()) return; read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; + sca = vcpu->kvm->arch.sca; - clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); - sca->cpu[vcpu->vcpu_id].sda = 0; - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - - clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); - sca->cpu[vcpu->vcpu_id].sda = 0; - } + 
clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); + sca->cpu[vcpu->vcpu_id].sda = 0; read_unlock(&vcpu->kvm->arch.sca_lock); } static void sca_add_vcpu(struct kvm_vcpu *vcpu) { + struct esca_block *sca; + phys_addr_t sca_phys; + if (!kvm_s390_use_sca_entries()) { - phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); + sca_phys = virt_to_phys(vcpu->kvm->arch.sca); /* we still need the basic sca for the ipte control */ vcpu->arch.sie_block->scaoh = sca_phys >> 32; @@ -3576,105 +3556,23 @@ static void sca_add_vcpu(struct kvm_vcpu *vcpu) return; } read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - phys_addr_t sca_phys = virt_to_phys(sca); + sca = vcpu->kvm->arch.sca; + sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; - vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - phys_addr_t sca_phys = virt_to_phys(sca); - - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys; - set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); - } + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; + set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); read_unlock(&vcpu->kvm->arch.sca_lock); } -/* Basic SCA to Extended SCA data copy routines */ -static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s) -{ - d->sda = s->sda; - d->sigp_ctrl.c = s->sigp_ctrl.c; - d->sigp_ctrl.scn = s->sigp_ctrl.scn; -} - -static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s) -{ - int i; - - d->ipte_control = s->ipte_control; - d->mcn[0] = s->mcn; - for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++) - sca_copy_entry(&d->cpu[i], &s->cpu[i]); -} - -static int sca_switch_to_extended(struct kvm *kvm) -{ - struct bsca_block *old_sca = kvm->arch.sca; - struct esca_block *new_sca; - struct kvm_vcpu *vcpu; - unsigned long vcpu_idx; - u32 scaol, scaoh; - phys_addr_t new_sca_phys; - - if (kvm->arch.use_esca) - return 0; - - new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!new_sca) - return -ENOMEM; - - new_sca_phys = virt_to_phys(new_sca); - scaoh = new_sca_phys >> 32; - scaol = new_sca_phys & ESCA_SCAOL_MASK; - - kvm_s390_vcpu_block_all(kvm); - write_lock(&kvm->arch.sca_lock); - - sca_copy_b_to_e(new_sca, old_sca); - - kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) { - vcpu->arch.sie_block->scaoh = scaoh; - vcpu->arch.sie_block->scaol = scaol; - vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - } - kvm->arch.sca = new_sca; - kvm->arch.use_esca = 1; - - write_unlock(&kvm->arch.sca_lock); - kvm_s390_vcpu_unblock_all(kvm); - - free_page((unsigned long)old_sca); - - VM_EVENT(kvm, 2, "Switched to ESCA (0x%p -> 0x%p)", - old_sca, kvm->arch.sca); - return 0; -} - static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) { - int rc; + if (!kvm_s390_use_sca_entries()) + return id < KVM_MAX_VCPUS; - if (!kvm_s390_use_sca_entries()) { - if (id < KVM_MAX_VCPUS) - return true; - return false; - } - if (id < KVM_S390_BSCA_CPU_SLOTS) - return true; - if (!sclp.has_esca || 
!sclp.has_64bscao)
-		return false;
-
-	rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
-
-	return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
+	return id < KVM_S390_ESCA_CPU_SLOTS;
 }
 
 /* needs disabled preemption to protect from TOD sync and vcpu_load/put */
@@ -3920,7 +3818,7 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
 		vcpu->arch.sie_block->eca |= ECA_IB;
 	if (sclp.has_siif)
 		vcpu->arch.sie_block->eca |= ECA_SII;
-	if (sclp.has_sigpif)
+	if (kvm_s390_use_sca_entries())
 		vcpu->arch.sie_block->eca |= ECA_SIGPI;
 	if (test_kvm_facility(vcpu->kvm, 129)) {
 		vcpu->arch.sie_block->eca |= ECA_VX;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index c44fe0c3a097..65c950760993 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -570,13 +570,6 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
 
-/* support for Basic/Extended SCA handling */
-static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
-{
-	struct bsca_block *sca = kvm->arch.sca; /* SCA version doesn't matter */
-
-	return &sca->ipte_control;
-}
 static inline int kvm_s390_use_sca_entries(void)
 {
 	/*
@@ -584,7 +577,7 @@ static inline int kvm_s390_use_sca_entries(void)
 	 * might use the entries. By not setting the entries and keeping them
 	 * invalid, hardware will not access them but intercept.
 	 */
-	return sclp.has_sigpif;
+	return sclp.has_sigpif && sclp.has_esca;
 }
 void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, struct mcck_volatile_info *mcck_info);

From 14542a0a54c5c84eebd9255e26cce9b1c15d9571 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss
Date: Thu, 5 Jun 2025 18:14:05 +0200
Subject: [PATCH 028/260] KVM: S390: Remove sca_lock

Since we are no longer switching from a BSCA to an ESCA, we can
completely get rid of the sca_lock. The write lock was only taken for
that conversion.

After removal of the lock some local code cleanups are possible.
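
As a minimal sketch of the result, taking the IPTE lock reduces to a
plain try_cmpxchg() loop on the SCA, which is allocated once and never
reallocated for the lifetime of the VM:

  union ipte_control old, new, *ic;

  /* No sca_lock needed: kvm->arch.sca never changes after VM init. */
  ic = &kvm->arch.sca->ipte_control;
  old = READ_ONCE(*ic);
  do {
          new = old;
          new.k = 1;
  } while (!try_cmpxchg(&ic->val, &old.val, new.val));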
Signed-off-by: Christoph Schlameuss Suggested-by: Janosch Frank [frankja@linux.ibm.com: Added suggested-by tag as discussed on list] Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kvm/gaccess.c | 19 ++--------------- arch/s390/kvm/interrupt.c | 36 ++++++++------------------------ arch/s390/kvm/kvm-s390.c | 34 ++++++++---------------------- 4 files changed, 20 insertions(+), 70 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3cf14dd75409..22cedcaea475 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -633,7 +633,6 @@ struct kvm_s390_pv { struct kvm_arch { struct esca_block *sca; - rwlock_t sca_lock; debug_info_t *dbf; struct kvm_s390_float_interrupt float_int; struct kvm_device *flic; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 3651ab682fd7..41ca6b0ee7a9 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -109,14 +109,9 @@ struct aste { int ipte_lock_held(struct kvm *kvm) { - if (sclp.has_siif) { - int rc; + if (sclp.has_siif) + return kvm->arch.sca->ipte_control.kh != 0; - read_lock(&kvm->arch.sca_lock); - rc = kvm->arch.sca->ipte_control.kh != 0; - read_unlock(&kvm->arch.sca_lock); - return rc; - } return kvm->arch.ipte_lock_count != 0; } @@ -129,19 +124,16 @@ static void ipte_lock_simple(struct kvm *kvm) if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&kvm->arch.sca_lock); ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.k) { - read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); out: mutex_unlock(&kvm->arch.ipte_mutex); } @@ -154,14 +146,12 @@ static void ipte_unlock_simple(struct kvm *kvm) kvm->arch.ipte_lock_count--; if (kvm->arch.ipte_lock_count) goto out; - read_lock(&kvm->arch.sca_lock); ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; new.k = 0; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); wake_up(&kvm->arch.ipte_wq); out: mutex_unlock(&kvm->arch.ipte_mutex); @@ -172,12 +162,10 @@ static void ipte_lock_siif(struct kvm *kvm) union ipte_control old, new, *ic; retry: - read_lock(&kvm->arch.sca_lock); ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.kg) { - read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -185,14 +173,12 @@ static void ipte_lock_siif(struct kvm *kvm) new.k = 1; new.kh++; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); } static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&kvm->arch.sca_lock); ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { @@ -201,7 +187,6 @@ static void ipte_unlock_siif(struct kvm *kvm) if (!new.kh) new.k = 0; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); if (!new.kh) wake_up(&kvm->arch.ipte_wq); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 36394ba897f5..220d9d00c23d 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -45,48 +45,34 @@ static struct kvm_s390_gib *gib; /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { - union esca_sigp_ctrl sigp_ctrl; - struct esca_block *sca; - int c, scn; + struct esca_block *sca = vcpu->kvm->arch.sca; + union 
esca_sigp_ctrl sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl; if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND)) return 0; BUG_ON(!kvm_s390_use_sca_entries()); - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; - sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl; - - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - read_unlock(&vcpu->kvm->arch.sca_lock); if (src_id) - *src_id = scn; + *src_id = sigp_ctrl.scn; - return c; + return sigp_ctrl.c; } static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { - union esca_sigp_ctrl old_val, new_val = {0}; - union esca_sigp_ctrl *sigp_ctrl; - struct esca_block *sca; + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; + union esca_sigp_ctrl old_val, new_val = {.scn = src_id, .c = 1}; int expect, rc; BUG_ON(!kvm_s390_use_sca_entries()); - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; - sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; old_val.c = 0; expect = old_val.value; rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - read_unlock(&vcpu->kvm->arch.sca_lock); if (rc != expect) { /* another external call is pending */ @@ -98,18 +84,14 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { - union esca_sigp_ctrl *sigp_ctrl; - struct esca_block *sca; + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; - sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; WRITE_ONCE(sigp_ctrl->value, 0); - read_unlock(&vcpu->kvm->arch.sca_lock); } int psw_extint_disabled(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 78468b96d250..769820e3a243 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1938,14 +1938,12 @@ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) union sca_utility new, old; struct esca_block *sca; - read_lock(&kvm->arch.sca_lock); sca = kvm->arch.sca; old = READ_ONCE(sca->utility); do { new = old; new.mtcr = val; } while (!try_cmpxchg(&sca->utility.val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); } static int kvm_s390_set_topo_change_indication(struct kvm *kvm, @@ -1966,9 +1964,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm, if (!test_kvm_facility(kvm, 11)) return -ENXIO; - read_lock(&kvm->arch.sca_lock); topo = kvm->arch.sca->utility.mtcr; - read_unlock(&kvm->arch.sca_lock); return put_user(topo, (u8 __user *)attr->addr); } @@ -3345,7 +3341,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!sclp.has_64bscao) alloc_flags |= GFP_DMA; - rwlock_init(&kvm->arch.sca_lock); mutex_lock(&kvm_lock); kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags); @@ -3530,41 +3525,30 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) static void sca_del_vcpu(struct kvm_vcpu *vcpu) { - struct esca_block *sca; + struct esca_block *sca = vcpu->kvm->arch.sca; if (!kvm_s390_use_sca_entries()) return; - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); sca->cpu[vcpu->vcpu_id].sda = 0; - read_unlock(&vcpu->kvm->arch.sca_lock); } static void 
sca_add_vcpu(struct kvm_vcpu *vcpu) { - struct esca_block *sca; - phys_addr_t sca_phys; + struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - if (!kvm_s390_use_sca_entries()) { - sca_phys = virt_to_phys(vcpu->kvm->arch.sca); - - /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys; - return; - } - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; - sca_phys = virt_to_phys(sca); - - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + /* we still need the sca header for the ipte control */ vcpu->arch.sie_block->scaoh = sca_phys >> 32; vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; + + if (!kvm_s390_use_sca_entries()) + return; + set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); - read_unlock(&vcpu->kvm->arch.sca_lock); + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); } static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) From 7d5136ed1b218f7d68e15ff7a6d000a7ff3cce0f Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 16 Sep 2025 15:12:40 +0200 Subject: [PATCH 029/260] KVM: s390: Remove unused return variable in kvm_arch_vcpu_ioctl_set_fpu kvm_arch_vcpu_ioctl_set_fpu() always returns 0 and the local return variable 'ret' is not used anymore. Remove it. Signed-off-by: Thorsten Blum Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 769820e3a243..677aa5c7d226 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4249,8 +4249,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - int ret = 0; - vcpu_load(vcpu); vcpu->run->s.regs.fpc = fpu->fpc; @@ -4261,7 +4259,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); vcpu_put(vcpu); - return ret; + return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) From f5a6fa189ad2e73d25e31de3cc7f0bf81907c986 Mon Sep 17 00:00:00 2001 From: Josephine Pfeiffer Date: Wed, 1 Oct 2025 19:40:46 +0200 Subject: [PATCH 030/260] KVM: s390: Replace sprintf with snprintf for buffer safety Replace sprintf() with snprintf() when formatting debug names to prevent potential buffer overflow. The debug_name buffer is 16 bytes, and while unlikely to overflow with current PIDs, using snprintf() provides proper bounds checking. 
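
For context, assuming the default PID_MAX_LIMIT of 4194304, the longest
possible name is "kvm-4194304", i.e. 12 bytes including the terminating
NUL, so the 16-byte buffer is safe today; snprintf() simply guarantees
that stays true if pid_max ever grows:

  char debug_name[16];

  /* Truncates (and NUL-terminates) rather than overflowing the buffer. */
  snprintf(debug_name, sizeof(debug_name), "kvm-%u", current->pid);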
Signed-off-by: Josephine Pfeiffer
[frankja@linux.ibm.com: Fixed subject prefix]
Signed-off-by: Janosch Frank
---
 arch/s390/kvm/kvm-s390.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 677aa5c7d226..70ebc54b1bb1 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3348,7 +3348,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (!kvm->arch.sca)
 		goto out_err;
 
-	sprintf(debug_name, "kvm-%u", current->pid);
+	snprintf(debug_name, sizeof(debug_name), "kvm-%u", current->pid);
 
 	kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long));
 	if (!kvm->arch.dbf)

From 182a258b5ec4a45170e776d3a0c0bccfc4fab998 Mon Sep 17 00:00:00 2001
From: Janosch Frank
Date: Wed, 13 Aug 2025 11:04:31 +0000
Subject: [PATCH 031/260] Documentation: kvm: Fix ordering

7.43 has been assigned twice; make KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED
7.44.

Fixes: f55ce5a6cd33 ("KVM: arm64: Expose new KVM cap for cacheable PFNMAP")
Reviewed-by: Ankit Agrawal
Signed-off-by: Janosch Frank
---
 Documentation/virt/kvm/api.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 57061fa29e6a..72b2fae99a83 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8692,7 +8692,7 @@ given VM. When this capability is enabled, KVM resets the VCPU when setting
 MP_STATE_INIT_RECEIVED through IOCTL. The original MP_STATE is preserved.
 
-7.43 KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED
+7.44 KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED
 -------------------------------------------
 
 :Architectures: arm64

From 7f3779a3ac3e474d043f0a2b77dd6e6bb020c577 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Wed, 27 Aug 2025 17:52:43 +0000
Subject: [PATCH 032/260] mm/filemap: Add NUMA mempolicy support to
 filemap_alloc_folio()

Add a mempolicy parameter to filemap_alloc_folio() to enable NUMA-aware
page cache allocations. This will be used by upcoming changes to support
NUMA policies in guest-memfd, where guest memory needs to be allocated
according to the NUMA policy specified by the VMM.

All existing users pass NULL, maintaining the current behavior.
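
As a sketch of both call styles (the 'policy' variable here is purely
illustrative):

  /* Existing callers keep the current behavior by passing NULL ... */
  folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0, NULL);

  /* ... while a NUMA-aware caller can thread a mempolicy through. */
  folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0, policy);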
Reviewed-by: Pankaj Gupta Reviewed-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-4-shivankg@amd.com Signed-off-by: Sean Christopherson --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/verity.c | 2 +- fs/erofs/zdata.c | 2 +- fs/f2fs/compress.c | 2 +- include/linux/pagemap.h | 8 +++++--- mm/filemap.c | 14 +++++++++----- mm/readahead.c | 2 +- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bacad18357b3..d927ae32e7d0 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -491,8 +491,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, - ~__GFP_FS), 0); + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS), + 0, NULL); if (!folio) break; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 46bd8ca58670..d4523d5debcd 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -742,7 +742,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, } folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), - 0); + 0, NULL); if (!folio) return ERR_PTR(-ENOMEM); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bc80cfe482f7..b7369fb4fbe9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -562,7 +562,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) * Allocate a managed folio for cached I/O, or it may be * then filled with a file-backed folio for in-place I/O */ - newfolio = filemap_alloc_folio(gfp, 0); + newfolio = filemap_alloc_folio(gfp, 0, NULL); if (!newfolio) continue; newfolio->private = Z_EROFS_PREALLOCATED_FOLIO; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6ad8d3bc6df7..a65e8cd388bc 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1947,7 +1947,7 @@ static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, return; } - cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0); + cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL); if (!cfolio) return; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..f1d0610210f7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -654,9 +654,11 @@ static inline void *detach_page_private(struct page *page) } #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy); #else -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { return folio_alloc_noprof(gfp, order); } @@ -667,7 +669,7 @@ static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int o static inline struct page *__page_cache_alloc(gfp_t gfp) { - return &filemap_alloc_folio(gfp, 0)->page; + return &filemap_alloc_folio(gfp, 0, NULL)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) diff --git a/mm/filemap.c b/mm/filemap.c index 13f0259d993c..7b42fd6dcc9a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1002,11 +1002,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t 
gfp, unsigned int order)
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
+		struct mempolicy *policy)
 {
 	int n;
 	struct folio *folio;
 
+	if (policy)
+		return folio_alloc_mpol_noprof(gfp, order, policy,
+				NO_INTERLEAVE_INDEX, numa_node_id());
+
 	if (cpuset_do_page_mem_spread()) {
 		unsigned int cpuset_mems_cookie;
 		do {
@@ -2009,7 +2014,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 			err = -ENOMEM;
 			if (order > min_order)
 				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
-			folio = filemap_alloc_folio(alloc_gfp, order);
+			folio = filemap_alloc_folio(alloc_gfp, order, NULL);
 			if (!folio)
 				continue;
@@ -2551,7 +2556,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
 	if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
 		return -EAGAIN;
 
-	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
+	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL);
 	if (!folio)
 		return -ENOMEM;
 	if (iocb->ki_flags & IOCB_DONTCACHE)
@@ -3983,8 +3988,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping,
 repeat:
 	folio = filemap_get_folio(mapping, index);
 	if (IS_ERR(folio)) {
-		folio = filemap_alloc_folio(gfp,
-				mapping_min_folio_order(mapping));
+		folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL);
 		if (!folio)
 			return ERR_PTR(-ENOMEM);
 		index = mapping_align_index(mapping, index);
diff --git a/mm/readahead.c b/mm/readahead.c
index 3a4b5d58eeb6..b415c9969176 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -186,7 +186,7 @@ static struct folio *ractl_alloc_folio(struct readahead_control *ractl,
 {
 	struct folio *folio;
 
-	folio = filemap_alloc_folio(gfp_mask, order);
+	folio = filemap_alloc_folio(gfp_mask, order, NULL);
 	if (folio && ractl->dropbehind)
 		__folio_set_dropbehind(folio);

From 16a542e22339cd5e73e56a956bbd335c7bd7c08c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Wed, 27 Aug 2025 17:52:44 +0000
Subject: [PATCH 033/260] mm/filemap: Extend __filemap_get_folio() to support
 NUMA memory policies

Extend __filemap_get_folio() to support NUMA memory policies by renaming
the implementation to __filemap_get_folio_mpol() and adding a mempolicy
parameter. The original function becomes a static inline wrapper that
passes NULL for the mempolicy.

This infrastructure will enable future support for NUMA-aware page cache
allocations for KVM guests backed by guest_memfd memory.
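
As a rough sketch of a future caller (hypothetical, for illustration
only), guest_memfd could look up a shared policy for the faulting index
and pass it through:

  /* 'gi->policy' is a hypothetical per-inode shared_policy. */
  policy = mpol_shared_policy_lookup(&gi->policy, index);
  folio = __filemap_get_folio_mpol(inode->i_mapping, index,
                                   FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                   mapping_gfp_mask(inode->i_mapping),
                                   policy);
  mpol_cond_put(policy);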
Reviewed-by: Pankaj Gupta Reviewed-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-5-shivankg@amd.com Signed-off-by: Sean Christopherson --- include/linux/pagemap.h | 10 ++++++++-- mm/filemap.c | 11 ++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f1d0610210f7..a17fabbc0269 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -755,11 +755,17 @@ static inline fgf_t fgf_set_order(size_t size) } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp); +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); +static inline struct folio *__filemap_get_folio(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp) +{ + return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL); +} + /** * write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). diff --git a/mm/filemap.c b/mm/filemap.c index 7b42fd6dcc9a..91c4537283d3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1928,11 +1928,12 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index) } /** - * __filemap_get_folio - Find and get a reference to a folio. + * __filemap_get_folio_mpol - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. + * @policy: NUMA memory allocation policy to follow. * * Looks up the page cache entry at @mapping & @index. * @@ -1943,8 +1944,8 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index) * * Return: The found folio or an ERR_PTR() otherwise. */ -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp) +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy) { struct folio *folio; @@ -2014,7 +2015,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; - folio = filemap_alloc_folio(alloc_gfp, order, NULL); + folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) continue; @@ -2061,7 +2062,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, folio_clear_dropbehind(folio); return folio; } -EXPORT_SYMBOL(__filemap_get_folio); +EXPORT_SYMBOL(__filemap_get_folio_mpol); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) From f634f10809ec3d51d9529dfb0f99bc7cec1b5eff Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Wed, 27 Aug 2025 17:52:45 +0000 Subject: [PATCH 034/260] mm/mempolicy: Export memory policy symbols KVM guest_memfd wants to implement support for NUMA policies just like shmem already does using the shared policy infrastructure. As guest_memfd currently resides in KVM module code, we have to export the relevant symbols. 
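
For example, with one of the exports added below, the shared-policy
lookup becomes usable from kvm.ko:

  /* Export restricted to the "kvm" module, not a blanket EXPORT_SYMBOL. */
  EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");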
In the future, guest_memfd might be moved to core-mm, at which point the symbols no longer would have to be exported. When/if that happens is still unclear. Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-6-shivankg@amd.com Signed-off-by: Sean Christopherson --- mm/mempolicy.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb83cff7db8c..3d797d47a040 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -354,6 +354,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) return &default_policy; } +EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm"); static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); @@ -487,6 +488,7 @@ void __mpol_put(struct mempolicy *pol) return; kmem_cache_free(policy_cache, pol); } +EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { @@ -2885,6 +2887,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, read_unlock(&sp->lock); return pol; } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm"); static void sp_free(struct sp_node *n) { @@ -3170,6 +3173,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm"); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) @@ -3188,6 +3192,7 @@ int mpol_set_shared_policy(struct shared_policy *sp, sp_free(new); return err; } +EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm"); /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) @@ -3206,6 +3211,7 @@ void mpol_free_shared_policy(struct shared_policy *sp) } write_unlock(&sp->lock); } +EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm"); #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; From 049e560d4f47960c31c06f3de7712e4d2c5d16a2 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 12 Oct 2025 07:16:06 +0000 Subject: [PATCH 035/260] KVM: guest_memfd: move kvm_gmem_get_index() and use in kvm_gmem_prepare_folio() Move kvm_gmem_get_index() to the top of the file so that it can be used in kvm_gmem_prepare_folio() to replace the open-coded calculation. No functional change intended. Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Link: https://lore.kernel.org/r/20251012071607.17646-1-shivankg@amd.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fbca8c0972da..22dacf49a04d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -25,6 +25,11 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); } +static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return gfn - slot->base_gfn + slot->gmem.pgoff; +} + static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, pgoff_t index, struct folio *folio) { @@ -78,7 +83,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, * checked when creating memslots. 
*/ WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); - index = gfn - slot->base_gfn + slot->gmem.pgoff; + index = kvm_gmem_get_index(slot, gfn); index = ALIGN_DOWN(index, 1 << folio_order(folio)); r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); if (!r) @@ -335,11 +340,6 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) return get_file_active(&slot->gmem.file); } -static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) -{ - return gfn - slot->base_gfn + slot->gmem.pgoff; -} - static bool kvm_gmem_supports_mmap(struct inode *inode) { const u64 flags = (u64)inode->i_private; From 3f1078a445d9038532a572ff643a826ed9335259 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 12 Oct 2025 07:16:07 +0000 Subject: [PATCH 036/260] KVM: guest_memfd: remove redundant gmem variable initialization Remove redundant initialization of gmem in __kvm_gmem_get_pfn() as it is already initialized at the top of the function. No functional change intended. Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Link: https://lore.kernel.org/r/20251012071607.17646-2-shivankg@amd.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 22dacf49a04d..caa87efc8f7a 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -668,7 +668,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, return ERR_PTR(-EFAULT); } - gmem = file->private_data; if (xa_load(&gmem->bindings, index) != slot) { WARN_ON_ONCE(xa_load(&gmem->bindings, index)); return ERR_PTR(-EIO); From 765fcd7c0753cc62d0a839cbd8355cfaf57a7eb6 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Sat, 4 Oct 2025 00:02:10 -0300 Subject: [PATCH 037/260] KVM: guest_memfd: use folio_nr_pages() instead of shift operation folio_nr_pages() is a faster helper function to get the number of pages when NR_PAGES_IN_LARGE_FOLIO is enabled. Signed-off-by: Pedro Demarchi Gomes Link: https://lore.kernel.org/r/20251004030210.49080-1-pedrodemargomes@gmail.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index caa87efc8f7a..9017e4d77c53 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -82,9 +82,9 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, * The order will be passed when creating the guest_memfd, and * checked when creating memslots. */ - WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); + WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio))); index = kvm_gmem_get_index(slot, gfn); - index = ALIGN_DOWN(index, 1 << folio_order(folio)); + index = ALIGN_DOWN(index, folio_nr_pages(folio)); r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); if (!r) kvm_gmem_mark_prepared(folio); From 5f3e10797ab883e9dcc256f21094f039c2bb3143 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Oct 2025 15:27:33 -0700 Subject: [PATCH 038/260] KVM: guest_memfd: Drop a superfluous local var in kvm_gmem_fault_user_mapping() Drop the local "int err" that's buried in the middle guest_memfd's user fault handler to avoid the potential for variable shadowing, e.g. if an "err" variable were also declared at function scope. No functional change intended. 
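
A minimal sketch of the hazard being preempted: if a function-scope
'err' (hypothetical) were added later, the buried declaration would
silently shadow it:

  int err = 0;    /* hypothetical future function-scope variable */

  folio = kvm_gmem_get_folio(inode, vmf->pgoff);
  if (IS_ERR(folio)) {
          int err = PTR_ERR(folio);    /* shadows the outer 'err' */

          if (err == -EAGAIN)
                  return VM_FAULT_RETRY;
          return vmf_error(err);
  }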
Link: https://lore.kernel.org/r/20251007222733.349460-1-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 virt/kvm/guest_memfd.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 9017e4d77c53..1e4af29159ea 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -361,12 +361,10 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
 
 	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
 	if (IS_ERR(folio)) {
-		int err = PTR_ERR(folio);
-
-		if (err == -EAGAIN)
+		if (PTR_ERR(folio) == -EAGAIN)
 			return VM_FAULT_RETRY;
 
-		return vmf_error(err);
+		return vmf_error(PTR_ERR(folio));
 	}
 
 	if (WARN_ON_ONCE(folio_test_large(folio))) {

From 497b1dfbcacf4e45c9cd3f594959918ca0e4536b Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 16 Oct 2025 10:28:42 -0700
Subject: [PATCH 039/260] KVM: guest_memfd: Rename "struct kvm_gmem" to
 "struct gmem_file"

Rename the "kvm_gmem" structure to "gmem_file" in anticipation of using
dedicated guest_memfd inodes instead of anonymous inodes, at which point
the "kvm_gmem" nomenclature becomes quite misleading.

In guest_memfd, inodes are effectively the raw underlying physical
storage, and will be used to track properties of the physical memory,
while each gmem file is effectively a single VM's view of that storage,
and is used to track assets specific to its associated VM, e.g.
memslots=>gmem bindings.

Using "kvm_gmem" suggests that the per-VM/per-file structures are _the_
guest_memfd instance, which is almost the exact opposite of reality.

Opportunistically rename local variables from "gmem" to "f", again to
avoid confusion once guest_memfd specific inodes come along.

No functional change intended.

Reviewed-by: Ackerley Tng
Tested-by: Ackerley Tng
Reviewed-by: Shivank Garg
Tested-by: Shivank Garg
Link: https://lore.kernel.org/r/20251016172853.52451-2-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 virt/kvm/guest_memfd.c | 98 +++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 45 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 1e4af29159ea..2989c5fe426f 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -7,7 +7,16 @@
 
 #include "kvm_mm.h"
 
-struct kvm_gmem {
+/*
+ * A guest_memfd instance can be associated with multiple VMs, each with its
+ * own "view" of the underlying physical memory.
+ *
+ * The gmem's inode is effectively the raw underlying physical storage, and is
+ * used to track properties of the physical memory, while each gmem file is
+ * effectively a single VM's view of that storage, and is used to track assets
+ * specific to its associated VM, e.g. memslots=>gmem bindings.
+ */ +struct gmem_file { struct kvm *kvm; struct xarray bindings; struct list_head entry; @@ -115,16 +124,16 @@ static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *in return KVM_FILTER_PRIVATE; } -static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, +static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, pgoff_t end, enum kvm_gfn_range_filter attr_filter) { bool flush = false, found_memslot = false; struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; unsigned long index; - xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { + xa_for_each_range(&f->bindings, index, slot, start, end - 1) { pgoff_t pgoff = slot->gmem.pgoff; struct kvm_gfn_range gfn_range = { @@ -157,20 +166,20 @@ static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, { struct list_head *gmem_list = &inode->i_mapping->i_private_list; enum kvm_gfn_range_filter attr_filter; - struct kvm_gmem *gmem; + struct gmem_file *f; attr_filter = kvm_gmem_get_invalidate_filter(inode); - list_for_each_entry(gmem, gmem_list, entry) - __kvm_gmem_invalidate_begin(gmem, start, end, attr_filter); + list_for_each_entry(f, gmem_list, entry) + __kvm_gmem_invalidate_begin(f, start, end, attr_filter); } -static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, +static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, pgoff_t end) { - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; - if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { KVM_MMU_LOCK(kvm); kvm_mmu_invalidate_end(kvm); KVM_MMU_UNLOCK(kvm); @@ -181,10 +190,10 @@ static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, pgoff_t end) { struct list_head *gmem_list = &inode->i_mapping->i_private_list; - struct kvm_gmem *gmem; + struct gmem_file *f; - list_for_each_entry(gmem, gmem_list, entry) - __kvm_gmem_invalidate_end(gmem, start, end); + list_for_each_entry(f, gmem_list, entry) + __kvm_gmem_invalidate_end(f, start, end); } static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) @@ -282,9 +291,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, static int kvm_gmem_release(struct inode *inode, struct file *file) { - struct kvm_gmem *gmem = file->private_data; + struct gmem_file *f = file->private_data; struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; unsigned long index; /* @@ -304,7 +313,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) filemap_invalidate_lock(inode->i_mapping); - xa_for_each(&gmem->bindings, index, slot) + xa_for_each(&f->bindings, index, slot) WRITE_ONCE(slot->gmem.file, NULL); /* @@ -312,18 +321,18 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) * Zap all SPTEs pointed at by this file. Do not free the backing * memory, as its lifetime is associated with the inode, not the file. 
*/ - __kvm_gmem_invalidate_begin(gmem, 0, -1ul, + __kvm_gmem_invalidate_begin(f, 0, -1ul, kvm_gmem_get_invalidate_filter(inode)); - __kvm_gmem_invalidate_end(gmem, 0, -1ul); + __kvm_gmem_invalidate_end(f, 0, -1ul); - list_del(&gmem->entry); + list_del(&f->entry); filemap_invalidate_unlock(inode->i_mapping); mutex_unlock(&kvm->slots_lock); - xa_destroy(&gmem->bindings); - kfree(gmem); + xa_destroy(&f->bindings); + kfree(f); kvm_put_kvm(kvm); @@ -491,7 +500,7 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { const char *anon_name = "[kvm-gmem]"; - struct kvm_gmem *gmem; + struct gmem_file *f; struct inode *inode; struct file *file; int fd, err; @@ -500,14 +509,13 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) if (fd < 0) return fd; - gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); - if (!gmem) { + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) { err = -ENOMEM; goto err_fd; } - file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem, - O_RDWR, NULL); + file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, f, O_RDWR, NULL); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_gmem; @@ -529,15 +537,15 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); kvm_get_kvm(kvm); - gmem->kvm = kvm; - xa_init(&gmem->bindings); - list_add(&gmem->entry, &inode->i_mapping->i_private_list); + f->kvm = kvm; + xa_init(&f->bindings); + list_add(&f->entry, &inode->i_mapping->i_private_list); fd_install(fd, file); return fd; err_gmem: - kfree(gmem); + kfree(f); err_fd: put_unused_fd(fd); return err; @@ -562,7 +570,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, { loff_t size = slot->npages << PAGE_SHIFT; unsigned long start, end; - struct kvm_gmem *gmem; + struct gmem_file *f; struct inode *inode; struct file *file; int r = -EINVAL; @@ -576,8 +584,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, if (file->f_op != &kvm_gmem_fops) goto err; - gmem = file->private_data; - if (gmem->kvm != kvm) + f = file->private_data; + if (f->kvm != kvm) goto err; inode = file_inode(file); @@ -591,8 +599,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, start = offset >> PAGE_SHIFT; end = start + slot->npages; - if (!xa_empty(&gmem->bindings) && - xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + if (!xa_empty(&f->bindings) && + xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { filemap_invalidate_unlock(inode->i_mapping); goto err; } @@ -607,7 +615,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, if (kvm_gmem_supports_mmap(inode)) slot->flags |= KVM_MEMSLOT_GMEM_ONLY; - xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); + xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL); filemap_invalidate_unlock(inode->i_mapping); /* @@ -625,7 +633,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) { unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; - struct kvm_gmem *gmem; + struct gmem_file *f; struct file *file; /* @@ -636,10 +644,10 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) if (!file) return; - gmem = file->private_data; + f = file->private_data; filemap_invalidate_lock(file->f_mapping); - xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL); /* * synchronize_srcu(&kvm->srcu) ensured that 
kvm_gmem_get_pfn() @@ -657,17 +665,17 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, pgoff_t index, kvm_pfn_t *pfn, bool *is_prepared, int *max_order) { - struct file *gmem_file = READ_ONCE(slot->gmem.file); - struct kvm_gmem *gmem = file->private_data; + struct file *slot_file = READ_ONCE(slot->gmem.file); + struct gmem_file *f = file->private_data; struct folio *folio; - if (file != gmem_file) { - WARN_ON_ONCE(gmem_file); + if (file != slot_file) { + WARN_ON_ONCE(slot_file); return ERR_PTR(-EFAULT); } - if (xa_load(&gmem->bindings, index) != slot) { - WARN_ON_ONCE(xa_load(&gmem->bindings, index)); + if (xa_load(&f->bindings, index) != slot) { + WARN_ON_ONCE(xa_load(&f->bindings, index)); return ERR_PTR(-EIO); } From 392dd9d9488a8a81c2d58f9f4eee99c5b7b8e1c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:43 -0700 Subject: [PATCH 040/260] KVM: guest_memfd: Add macro to iterate over gmem_files for a mapping/inode Add a kvm_gmem_for_each_file() to make it more obvious that KVM is iterating over guest_memfd _files_, not guest_memfd instances, as could be assumed given the name "gmem_list". No functional change intended. Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-3-seanjc@google.com [sean: drop .clang-format change] Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 2989c5fe426f..5cce20ff418d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -22,6 +22,9 @@ struct gmem_file { struct list_head entry; }; +#define kvm_gmem_for_each_file(f, mapping) \ + list_for_each_entry(f, &(mapping)->i_private_list, entry) + /** * folio_file_pfn - like folio_file_page, but return a pfn. * @folio: The folio which contains this index. @@ -164,13 +167,12 @@ static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, pgoff_t end) { - struct list_head *gmem_list = &inode->i_mapping->i_private_list; enum kvm_gfn_range_filter attr_filter; struct gmem_file *f; attr_filter = kvm_gmem_get_invalidate_filter(inode); - list_for_each_entry(f, gmem_list, entry) + kvm_gmem_for_each_file(f, inode->i_mapping) __kvm_gmem_invalidate_begin(f, start, end, attr_filter); } @@ -189,10 +191,9 @@ static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, pgoff_t end) { - struct list_head *gmem_list = &inode->i_mapping->i_private_list; struct gmem_file *f; - list_for_each_entry(f, gmem_list, entry) + kvm_gmem_for_each_file(f, inode->i_mapping) __kvm_gmem_invalidate_end(f, start, end); } From a63ca4236e6799cf4343f9aec9d92afdfa582446 Mon Sep 17 00:00:00 2001 From: Ackerley Tng Date: Thu, 16 Oct 2025 10:28:44 -0700 Subject: [PATCH 041/260] KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes guest_memfd's inode represents memory the guest_memfd is providing. guest_memfd's file represents a struct kvm's view of that memory. Using a custom inode allows customization of the inode teardown process via callbacks. For example, ->evict_inode() allows customization of the truncation process on file close, and ->destroy_inode() and ->free_inode() allow customization of the inode freeing process. 
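For reference only (this sketch is not part of the series, and the "demo_*" names are invented for illustration), the callback split described above typically looks like the following in a filesystem's super_operations: ->destroy_inode() handles synchronous cleanup, and ->free_inode() performs the final free from RCU context once the inode is unreachable.

#include <linux/fs.h>
#include <linux/slab.h>

struct demo_inode {
	unsigned long metadata;		/* hypothetical per-inode state */
	struct inode vfs_inode;
};

static struct inode *demo_alloc_inode(struct super_block *sb)
{
	struct demo_inode *di = kzalloc(sizeof(*di), GFP_KERNEL);

	if (!di)
		return NULL;

	inode_init_once(&di->vfs_inode);
	return &di->vfs_inode;
}

static void demo_destroy_inode(struct inode *inode)
{
	/* Synchronous teardown, e.g. release per-inode metadata. */
}

static void demo_free_inode(struct inode *inode)
{
	/* Final free, invoked via RCU once no lookups can observe the inode. */
	kfree(container_of(inode, struct demo_inode, vfs_inode));
}

static const struct super_operations demo_sops = {
	.alloc_inode	= demo_alloc_inode,
	.destroy_inode	= demo_destroy_inode,
	.free_inode	= demo_free_inode,
};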
Customizing the truncation process allows flexibility in management of guest_memfd memory and customization of the inode freeing process allows proper cleanup of memory metadata stored on the inode. Memory metadata is more appropriately stored on the inode (as opposed to the file), since the metadata is for the memory and is not unique to a specific binding and struct kvm. Acked-by: David Hildenbrand Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Ackerley Tng Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: drop helpers, open code logic in __kvm_gmem_create()] Link: https://lore.kernel.org/r/20251016172853.52451-4-seanjc@google.com Signed-off-by: Sean Christopherson --- include/uapi/linux/magic.h | 1 + virt/kvm/guest_memfd.c | 82 +++++++++++++++++++++++++++++++------- virt/kvm/kvm_main.c | 7 +++- virt/kvm/kvm_mm.h | 9 +++-- 4 files changed, 80 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index bb575f3ab45e..638ca21b7a90 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -103,5 +103,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ +#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 5cce20ff418d..ce04fc85e631 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -1,12 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include +#include #include +#include #include -#include #include "kvm_mm.h" +static struct vfsmount *kvm_gmem_mnt; + /* * A guest_memfd instance can be associated multiple VMs, each with its own * "view" of the underlying physical memory. @@ -424,11 +428,6 @@ static struct file_operations kvm_gmem_fops = { .fallocate = kvm_gmem_fallocate, }; -void kvm_gmem_init(struct module *module) -{ - kvm_gmem_fops.owner = module; -} - static int kvm_gmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -500,7 +499,7 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { - const char *anon_name = "[kvm-gmem]"; + static const char *name = "[kvm-gmem]"; struct gmem_file *f; struct inode *inode; struct file *file; @@ -516,16 +515,17 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) goto err_fd; } - file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, f, O_RDWR, NULL); - if (IS_ERR(file)) { - err = PTR_ERR(file); + /* __fput() will take care of fops_put(). */ + if (!fops_get(&kvm_gmem_fops)) { + err = -ENOENT; goto err_gmem; } - file->f_flags |= O_LARGEFILE; - - inode = file->f_inode; - WARN_ON(file->f_mapping != inode->i_mapping); + inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto err_fops; + } inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; @@ -537,6 +537,15 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) /* Unmovable mappings are supposed to be marked unevictable as well. 
*/ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_inode; + } + + file->f_flags |= O_LARGEFILE; + file->private_data = f; + kvm_get_kvm(kvm); f->kvm = kvm; xa_init(&f->bindings); @@ -545,6 +554,10 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) fd_install(fd, file); return fd; +err_inode: + iput(inode); +err_fops: + fops_put(&kvm_gmem_fops); err_gmem: kfree(f); err_fd: @@ -816,3 +829,44 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif + +static int kvm_gmem_init_fs_context(struct fs_context *fc) +{ + if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; + + return 0; +} + +static struct file_system_type kvm_gmem_fs = { + .name = "guest_memfd", + .init_fs_context = kvm_gmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int kvm_gmem_init_mount(void) +{ + kvm_gmem_mnt = kern_mount(&kvm_gmem_fs); + + if (IS_ERR(kvm_gmem_mnt)) + return PTR_ERR(kvm_gmem_mnt); + + kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; + return 0; +} + +int kvm_gmem_init(struct module *module) +{ + kvm_gmem_fops.owner = module; + + return kvm_gmem_init_mount(); +} + +void kvm_gmem_exit(void) +{ + kern_unmount(kvm_gmem_mnt); + kvm_gmem_mnt = NULL; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..4845e5739436 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6517,7 +6517,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (WARN_ON_ONCE(r)) goto err_vfio; - kvm_gmem_init(module); + r = kvm_gmem_init(module); + if (r) + goto err_gmem; r = kvm_init_virtualization(); if (r) @@ -6538,6 +6540,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) err_register: kvm_uninit_virtualization(); err_virt: + kvm_gmem_exit(); +err_gmem: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); @@ -6569,6 +6573,7 @@ void kvm_exit(void) for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); + kvm_gmem_exit(); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); kvm_irqfd_exit(); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 31defb08ccba..9fcc5d5b7f8d 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -68,17 +68,18 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, #endif /* HAVE_KVM_PFNCACHE */ #ifdef CONFIG_KVM_GUEST_MEMFD -void kvm_gmem_init(struct module *module); +int kvm_gmem_init(struct module *module); +void kvm_gmem_exit(void); int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset); void kvm_gmem_unbind(struct kvm_memory_slot *slot); #else -static inline void kvm_gmem_init(struct module *module) +static inline int kvm_gmem_init(struct module *module) { - + return 0; } - +static inline void kvm_gmem_exit(void) {}; static inline int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset) From f609e89ae8936dbba8992ada83d27ece5cb8393a Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Thu, 16 Oct 2025 10:28:45 -0700 Subject: [PATCH 042/260] KVM: guest_memfd: Add slab-allocated inode cache Add a dedicated gmem_inode structure and a slab-allocated inode cache for guest memory 
backing, similar to how shmem handles inodes. This adds the necessary allocation/destruction functions and prepares for upcoming guest_memfd NUMA policy support changes. Using a dedicated structure will also allow for additional cleanups, e.g. to track flags in gmem_inode instead of i_private. Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: s/kvm_gmem_inode_info/gmem_inode, name init_once()] Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Link: https://lore.kernel.org/r/20251016172853.52451-5-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 79 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index ce04fc85e631..88fd812f0f31 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -26,6 +26,15 @@ struct gmem_file { struct list_head entry; }; +struct gmem_inode { + struct inode vfs_inode; +}; + +static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) +{ + return container_of(inode, struct gmem_inode, vfs_inode); +} + #define kvm_gmem_for_each_file(f, mapping) \ list_for_each_entry(f, &(mapping)->i_private_list, entry) @@ -830,13 +839,61 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif +static struct kmem_cache *kvm_gmem_inode_cachep; + +static void kvm_gmem_init_inode_once(void *__gi) +{ + struct gmem_inode *gi = __gi; + + /* + * Note! Don't initialize the inode with anything specific to the + * guest_memfd instance, or that might be specific to how the inode is + * used (from the VFS-layer's perspective). This hook is called only + * during the initial slab allocation, i.e. only fields/state that are + * idempotent across _all_ use of the inode _object_ can be initialized + * at this time! 
+ */ + inode_init_once(&gi->vfs_inode); +} + +static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) +{ + struct gmem_inode *gi; + + gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL); + if (!gi) + return NULL; + + return &gi->vfs_inode; +} + +static void kvm_gmem_destroy_inode(struct inode *inode) +{ +} + +static void kvm_gmem_free_inode(struct inode *inode) +{ + kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode)); +} + +static const struct super_operations kvm_gmem_super_operations = { + .statfs = simple_statfs, + .alloc_inode = kvm_gmem_alloc_inode, + .destroy_inode = kvm_gmem_destroy_inode, + .free_inode = kvm_gmem_free_inode, +}; + static int kvm_gmem_init_fs_context(struct fs_context *fc) { + struct pseudo_fs_context *ctx; + if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; + ctx = fc->fs_private; + ctx->ops = &kvm_gmem_super_operations; return 0; } @@ -860,13 +917,31 @@ static int kvm_gmem_init_mount(void) int kvm_gmem_init(struct module *module) { - kvm_gmem_fops.owner = module; + struct kmem_cache_args args = { + .align = 0, + .ctor = kvm_gmem_init_inode_once, + }; + int ret; - return kvm_gmem_init_mount(); + kvm_gmem_fops.owner = module; + kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache", + sizeof(struct gmem_inode), + &args, SLAB_ACCOUNT); + if (!kvm_gmem_inode_cachep) + return -ENOMEM; + + ret = kvm_gmem_init_mount(); + if (ret) { + kmem_cache_destroy(kvm_gmem_inode_cachep); + return ret; + } + return 0; } void kvm_gmem_exit(void) { kern_unmount(kvm_gmem_mnt); kvm_gmem_mnt = NULL; + rcu_barrier(); + kmem_cache_destroy(kvm_gmem_inode_cachep); } From ed1ffa810bd600ae40c794d87e4ca587dee5fa3a Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Thu, 16 Oct 2025 10:28:46 -0700 Subject: [PATCH 043/260] KVM: guest_memfd: Enforce NUMA mempolicy using shared policy Previously, guest-memfd allocations followed the local NUMA node id in the absence of a process mempolicy, resulting in arbitrary memory allocation. Moreover, mbind() couldn't be used by the VMM as guest memory wasn't mapped into userspace when allocation occurred. Enable NUMA policy support by implementing vm_ops for guest-memfd mmap operation. This allows the VMM to use mmap()+mbind() to set the desired NUMA policy for a range of memory, and provides fine-grained control over guest memory allocation across NUMA nodes. Note, using mmap()+mbind() works even for PRIVATE memory, as mbind() doesn't require the memory to be faulted in. However, get_mempolicy() and other paths that require the userspace page tables to be populated may return incorrect information for PRIVATE memory (though under the hood, KVM+guest_memfd will still behave correctly). Store the policy in the inode structure, gmem_inode, as a shared memory policy, so that the policy is a property of the physical memory itself, i.e. not bound to the VMA. In guest_memfd, KVM is the primary MMU and any VMAs are secondary, i.e. using mbind() on a VMA to set policy is a means to an end, e.g. to avoid having to add a file-based equivalent to mbind(). Similarly, retrieve the policy via mpol_shared_policy_lookup(), not get_vma_policy(), even when allocating to fault in memory for userspace mappings, so that the policy stored in gmem_inode is always the source of truth. Apply policy changes only to future allocations, i.e. do not migrate existing memory in the guest_memfd instance.
This matches mbind(2)'s default behavior, which affects only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flags (which are not supported by guest_memfd as guest_memfd memory is unmovable). Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/all/e9d43abc-bcdb-4f9f-9ad7-5644f714de19@amd.com [sean: fold in fixup (see Link above), massage changelog] Link: https://lore.kernel.org/r/20251016172853.52451-6-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 58 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 88fd812f0f31..4463643bd0a2 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ struct gmem_file { }; struct gmem_inode { + struct shared_policy policy; struct inode vfs_inode; }; @@ -129,7 +131,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) { /* TODO: Support huge pages. */ - return filemap_grab_folio(inode->i_mapping, index); + struct mempolicy *policy; + struct folio *folio; + + /* + * Fast-path: See if folio is already present in mapping to avoid + * policy_lookup. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (!IS_ERR(folio)) + return folio; + + policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index); + folio = __filemap_get_folio_mpol(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping), policy); + mpol_cond_put(policy); + + return folio; } static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) @@ -411,8 +431,40 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) return ret; } +#ifdef CONFIG_NUMA +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) +{ + struct inode *inode = file_inode(vma->vm_file); + + return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol); +} + +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *pgoff) +{ + struct inode *inode = file_inode(vma->vm_file); + + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); + + /* + * Return the memory policy for this index, or NULL if none is set. + * + * Returning NULL, e.g. instead of the current task's memory policy, is + * important for the .get_policy kernel ABI: it indicates that no + * explicit policy has been set via mbind() for this memory. The caller + * can then replace NULL with the default memory policy instead of the + * current task's memory policy. 
+ */ + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff); +} +#endif /* CONFIG_NUMA */ + static const struct vm_operations_struct kvm_gmem_vm_ops = { - .fault = kvm_gmem_fault_user_mapping, + .fault = kvm_gmem_fault_user_mapping, +#ifdef CONFIG_NUMA + .get_policy = kvm_gmem_get_policy, + .set_policy = kvm_gmem_set_policy, +#endif }; static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) @@ -864,11 +916,13 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) if (!gi) return NULL; + mpol_shared_policy_init(&gi->policy, NULL); return &gi->vfs_inode; } static void kvm_gmem_destroy_inode(struct inode *inode) { + mpol_free_shared_policy(&GMEM_I(inode)->policy); } static void kvm_gmem_free_inode(struct inode *inode) From 3223560c93eb725f479fe20f462ce202a44803b8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:47 -0700 Subject: [PATCH 044/260] KVM: selftests: Define wrappers for common syscalls to assert success Add kvm_ wrappers for munmap(), close(), fallocate(), and ftruncate() to cut down on boilerplate code when a syscall is expected to succeed, and to make it easier for developers to remember to assert success. Implement and use a macro framework similar to the kernel's SYSCALL_DEFINE infrastructure to further cut down on boilerplate code, and to drastically reduce the probability of typos as the kernel's syscall definitions can be copy+pasted almost verbatim. Provide macros to build the raw syscall() wrappers as well, e.g. to replace hand-coded wrappers (NUMA) or pure open-coded calls. Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-7-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 2 +- .../selftests/kvm/include/kvm_syscalls.h | 81 +++++++++++++++++++ .../testing/selftests/kvm/include/kvm_util.h | 29 +------ .../selftests/kvm/kvm_binary_stats_test.c | 4 +- tools/testing/selftests/kvm/lib/kvm_util.c | 31 ++----- .../kvm/x86/private_mem_conversions_test.c | 9 +-- 6 files changed, 96 insertions(+), 60 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/kvm_syscalls.h diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 6338f5bbdb70..8d7758f12280 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -636,7 +636,7 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, } for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) - close(fd[f]); + kvm_close(fd[f]); } /* handles the valid case: intid=0xffffffff num=1 */ diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h new file mode 100644 index 000000000000..d4e613162bba --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_SYSCALLS_H +#define SELFTEST_KVM_SYSCALLS_H + +#include + +#define MAP_ARGS0(m,...) +#define MAP_ARGS1(m,t,a,...) m(t,a) +#define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__) +#define MAP_ARGS3(m,t,a,...) m(t,a), MAP_ARGS2(m,__VA_ARGS__) +#define MAP_ARGS4(m,t,a,...) m(t,a), MAP_ARGS3(m,__VA_ARGS__) +#define MAP_ARGS5(m,t,a,...) m(t,a), MAP_ARGS4(m,__VA_ARGS__) +#define MAP_ARGS6(m,t,a,...) m(t,a), MAP_ARGS5(m,__VA_ARGS__) +#define MAP_ARGS(n,...)
MAP_ARGS##n(__VA_ARGS__) + +#define __DECLARE_ARGS(t, a) t a +#define __UNPACK_ARGS(t, a) a + +#define DECLARE_ARGS(nr_args, args...) MAP_ARGS(nr_args, __DECLARE_ARGS, args) +#define UNPACK_ARGS(nr_args, args...) MAP_ARGS(nr_args, __UNPACK_ARGS, args) + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* Define a kvm_() API to assert success. */ +#define __KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline void kvm_##name(DECLARE_ARGS(nr_args, args)) \ +{ \ + int r; \ + \ + r = name(UNPACK_ARGS(nr_args, args)); \ + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR(#name, r)); \ +} + +/* + * Macro to define syscall APIs, either because KVM selftests doesn't link to + * the standard library, e.g. libnuma, or because there is no library that yet + * provides the syscall. These syscalls also get a kvm_() wrapper that asserts + * success. + */ +#define KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline long name(DECLARE_ARGS(nr_args, args)) \ +{ \ + return syscall(__NR_##name, UNPACK_ARGS(nr_args, args)); \ +} \ +__KVM_SYSCALL_DEFINE(name, nr_args, args) + +/* + * Special case mmap(), as KVM selftests rarely/never specify an address, + * rarely specify an offset, and because the unique return code requires + * special handling anyways. + */ +static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, + off_t offset) +{ + void *mem; + + mem = mmap(NULL, size, prot, flags, fd, offset); + TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", + (int)(unsigned long)MAP_FAILED)); + return mem; +} + +static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) +{ + return __kvm_mmap(size, prot, flags, fd, 0); +} + +static inline int kvm_dup(int fd) +{ + int new_fd = dup(fd); + + TEST_ASSERT(new_fd >= 0, __KVM_SYSCALL_ERROR("dup()", new_fd)); + return new_fd; +} + +__KVM_SYSCALL_DEFINE(munmap, 2, void *, mem, size_t, size); +__KVM_SYSCALL_DEFINE(close, 1, int, fd); +__KVM_SYSCALL_DEFINE(fallocate, 4, int, fd, int, mode, loff_t, offset, loff_t, len); +__KVM_SYSCALL_DEFINE(ftruncate, 2, unsigned int, fd, off_t, length); + +#endif /* SELFTEST_KVM_SYSCALLS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index d3f3e455c031..af52cd938b50 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -23,6 +23,7 @@ #include +#include "kvm_syscalls.h" #include "kvm_util_arch.h" #include "kvm_util_types.h" #include "sparsebit.h" @@ -283,34 +284,6 @@ static inline bool kvm_has_cap(long cap) return kvm_check_cap(cap); } -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) - -static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, - off_t offset) -{ - void *mem; - - mem = mmap(NULL, size, prot, flags, fd, offset); - TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", - (int)(unsigned long)MAP_FAILED)); - - return mem; -} - -static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) -{ - return __kvm_mmap(size, prot, flags, fd, 0); -} - -static inline void kvm_munmap(void *mem, size_t size) -{ - int ret; - - ret = munmap(mem, size); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); -} - /* * Use the "inner", double-underscore macro when reporting errors from within * other macros so that the name of ioctl() and not its literal numeric value diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c
b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index f02355c3c4c2..b7dbde9c0843 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -239,14 +239,14 @@ int main(int argc, char *argv[]) * single stats file works and doesn't cause explosions. */ vm_stats_fds = vm_get_stats_fd(vms[i]); - stats_test(dup(vm_stats_fds)); + stats_test(kvm_dup(vm_stats_fds)); /* Verify userspace can instantiate multiple stats files. */ stats_test(vm_get_stats_fd(vms[i])); for (j = 0; j < max_vcpu; ++j) { vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]); - stats_test(dup(vcpu_stats_fds[j])); + stats_test(kvm_dup(vcpu_stats_fds[j])); stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1a93d6361671..203e33697492 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -704,8 +704,6 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) static void kvm_stats_release(struct kvm_binary_stats *stats) { - int ret; - if (stats->fd < 0) return; @@ -714,8 +712,7 @@ static void kvm_stats_release(struct kvm_binary_stats *stats) stats->desc = NULL; } - ret = close(stats->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(stats->fd); stats->fd = -1; } @@ -738,8 +735,6 @@ __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) */ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { - int ret; - if (vcpu->dirty_gfns) { kvm_munmap(vcpu->dirty_gfns, vm->dirty_ring_size); vcpu->dirty_gfns = NULL; @@ -747,9 +742,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_munmap(vcpu->run, vcpu_mmap_sz()); - ret = close(vcpu->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - + kvm_close(vcpu->fd); kvm_stats_release(&vcpu->stats); list_del(&vcpu->list); @@ -761,16 +754,12 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) void kvm_vm_release(struct kvm_vm *vmp) { struct kvm_vcpu *vcpu, *tmp; - int ret; list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) vm_vcpu_rm(vmp, vcpu); - ret = close(vmp->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - - ret = close(vmp->kvm_fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(vmp->fd); + kvm_close(vmp->kvm_fd); /* Free cached stats metadata and close FD */ kvm_stats_release(&vmp->stats); @@ -828,7 +817,7 @@ void kvm_vm_free(struct kvm_vm *vmp) int kvm_memfd_alloc(size_t size, bool hugepages) { int memfd_flags = MFD_CLOEXEC; - int fd, r; + int fd; if (hugepages) memfd_flags |= MFD_HUGETLB; @@ -836,11 +825,8 @@ int kvm_memfd_alloc(size_t size, bool hugepages) fd = memfd_create("kvm_selftest", memfd_flags); TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); - r = ftruncate(fd, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); - - r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_ftruncate(fd, size); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); return fd; } @@ -1084,8 +1070,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, * needing to track if the fd is owned by the framework * or by the caller. 
*/ - guest_memfd = dup(guest_memfd); - TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); + guest_memfd = kvm_dup(guest_memfd); } region->region.guest_memfd = guest_memfd; diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c index 82a8d88b5338..1969f4ab9b28 100644 --- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c @@ -380,7 +380,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; pthread_t threads[KVM_MAX_VCPUS]; struct kvm_vm *vm; - int memfd, i, r; + int memfd, i; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, @@ -428,11 +428,8 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t * should prevent the VM from being fully destroyed until the last * reference to the guest_memfd is also put. */ - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); - - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); close(memfd); } From 29dc539d74abd1ec4e509fda29bc821f3586d333 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:48 -0700 Subject: [PATCH 045/260] KVM: selftests: Report stacktraces on SIGBUS, SIGSEGV, SIGILL, and SIGFPE by default Register handlers for signals that are likely to happen due to test (or kernel) bugs for all selftests, and explicitly fail tests on unexpected signals so that users get a stack trace, i.e. don't have to go spelunking to do basic triage. Register the handlers as early as possible, to catch as many unexpected signals as possible, and also so that the common code doesn't clobber a handler that's installed by test (or arch) code. Tested-by: Ackerley Tng Reviewed-by: Ackerley Tng Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-8-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/kvm_util.c | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 203e33697492..5744643d9ec3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -2290,11 +2290,35 @@ __weak void kvm_selftest_arch_init(void) { } +static void report_unexpected_signal(int signum) +{ +#define KVM_CASE_SIGNUM(sig) \ + case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum) + + switch (signum) { + KVM_CASE_SIGNUM(SIGBUS); + KVM_CASE_SIGNUM(SIGSEGV); + KVM_CASE_SIGNUM(SIGILL); + KVM_CASE_SIGNUM(SIGFPE); + default: + TEST_FAIL("Unexpected signal %d\n", signum); + } +} + void __attribute((constructor)) kvm_selftest_init(void) { + struct sigaction sig_sa = { + .sa_handler = report_unexpected_signal, + }; + /* Tell stdout not to buffer its content.
*/ setbuf(stdout, NULL); + sigaction(SIGBUS, &sig_sa, NULL); + sigaction(SIGSEGV, &sig_sa, NULL); + sigaction(SIGILL, &sig_sa, NULL); + sigaction(SIGFPE, &sig_sa, NULL); + guest_random_seed = last_guest_seed = random(); pr_info("Random seed: 0x%x\n", guest_random_seed); From 2189d78269c5055e6014862cb4d47f0613204856 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:49 -0700 Subject: [PATCH 046/260] KVM: selftests: Add additional equivalents to libnuma APIs in KVM's numaif.h Add APIs for all syscalls defined in the kernel's mm/mempolicy.c to match those that would be provided by linking to libnuma. Opportunistically use the recently introduced KVM_SYSCALL_DEFINE() builders to take care of the boilerplate, and to fix a flaw where the two existing wrappers would generate multiple symbols if numaif.h were to be included multiple times. Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-9-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/numaif.h | 36 +++++++++++-------- .../selftests/kvm/x86/xapic_ipi_test.c | 5 ++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index b020547403fd..aaa4ac174890 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -13,23 +13,29 @@ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H -#define __NR_get_mempolicy 239 -#define __NR_migrate_pages 256 +#include -/* System calls */ -long get_mempolicy(int *policy, const unsigned long *nmask, - unsigned long maxnode, void *addr, int flags) -{ - return syscall(__NR_get_mempolicy, policy, nmask, - maxnode, addr, flags); -} +#include "kvm_syscalls.h" -long migrate_pages(int pid, unsigned long maxnode, - const unsigned long *frommask, - const unsigned long *tomask) -{ - return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask); -} +KVM_SYSCALL_DEFINE(get_mempolicy, 5, int *, policy, const unsigned long *, nmask, + unsigned long, maxnode, void *, addr, int, flags); + +KVM_SYSCALL_DEFINE(set_mempolicy, 3, int, mode, const unsigned long *, nmask, + unsigned long, maxnode); + +KVM_SYSCALL_DEFINE(set_mempolicy_home_node, 4, unsigned long, start, + unsigned long, len, unsigned long, home_node, + unsigned long, flags); + +KVM_SYSCALL_DEFINE(migrate_pages, 4, int, pid, unsigned long, maxnode, + const unsigned long *, frommask, const unsigned long *, tomask); + +KVM_SYSCALL_DEFINE(move_pages, 6, int, pid, unsigned long, count, void *, pages, + const int *, nodes, int *, status, int, flags); + +KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, + const unsigned long *, nodemask, unsigned long, maxnode, + unsigned int, flags); /* Policies */ #define MPOL_DEFAULT 0 diff --git a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c index 35cb9de54a82..ae4a4b6c05ca 100644 --- a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c @@ -256,7 +256,7 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, int nodes = 0; time_t start_time, last_update, now; time_t interval_secs = 1; - int i, r; + int i; int from, to; unsigned long bit; uint64_t hlt_count; @@ -267,9 +267,8 @@ void do_migrations(struct test_data_page *data, int run_secs, int
delay_usecs, delay_usecs); /* Get set of first 64 numa nodes available */ - r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, + kvm_get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, 0, MPOL_F_MEMS_ALLOWED); - TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " "(each 1-bit indicates node is present): %#lx\n", From fe7baebb99de95f171911d085abcfab1db04a7ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:50 -0700 Subject: [PATCH 047/260] KVM: selftests: Use proper uAPI headers to pick up mempolicy.h definitions Drop KVM's re-definitions of MPOL_xxx flags in numaif.h as they are defined by the already-included, kernel-provided mempolicy.h. The only reason the duplicate definitions don't cause compiler warnings is because they are identical, but only on x86-64! The syscall numbers in particular are subtly x86_64-specific, i.e. will cause problems if/when numaif.h is used outside of x86. Opportunistically clean up the file comment as the license information is covered by the SPDX header, the path is superfluous, and as above the comment about the contents is flat out wrong. Fixes: 346b59f220a2 ("KVM: selftests: Add missing header file needed by xAPIC IPI tests") Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-10-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/numaif.h | 32 +------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index aaa4ac174890..1554003c40a1 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -1,14 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/numaif.h - * - * Copyright (C) 2020, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Header file that provides access to NUMA API functions not explicitly - * exported to user space. - */ +/* Copyright (C) 2020, Google LLC. */ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H @@ -37,25 +28,4 @@ KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, const unsigned long *, nodemask, unsigned long, maxnode, unsigned int, flags); -/* Policies */ -#define MPOL_DEFAULT 0 -#define MPOL_PREFERRED 1 -#define MPOL_BIND 2 -#define MPOL_INTERLEAVE 3 - -#define MPOL_MAX MPOL_INTERLEAVE - -/* Flags for get_mem_policy */ -#define MPOL_F_NODE (1<<0) /* return next il node or node of address */ - /* Warning: MPOL_F_NODE is unsupported and - * subject to change. Don't use.
- */ -#define MPOL_F_ADDR (1<<1) /* look up vma using address */ -#define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */ - -/* Flags for mbind */ -#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ - #endif /* SELFTEST_KVM_NUMAIF_H */ From e698e89b3ed1772e543a5ab4c072b9bbdb730948 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Thu, 16 Oct 2025 10:28:51 -0700 Subject: [PATCH 048/260] KVM: selftests: Add helpers to probe for NUMA support, and multi-node systems Add NUMA helpers to probe for support/availability and to check if the test is running on a multi-node system. The APIs will be used to verify guest_memfd NUMA support. Signed-off-by: Shivank Garg [sean: land helpers in numaif.h, add comments, tweak names] Link: https://lore.kernel.org/r/20251016172853.52451-11-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/numaif.h | 52 ++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index 1554003c40a1..29572a6d789c 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -4,6 +4,8 @@ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H +#include + #include #include "kvm_syscalls.h" @@ -28,4 +30,54 @@ KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, const unsigned long *, nodemask, unsigned long, maxnode, unsigned int, flags); +static inline int get_max_numa_node(void) +{ + struct dirent *de; + int max_node = 0; + DIR *d; + + /* + * Assume there's a single node if the kernel doesn't support NUMA, + * or if no nodes are found. + */ + d = opendir("/sys/devices/system/node"); + if (!d) + return 0; + + while ((de = readdir(d)) != NULL) { + int node_id; + char *endptr; + + if (strncmp(de->d_name, "node", 4) != 0) + continue; + + node_id = strtol(de->d_name + 4, &endptr, 10); + if (*endptr != '\0') + continue; + + if (node_id > max_node) + max_node = node_id; + } + closedir(d); + + return max_node; +} + +static bool is_numa_available(void) +{ + /* + * Probe for NUMA by doing a dummy get_mempolicy(). If the syscall + * fails with ENOSYS, then the kernel was built without NUMA support. + * If the syscall fails with EPERM, then the process/user lacks the + * necessary capabilities (CAP_SYS_NICE). + */ + return !get_mempolicy(NULL, NULL, 0, NULL, 0) || + (errno != ENOSYS && errno != EPERM); +} + +static inline bool is_multi_numa_node_system(void) +{ + return is_numa_available() && get_max_numa_node() >= 1; +} + #endif /* SELFTEST_KVM_NUMAIF_H */ From 38ccc50ac037ab79609c33556b696064ef52521c Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Thu, 16 Oct 2025 10:28:52 -0700 Subject: [PATCH 049/260] KVM: selftests: Add guest_memfd tests for mmap and NUMA policy support Add tests for NUMA memory policy binding and NUMA-aware allocation in guest_memfd. This extends the existing selftests by adding proper validation for: - KVM GMEM set_policy() and get_policy() vm_ops functionality using mbind() and get_mempolicy() - NUMA policy application before and after memory allocation Run the NUMA mbind() test with and without INIT_SHARED, as KVM should allow doing mbind(), madvise(), etc. on guest-private memory, e.g. so that userspace can set NUMA policy for CoCo VMs.
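For illustration (this userspace sketch is not part of the patch; "gmem_fd" is assumed to come from KVM_CREATE_GUEST_MEMFD with GUEST_MEMFD_FLAG_MMAP), the mmap()+mbind() flow being tested boils down to:

#include <numaif.h>
#include <sys/mman.h>

/* Sketch: bind a guest_memfd range to NUMA node 0 via mmap()+mbind(). */
static int bind_gmem_to_node0(int gmem_fd, size_t size)
{
	unsigned long nodemask = 1;	/* bit 0 => node 0 */
	void *mem;

	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gmem_fd, 0);
	if (mem == MAP_FAILED)
		return -1;

	/*
	 * The policy lands in the inode's shared policy, i.e. it governs
	 * future guest_memfd allocations even after the VMA goes away.
	 */
	if (mbind(mem, size, MPOL_BIND, &nodemask, 8, 0)) {
		munmap(mem, size);
		return -1;
	}

	return munmap(mem, size);
}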
Run the NUMA allocation test only for INIT_SHARED, i.e. skip it if the host can't fault in memory (via direct access, madvise(), etc.), as move_pages() returns -ENOENT if the page hasn't been faulted in (move_pages() walks the host page tables to find the associated folio). [sean: don't skip entire test when running on non-NUMA system, test mbind() with private memory, provide more info in assert messages] Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20251016172853.52451-12-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index e7d9aeb418d3..618c937f3c90 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -19,6 +19,7 @@ #include #include "kvm_util.h" +#include "numaif.h" #include "test_util.h" #include "ucall_common.h" @@ -75,6 +76,101 @@ static void test_mmap_supported(int fd, size_t total_size) kvm_munmap(mem, total_size); } +static void test_mbind(int fd, size_t total_size) +{ + const unsigned long nodemask_0 = 1; /* nid: 0 */ + unsigned long nodemask = 0; + unsigned long maxnode = 8; + int policy; + char *mem; + int ret; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + /* Test MPOL_INTERLEAVE policy */ + kvm_mbind(mem, page_size * 2, MPOL_INTERLEAVE, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_INTERLEAVE && nodemask == nodemask_0, + "Wanted MPOL_INTERLEAVE (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_INTERLEAVE, nodemask_0, policy, nodemask); + + /* Test basic MPOL_BIND policy */ + kvm_mbind(mem + page_size * 2, page_size * 2, MPOL_BIND, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem + page_size * 2, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_BIND && nodemask == nodemask_0, + "Wanted MPOL_BIND (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_BIND, nodemask_0, policy, nodemask); + + /* Test MPOL_DEFAULT policy */ + kvm_mbind(mem, total_size, MPOL_DEFAULT, NULL, 0, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_DEFAULT && !nodemask, + "Wanted MPOL_DEFAULT (%u) and nodemask 0x0, got %u and 0x%lx", + MPOL_DEFAULT, policy, nodemask); + + /* Test with invalid policy */ + ret = mbind(mem, page_size, 999, &nodemask_0, maxnode, 0); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "mbind with invalid policy should fail with EINVAL"); + + kvm_munmap(mem, total_size); +} + +static void test_numa_allocation(int fd, size_t total_size) +{ + unsigned long node0_mask = 1; /* Node 0 */ + unsigned long node1_mask = 2; /* Node 1 */ + unsigned long maxnode = 8; + void *pages[4]; + int status[4]; + char *mem; + int i; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + for (i = 0; i < 4; i++) + pages[i] = (char *)mem + page_size * i; + + /* Set NUMA policy after allocation */ + memset(mem, 0xaa, page_size); + kvm_mbind(pages[0], page_size, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, page_size); + + /* Set NUMA policy before allocation */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + kvm_mbind(pages[2], page_size *
2, MPOL_BIND, &node0_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + /* Validate if pages are allocated on specified NUMA nodes */ + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 1, "Expected page 0 on node 1, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 1, "Expected page 1 on node 1, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 0, "Expected page 2 on node 0, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 0, "Expected page 3 on node 0, got it on node %d", status[3]); + + /* Punch hole for all pages */ + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, total_size); + + /* Change NUMA policy nodes and reallocate */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 0, "Expected page 0 on node 0, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 0, "Expected page 1 on node 0, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 1, "Expected page 2 on node 1, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 1, "Expected page 3 on node 1, got it on node %d", status[3]); + + kvm_munmap(mem, total_size); +} + static void test_fault_sigbus(int fd, size_t accessible_size, size_t map_size) { const char val = 0xaa; @@ -273,11 +369,13 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags) if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) { gmem_test(mmap_supported, vm, flags); gmem_test(fault_overflow, vm, flags); + gmem_test(numa_allocation, vm, flags); } else { gmem_test(fault_private, vm, flags); } gmem_test(mmap_cow, vm, flags); + gmem_test(mbind, vm, flags); } else { gmem_test(mmap_not_supported, vm, flags); } From e66438bb81c4ca773b51292c27e6b5baa34f9a5e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:53 -0700 Subject: [PATCH 050/260] KVM: guest_memfd: Add gmem_inode.flags field instead of using i_private Track a guest_memfd instance's flags in gmem_inode instead of burying them in i_private. Burning an extra 8 bytes per inode is well worth the added clarity provided by explicit tracking. 
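To make the i_private vs. dedicated-field tradeoff concrete (an illustrative contrast, not code from the patch; the helper names are made up):

/* Old scheme: flags smuggled through the untyped i_private pointer. */
static u64 flags_via_i_private(struct inode *inode)
{
	return (u64)inode->i_private;
}

/* New scheme: a typed field in the embedding gmem_inode, via GMEM_I(). */
static u64 flags_via_gmem_inode(struct inode *inode)
{
	return GMEM_I(inode)->flags;
}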
Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-13-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 4463643bd0a2..20f6e7fab58d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -30,6 +30,8 @@ struct gmem_file { struct gmem_inode { struct shared_policy policy; struct inode vfs_inode; + + u64 flags; }; static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) @@ -154,7 +156,7 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) { - if ((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED) + if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED) return KVM_FILTER_SHARED; return KVM_FILTER_PRIVATE; @@ -385,9 +387,7 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) static bool kvm_gmem_supports_mmap(struct inode *inode) { - const u64 flags = (u64)inode->i_private; - - return flags & GUEST_MEMFD_FLAG_MMAP; + return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) @@ -399,7 +399,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; - if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)) + if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)) return VM_FAULT_SIGBUS; folio = kvm_gmem_get_folio(inode, vmf->pgoff); @@ -588,7 +588,6 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) goto err_fops; } - inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; inode->i_mode |= S_IFREG; @@ -598,6 +597,8 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + GMEM_I(inode)->flags = flags; + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); if (IS_ERR(file)) { err = PTR_ERR(file); @@ -917,6 +918,8 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) return NULL; mpol_shared_policy_init(&gi->policy, NULL); + + gi->flags = 0; return &gi->vfs_inode; } From 0bb4d9c39b76b7453040ec8fb27f69f8437d6fe1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Oct 2025 15:23:56 -0700 Subject: [PATCH 051/260] KVM: guest_memfd: Define a CLASS to get+put guest_memfd file from a memslot Add a CLASS to handle getting and putting a guest_memfd file given a memslot to reduce the amount of related boilerplate, and more importantly to minimize the chances of forgetting to put the file (thankfully the bug that prompted this didn't escape initial testing). Define a CLASS instead of using __free(fput) as __free() comes with subtle caveats related to FILO ordering (objects are freed in the reverse of the order in which they are declared), and the recommended solution/workaround (declare file pointers exactly when they are initialized) is visually jarring relative to KVM's (and the kernel's) overall strict adherence to not mixing declarations and code. E.g.
the use in kvm_gmem_populate() would be: slot = gfn_to_memslot(kvm, start_gfn); if (!kvm_slot_has_gmem(slot)) return -EINVAL; struct file *file __free(fput) = kvm_gmem_get_file(slot); if (!file) return -EFAULT; filemap_invalidate_lock(file->f_mapping); Note, using CLASS() still declares variables in the middle of code, but the syntactic sugar obfuscates the declaration, i.e. hides the anomaly to a large extent. No functional change intended. Link: https://lore.kernel.org/r/20251007222356.348349-1-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 20f6e7fab58d..427c0acee9d7 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -385,6 +385,9 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) return get_file_active(&slot->gmem.file); } +DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T), + kvm_gmem_get_file(slot), struct kvm_memory_slot *slot); + static bool kvm_gmem_supports_mmap(struct inode *inode) { return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; @@ -710,13 +713,12 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; struct gmem_file *f; - struct file *file; /* * Nothing to do if the underlying file was already closed (or is being * closed right now), kvm_gmem_release() invalidates all bindings. */ - file = kvm_gmem_get_file(slot); + CLASS(gmem_get_file, file)(slot); if (!file) return; @@ -731,8 +733,6 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) */ WRITE_ONCE(slot->gmem.file, NULL); filemap_invalidate_unlock(file->f_mapping); - - fput(file); } /* Returns a locked folio on success. */ @@ -778,19 +778,17 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, int *max_order) { pgoff_t index = kvm_gmem_get_index(slot, gfn); - struct file *file = kvm_gmem_get_file(slot); struct folio *folio; bool is_prepared = false; int r = 0; + CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); - if (IS_ERR(folio)) { - r = PTR_ERR(folio); - goto out; - } + if (IS_ERR(folio)) + return PTR_ERR(folio); if (!is_prepared) r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); @@ -802,8 +800,6 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, else folio_put(folio); -out: - fput(file); return r; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); @@ -812,7 +808,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { - struct file *file; struct kvm_memory_slot *slot; void __user *p; @@ -828,7 +823,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (!kvm_slot_has_gmem(slot)) return -EINVAL; - file = kvm_gmem_get_file(slot); + CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; @@ -886,7 +881,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long filemap_invalidate_unlock(file->f_mapping); - fput(file); return ret && !i ?
ret : i;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);

From b146b289f759315fd27402a40bc15214515e6c45 Mon Sep 17 00:00:00 2001
From: Brendan Jackman 
Date: Tue, 7 Oct 2025 19:12:31 +0000
Subject: [PATCH 052/260] KVM: selftests: Don't fall over in mmu_stress_test when only one CPU is present

Running mmu_stress_test on a system with only one CPU is not a recipe for
success. However, there's no clear-cut reason why it absolutely shouldn't
work, so the test shouldn't completely reject such a platform. At present,
the *3/4 calculation will return zero on these platforms and the test
fails. So, instead just skip that calculation.

Suggested-by: Sean Christopherson 
Signed-off-by: Brendan Jackman 
Link: https://lore.kernel.org/r/20251007-b4-kvm-mmu-stresstest-1proc-v1-1-8c95aa0e30b6@google.com
Signed-off-by: Sean Christopherson 
---
 tools/testing/selftests/kvm/mmu_stress_test.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c
index 37b7e6524533..c799e0d0694f 100644
--- a/tools/testing/selftests/kvm/mmu_stress_test.c
+++ b/tools/testing/selftests/kvm/mmu_stress_test.c
@@ -263,8 +263,10 @@ static void calc_default_nr_vcpus(void)
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
		    strerror(errno));
 
-	nr_vcpus = CPU_COUNT(&possible_mask) * 3/4;
+	nr_vcpus = CPU_COUNT(&possible_mask);
	TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?");
+	if (nr_vcpus >= 2)
+		nr_vcpus = nr_vcpus * 3/4;
 }
 
 int main(int argc, char *argv[])

From 17e5a9b77716564540d81f0c1e6082d28cf305c9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson 
Date: Tue, 7 Oct 2025 15:30:57 -0700
Subject: [PATCH 053/260] KVM: selftests: Forcefully override ARCH from x86_64 to x86

Forcefully override ARCH from x86_64 to x86 to handle the scenario where
the user specifies ARCH=x86_64 on the command line.

Fixes: 9af04539d474 ("KVM: selftests: Override ARCH for x86_64 instead of using ARCH_DIR")
Cc: stable@vger.kernel.org
Reported-by: David Matlack 
Closes: https://lore.kernel.org/all/20250724213130.3374922-1-dmatlack@google.com
Link: https://lore.kernel.org/r/20251007223057.368082-1-seanjc@google.com
Signed-off-by: Sean Christopherson 
---
 tools/testing/selftests/kvm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index d9fffe06d3ea..f2b223072b62 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -6,7 +6,7 @@ ARCH ?= $(SUBARCH)
 ifeq ($(ARCH),$(filter $(ARCH),arm64 s390 riscv x86 x86_64 loongarch))
 # Top-level selftests allows ARCH=x86_64 :-(
 ifeq ($(ARCH),x86_64)
-	ARCH := x86
+	override ARCH := x86
 endif
 include Makefile.kvm
 else

From 9e4ce7a89e0b5f1d400f05533f38fe2590166ea6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson 
Date: Tue, 7 Oct 2025 15:45:15 -0700
Subject: [PATCH 054/260] KVM: selftests: Use "gpa" and "gva" for local variable names in pre-fault test

Rename guest_test_{phys,virt}_mem to g{p,v}a in the pre-fault memory test
to shorten line lengths and to use standard terminology. Opportunistically
use "base_gva" in the guest code instead of "base_gpa" to match the host
side code, which now passes in "gva" (and because referencing the virtual
address avoids having to know that the data is identity mapped).

No functional change intended.
Cc: Yan Zhao Link: https://lore.kernel.org/r/20251007224515.374516-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/pre_fault_memory_test.c | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c index f04768c1d2e4..93e603d91311 100644 --- a/tools/testing/selftests/kvm/pre_fault_memory_test.c +++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c @@ -17,13 +17,13 @@ #define TEST_NPAGES (TEST_SIZE / PAGE_SIZE) #define TEST_SLOT 10 -static void guest_code(uint64_t base_gpa) +static void guest_code(uint64_t base_gva) { volatile uint64_t val __used; int i; for (i = 0; i < TEST_NPAGES; i++) { - uint64_t *src = (uint64_t *)(base_gpa + i * PAGE_SIZE); + uint64_t *src = (uint64_t *)(base_gva + i * PAGE_SIZE); val = *src; } @@ -161,6 +161,7 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset, static void __test_pre_fault_memory(unsigned long vm_type, bool private) { + uint64_t gpa, gva, alignment, guest_page_size; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, .type = vm_type, @@ -170,35 +171,30 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private) struct kvm_vm *vm; struct ucall uc; - uint64_t guest_test_phys_mem; - uint64_t guest_test_virt_mem; - uint64_t alignment, guest_page_size; - vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code); alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; - guest_test_phys_mem = (vm->max_gfn - TEST_NPAGES) * guest_page_size; + gpa = (vm->max_gfn - TEST_NPAGES) * guest_page_size; #ifdef __s390x__ alignment = max(0x100000UL, guest_page_size); #else alignment = SZ_2M; #endif - guest_test_phys_mem = align_down(guest_test_phys_mem, alignment); - guest_test_virt_mem = guest_test_phys_mem & ((1ULL << (vm->va_bits - 1)) - 1); + gpa = align_down(gpa, alignment); + gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1); - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, TEST_SLOT, TEST_NPAGES, - private ? KVM_MEM_GUEST_MEMFD : 0); - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, TEST_NPAGES); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, TEST_SLOT, + TEST_NPAGES, private ? KVM_MEM_GUEST_MEMFD : 0); + virt_map(vm, gva, gpa, TEST_NPAGES); if (private) - vm_mem_set_private(vm, guest_test_phys_mem, TEST_SIZE); + vm_mem_set_private(vm, gpa, TEST_SIZE); - pre_fault_memory(vcpu, guest_test_phys_mem, 0, SZ_2M, 0, private); - pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); - pre_fault_memory(vcpu, guest_test_phys_mem, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); + pre_fault_memory(vcpu, gpa, 0, SZ_2M, 0, private); + pre_fault_memory(vcpu, gpa, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); + pre_fault_memory(vcpu, gpa, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); - vcpu_args_set(vcpu, 1, guest_test_virt_mem); + vcpu_args_set(vcpu, 1, gva); vcpu_run(vcpu); run = vcpu->run; From 83e0e12219a402bf7b8fdef067e51f945a92fd26 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Oct 2025 15:36:25 -0700 Subject: [PATCH 055/260] KVM: selftests: Rename "guest_paddr" variables to "gpa" Rename "guest_paddr" variables in vm_userspace_mem_region_add() and vm_mem_add() to KVM's de facto standard "gpa", both for consistency and to shorten line lengths. Opportunistically fix the indentation of the vm_userspace_mem_region_add() declaration. 
Link: https://patch.msgid.link/20251007223625.369939-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/kvm_util.h | 10 ++-- tools/testing/selftests/kvm/lib/kvm_util.c | 46 +++++++++---------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index af52cd938b50..c2481be61434 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -673,12 +673,12 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag uint32_t guest_memfd, uint64_t guest_memfd_offset); void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags); + enum vm_mem_backing_src_type src_type, + uint64_t gpa, uint32_t slot, uint64_t npages, + uint32_t flags); void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); + uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags, + int guest_memfd_fd, uint64_t guest_memfd_offset); #ifndef vm_arch_has_protected_memory static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 5744643d9ec3..a334f6ed50d6 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -943,8 +943,8 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags /* FIXME: This thing needs to be ripped apart and rewritten. */ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) + uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags, + int guest_memfd, uint64_t guest_memfd_offset) { int ret; struct userspace_mem_region *region; @@ -958,30 +958,29 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, "Number of guest pages is not compatible with the host. " "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); - TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " + TEST_ASSERT((gpa % vm->page_size) == 0, "Guest physical " "address not on a page boundary.\n" - " guest_paddr: 0x%lx vm->page_size: 0x%x", - guest_paddr, vm->page_size); - TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1) + " gpa: 0x%lx vm->page_size: 0x%x", + gpa, vm->page_size); + TEST_ASSERT((((gpa >> vm->page_shift) + npages) - 1) <= vm->max_gfn, "Physical range beyond maximum " "supported physical address,\n" - " guest_paddr: 0x%lx npages: 0x%lx\n" + " gpa: 0x%lx npages: 0x%lx\n" " vm->max_gfn: 0x%lx vm->page_size: 0x%x", - guest_paddr, npages, vm->max_gfn, vm->page_size); + gpa, npages, vm->max_gfn, vm->page_size); /* * Confirm a mem region with an overlapping address doesn't * already exist. 
*/ region = (struct userspace_mem_region *) userspace_mem_region_find( - vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); + vm, gpa, (gpa + npages * vm->page_size) - 1); if (region != NULL) TEST_FAIL("overlapping userspace_mem_region already " "exists\n" - " requested guest_paddr: 0x%lx npages: 0x%lx " - "page_size: 0x%x\n" - " existing guest_paddr: 0x%lx size: 0x%lx", - guest_paddr, npages, vm->page_size, + " requested gpa: 0x%lx npages: 0x%lx page_size: 0x%x\n" + " existing gpa: 0x%lx size: 0x%lx", + gpa, npages, vm->page_size, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); @@ -995,8 +994,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" " existing slot: %u paddr: 0x%lx size: 0x%lx", - slot, guest_paddr, npages, - region->region.slot, + slot, gpa, npages, region->region.slot, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); } @@ -1022,7 +1020,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, if (src_type == VM_MEM_SRC_ANONYMOUS_THP) alignment = max(backing_src_pagesz, alignment); - TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); + TEST_ASSERT_EQ(gpa, align_up(gpa, backing_src_pagesz)); /* Add enough memory to align up if necessary */ if (alignment > 1) @@ -1082,20 +1080,18 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, region->unused_phy_pages = sparsebit_alloc(); if (vm_arch_has_protected_memory(vm)) region->protected_phy_pages = sparsebit_alloc(); - sparsebit_set_num(region->unused_phy_pages, - guest_paddr >> vm->page_shift, npages); + sparsebit_set_num(region->unused_phy_pages, gpa >> vm->page_shift, npages); region->region.slot = slot; region->region.flags = flags; - region->region.guest_phys_addr = guest_paddr; + region->region.guest_phys_addr = gpa; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d", - ret, errno, slot, flags, - guest_paddr, (uint64_t) region->region.memory_size, + " guest_phys_addr: 0x%lx size: 0x%llx guest_memfd: %d", + ret, errno, slot, flags, gpa, region->region.memory_size, region->region.guest_memfd); /* Add to quick lookup data structures */ @@ -1117,10 +1113,10 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, - uint64_t npages, uint32_t flags) + uint64_t gpa, uint32_t slot, uint64_t npages, + uint32_t flags) { - vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); + vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0); } /* From 65a70164aba7c5d3b37a2d1e04a8d19c9d980994 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 11:50:03 -0700 Subject: [PATCH 056/260] KVM: x86: Add a helper to dedup reporting of unhandled VM-Exits Add and use a helper, kvm_prepare_unexpected_reason_exit(), to dedup the code that fills the exit reason and CPU when KVM encounters a VM-Exit that KVM doesn't know how to handle. 
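For illustration, a userspace VMM consuming this exit could decode the
fields filled in by the new helper along these lines (a minimal sketch
against the <linux/kvm.h> UAPI; the reporting itself is hypothetical,
only the field layout comes from KVM):

	/* "run" is the vCPU's mmap'd struct kvm_run. */
	if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
	    run->internal.suberror == KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON) {
		/* data[0] = raw exit reason, data[1] = last VM-Entry CPU */
		fprintf(stderr, "unexpected exit reason 0x%llx on CPU %llu\n",
			(unsigned long long)run->internal.data[0],
			(unsigned long long)run->internal.data[1]);
	}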
Reviewed-by: yaoyuan@linux.alibaba.com Reviewed-by: Yao Yuan Reviewed-by: Yosry Ahmed Reviewed-by: Pankaj Gupta Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Acked-by: Kai Huang Link: https://patch.msgid.link/20251030185004.3372256-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 7 +------ arch/x86/kvm/vmx/tdx.c | 6 +----- arch/x86/kvm/vmx/vmx.c | 9 +-------- arch/x86/kvm/x86.c | 12 ++++++++++++ 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 48598d017d6f..4fbe4b7ce1da 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2167,6 +2167,7 @@ void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); +void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 153c12dbf3eb..59d896322855 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3446,13 +3446,8 @@ static bool svm_check_exit_valid(u64 exit_code) static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) { - vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + kvm_prepare_unexpected_reason_exit(vcpu, exit_code); return 0; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 0a49c863c811..67c190ce8104 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2145,11 +2145,7 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) } unhandled_exit: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = vp_enter_ret; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret); return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f87c216d976d..d98107a7bdaa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6623,15 +6623,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return kvm_vmx_exit_handlers[exit_handler_index](vcpu); unexpected_vmexit: - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason.full); dump_vmcs(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_reason.full; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4b5d2d09634..c826cd05228a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9110,6 +9110,18 @@ void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit); +void 
kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason) +{ + vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason); + + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit); + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { struct kvm *kvm = vcpu->kvm; From 6422060aa9c7bb2039b23948db5d4e8194036657 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 22 Oct 2025 00:43:45 +1300 Subject: [PATCH 057/260] KVM: x86/mmu: Move the misplaced export of kvm_zap_gfn_range() Currently, the export of kvm_zap_gfn_range() is misplaced, i.e., it's not placed right after the kvm_zap_gfn_range() function body but after kvm_mmu_zap_collapsible_spte(). Move it to the right place. No functional change intended. Signed-off-by: Kai Huang Link: https://patch.msgid.link/20251021114345.159372-1-kai.huang@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 18d69d48bc55..329cf3508f46 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6863,6 +6863,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_unlock(&kvm->mmu_lock); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range); static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -7204,7 +7205,6 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, return need_tlb_flush; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) From 0a0da3f92118950862700497bc7917f0fbf6a6e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:24 -0700 Subject: [PATCH 058/260] KVM: Make support for kvm_arch_vcpu_async_ioctl() mandatory Implement kvm_arch_vcpu_async_ioctl() "natively" in x86 and arm64 instead of relying on an #ifdef'd stub, and drop HAVE_KVM_VCPU_ASYNC_IOCTL in anticipation of using the API on x86. Once x86 uses the API, providing a stub for one architecture and having all other architectures opt-in requires more code than simply implementing the API in the lone holdout. Eliminating the Kconfig will also reduce churn if the API is renamed in the future (spoiler alert). No functional change intended. 
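For reference, the common dispatch this relies on (a simplified sketch of
kvm_vcpu_ioctl() in virt/kvm/kvm_main.c, with unrelated checks elided)
gives arch code first shot without vcpu->mutex held; returning
-ENOIOCTLCMD, as the new x86 and arm64 stubs do, falls through to the
normal, locked handling:

	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
	if (r != -ENOIOCTLCMD)
		return r;

	if (mutex_lock_killable(&vcpu->mutex))
		return -EINTR;

	/* ... remaining ioctls are handled with vcpu->mutex held ... */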
Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 6 ++++++ arch/loongarch/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/riscv/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/kvm/x86.c | 6 ++++++ include/linux/kvm_host.h | 10 ---------- virt/kvm/Kconfig | 3 --- 9 files changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..ef5bf57f79b7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,6 +1835,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index ae64bbdf83a7..ed4f724db774 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -25,7 +25,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_MSI select HAVE_KVM_READONLY_MEM - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index ab57221fa4dd..cc13cc35f208 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select EXPORT_UASM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select KVM_GENERIC_MMU_NOTIFIER select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 2f2702c867f7..c9a2d50ff1b0 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select KVM_COMMON - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_VFIO select HAVE_KVM_IRQ_BYPASS diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index c50328212917..77379f77840a 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI - select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_READONLY_MEM select HAVE_KVM_DIRTY_RING_ACQ_REL select KVM_COMMON diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..96d16028e8b7 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select KVM_COMMON diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4b5d2d09634..ca5ba2caf314 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7240,6 +7240,12 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..7186b2ae4b57 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2437,18 +2437,8 @@ static inline bool 
kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -#else -static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, - unsigned long arg) -{ - return -ENOIOCTLCMD; -} -#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 5f0015c5dd95..267c7369c765 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -78,9 +78,6 @@ config HAVE_KVM_IRQ_BYPASS tristate select IRQ_BYPASS_MANAGER -config HAVE_KVM_VCPU_ASYNC_IOCTL - bool - config HAVE_KVM_VCPU_RUN_PID_CHANGE bool From 50efc2340a598da4bafa40bc01e18f8cf73a4ae3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:25 -0700 Subject: [PATCH 059/260] KVM: Rename kvm_arch_vcpu_async_ioctl() to kvm_arch_vcpu_unlocked_ioctl() Rename the "async" ioctl API to "unlocked" so that upcoming usage in x86's TDX code doesn't result in a massive misnomer. To avoid having to retry SEAMCALLs, TDX needs to acquire kvm->lock *and* all vcpu->mutex locks, and acquiring all of those locks after/inside the current vCPU's mutex is a non-starter. However, TDX also needs to acquire the vCPU's mutex and load the vCPU, i.e. the handling is very much not async to the vCPU. No functional change intended. Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 4 ++-- arch/loongarch/kvm/vcpu.c | 4 ++-- arch/mips/kvm/mips.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 4 ++-- arch/riscv/kvm/vcpu.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 6 +++--- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ef5bf57f79b7..cf23f6b07ec7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,8 +1835,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { return -ENOIOCTLCMD; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 30e3b089a596..9a5844e85fd3 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1471,8 +1471,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { void __user *argp = (void __user *)arg; struct kvm_vcpu *vcpu = filp->private_data; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a75587018f44..b0fb92fda4d4 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2ba057171ebe..9a89a6d98f97 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index bccb919ca615..a4bd6077eecc 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..8c4caa5f2fcd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5730,8 +5730,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca5ba2caf314..b85cb213a336 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7240,8 +7240,8 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { return -ENOIOCTLCMD; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7186b2ae4b57..d93f75b05ae2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -2437,8 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..b7db1d5f71a8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4434,10 +4434,10 @@ static long kvm_vcpu_ioctl(struct file *filp, return r; /* - * Some architectures have vcpu ioctls that are asynchronous to vcpu - * execution; mutex_lock() would break them. + * Let arch code handle select vCPU ioctls without holding vcpu->mutex, + * e.g. to support ioctls that can run asynchronous to vCPU execution. 
 */
-	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
+	r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg);
	if (r != -ENOIOCTLCMD)
		return r;

From 5294a4b93e07ab74ef334434b927a5a33aa0ecfe Mon Sep 17 00:00:00 2001
From: Sean Christopherson 
Date: Thu, 30 Oct 2025 13:09:26 -0700
Subject: [PATCH 060/260] KVM: TDX: Drop PROVE_MMU=y sanity check on to-be-populated mappings

Drop TDX's sanity check that a mirror EPT mapping isn't zapped between
creating said mapping and doing TDH.MEM.PAGE.ADD, as the check is
simultaneously superfluous and incomplete.

Per commit 2608f1057601 ("KVM: x86/tdp_mmu: Add a helper function to walk
down the TDP MMU"), the justification for introducing
kvm_tdp_mmu_gpa_is_mapped() was to check that the target gfn was
pre-populated, with a link that points to this snippet:

 : > One small question:
 : >
 : > What if the memory region passed to KVM_TDX_INIT_MEM_REGION hasn't been pre-
 : > populated?  If we want to make KVM_TDX_INIT_MEM_REGION work with these regions,
 : > then we still need to do the real map.  Or we can make KVM_TDX_INIT_MEM_REGION
 : > return error when it finds the region hasn't been pre-populated?
 :
 : Return an error.  I don't love the idea of bleeding so many TDX details into
 : userspace, but I'm pretty sure that ship sailed a long, long time ago.

But that justification makes little sense for the final code, as the
check on nr_premapped after TDH.MEM.PAGE.ADD will detect and return an
error if KVM attempted to zap a S-EPT entry (tdx_sept_zap_private_spte()
will fail on TDH.MEM.RANGE.BLOCK due to the lack of a valid S-EPT entry).
And as evidenced by the "is mapped?" code being guarded with
CONFIG_KVM_PROVE_MMU=y, KVM is NOT relying on the check for general
correctness.

The sanity check is also incomplete in the sense that mmu_lock is dropped
between the check and TDH.MEM.PAGE.ADD, i.e. will only detect KVM bugs
that zap SPTEs in a very specific window (note, this also applies to the
check on nr_premapped).

Removing the sanity check will allow removing kvm_tdp_mmu_gpa_is_mapped(),
which has no business being exposed to vendor code, and more importantly
will pave the way for eliminating the "pre-map" approach entirely in favor
of doing TDH.MEM.PAGE.ADD under mmu_lock.

Reviewed-by: Ira Weiny 
Reviewed-by: Kai Huang 
Reviewed-by: Binbin Wu 
Reviewed-by: Yan Zhao 
Tested-by: Yan Zhao 
Tested-by: Kai Huang 
Link: https://patch.msgid.link/20251030200951.3402865-4-seanjc@google.com
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/tdx.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0a49c863c811..e59705da2779 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3191,20 +3191,6 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
	if (ret < 0)
		goto out;
 
-	/*
-	 * The private mem cannot be zapped after kvm_tdp_map_page()
-	 * because all paths are covered by slots_lock and the
-	 * filemap invalidate lock. Check that they are indeed enough.
-	 */
-	if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
-		scoped_guard(read_lock, &kvm->mmu_lock) {
-			if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
-				ret = -EIO;
-				goto out;
-			}
-		}
-	}
-
	ret = 0;
	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), src_page,
			       &entry, &level_state);

From 3ab3283dbb2c1246cbabdba8f3c52e72a3d88121 Mon Sep 17 00:00:00 2001
From: Sean Christopherson 
Date: Thu, 30 Oct 2025 13:09:27 -0700
Subject: [PATCH 061/260] KVM: x86/mmu: Add dedicated API to map guest_memfd pfn into TDP MMU

Add and use a new API for mapping a private pfn from guest_memfd into the
TDP MMU from TDX's post-populate hook instead of partially open-coding the
functionality into the TDX code.  Sharing code with the pre-fault path
sounded good on paper, but it's fatally flawed as simulating a fault loses
the pfn, and calling back into gmem to re-retrieve the pfn creates locking
problems, e.g. kvm_gmem_populate() already holds the gmem invalidation
lock.

Providing a dedicated API will also remove several MMU exports that
ideally would not be exposed outside of the MMU, let alone to vendor code.
On that topic, opportunistically drop the kvm_mmu_load() export.  Leave
kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that added
kvm_tdp_mmu_gpa_is_mapped() will be removed in the near future.

Gate the API on CONFIG_KVM_GUEST_MEMFD=y as private memory _must_ be
backed by guest_memfd.  Add a lockdep-only assert that the incoming pfn is
indeed backed by guest_memfd, and that the gmem instance's invalidate lock
is held (which, combined with slots_lock being held, obviates the need to
check for a stale "fault").

Cc: Michael Roth 
Cc: Yan Zhao 
Cc: Ira Weiny 
Cc: Vishal Annapurve 
Cc: Rick Edgecombe 
Reviewed-by: Rick Edgecombe 
Reviewed-by: Kai Huang 
Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
Reviewed-by: Binbin Wu 
Reviewed-by: Yan Zhao 
Tested-by: Yan Zhao 
Tested-by: Kai Huang 
Link: https://patch.msgid.link/20251030200951.3402865-5-seanjc@google.com
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.h     |  1 +
 arch/x86/kvm/mmu/mmu.c | 81 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.c | 10 ++----
 3 files changed, 84 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f63074048ec6..2f108e381959 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -259,6 +259,7 @@ extern bool tdp_mmu_enabled;
 
 bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
 int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
 
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d66cf76d5..d24fa59f872b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5014,6 +5014,86 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
	return min(range->size, end - range->gpa);
 }
 
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_PROVE_LOCKING
+	if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
+	    WARN_ON_ONCE(!slot->gmem.file) ||
+	    WARN_ON_ONCE(!file_count(slot->gmem.file)))
+		return;
+
+	lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
+#endif
+}
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+	struct kvm_page_fault fault = {
+		.addr = gfn_to_gpa(gfn),
+		.error_code =
PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS, + .prefetch = true, + .is_tdp = true, + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm), + + .max_level = PG_LEVEL_4K, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + .is_private = true, + + .gfn = gfn, + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn), + .pfn = pfn, + .map_writable = true, + }; + struct kvm *kvm = vcpu->kvm; + int r; + + lockdep_assert_held(&kvm->slots_lock); + + /* + * Mapping a pre-determined private pfn is intended only for use when + * populating a guest_memfd instance. Assert that the slot is backed + * by guest_memfd and that the gmem instance's invalidate_lock is held. + */ + kvm_assert_gmem_invalidate_lock_held(fault.slot); + + if (KVM_BUG_ON(!tdp_mmu_enabled, kvm)) + return -EIO; + + if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn)) + return -EPERM; + + r = kvm_mmu_reload(vcpu); + if (r) + return r; + + r = mmu_topup_memory_caches(vcpu, false); + if (r) + return r; + + do { + if (signal_pending(current)) + return -EINTR; + + if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + + cond_resched(); + + guard(read_lock)(&kvm->mmu_lock); + + r = kvm_tdp_mmu_map(vcpu, &fault); + } while (r == RET_PF_RETRY); + + if (r != RET_PF_FIXED) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn); +#endif + static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; @@ -5997,7 +6077,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index e59705da2779..4e712e11e00c 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3167,15 +3167,12 @@ struct tdx_gmem_post_populate_arg { static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, void __user *src, int order, void *_arg) { - u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct tdx_gmem_post_populate_arg *arg = _arg; - struct kvm_vcpu *vcpu = arg->vcpu; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err, entry, level_state; gpa_t gpa = gfn_to_gpa(gfn); - u8 level = PG_LEVEL_4K; struct page *src_page; int ret, i; - u64 err, entry, level_state; /* * Get the source page if it has been faulted in. Return failure if the @@ -3187,7 +3184,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, if (ret != 1) return -ENOMEM; - ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); + ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); if (ret < 0) goto out; @@ -3250,7 +3247,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) return -EINVAL; - kvm_mmu_reload(vcpu); ret = 0; while (region.nr_pages) { if (signal_pending(current)) { From c1f173fb3389d39f78fc57f14b767e7127bc3908 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:28 -0700 Subject: [PATCH 062/260] KVM: x86/mmu: WARN if KVM attempts to map into an invalid TDP MMU root When mapping into the TDP MMU, WARN (if KVM_PROVE_MMU=y) if the root is invalid, e.g. if KVM is attempting to insert a mapping without checking if the information and MMU context is fresh. 
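For reference, KVM_MMU_WARN_ON() only emits a runtime warning when
KVM_PROVE_MMU=y; otherwise it compiles down to a type-check-only no-op,
roughly (paraphrasing arch/x86/kvm/mmu/mmu_internal.h):

	#ifdef CONFIG_KVM_PROVE_MMU
	#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
	#else
	#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
	#endif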
Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c5734ca5c17d..440fd8f80397 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1273,6 +1273,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm_mmu_page *sp; int ret = RET_PF_RETRY; + KVM_MMU_WARN_ON(!root || root->role.invalid); + kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); From fe7413e39810235fc812d5d85053cb1957388090 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:29 -0700 Subject: [PATCH 063/260] Revert "KVM: x86/tdp_mmu: Add a helper function to walk down the TDP MMU" Remove the helper and exports that were added to allow TDX code to reuse kvm_tdp_map_page() for its gmem post-populate flow now that a dedicated TDP MMU API is provided to install a mapping given a gfn+pfn pair. This reverts commit 2608f105760115e94a03efd9f12f8fbfd1f9af4b. Reviewed-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu.h | 2 -- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/tdp_mmu.c | 37 +++++-------------------------------- 3 files changed, 7 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2f108e381959..9e5045a60d8b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -257,8 +257,6 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif -bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); -int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn); static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d24fa59f872b..559c80c841b9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4924,7 +4924,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) +static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, + u8 *level) { int r; @@ -4966,7 +4967,6 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level return -EIO; } } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 440fd8f80397..e735d2f8367b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1941,13 +1941,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
*/ -static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - struct kvm_mmu_page *root) +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); struct tdp_iter iter; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; + *root_level = vcpu->arch.mmu->root_role.level; + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; @@ -1956,36 +1959,6 @@ static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) -{ - struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); - *root_level = vcpu->arch.mmu->root_role.level; - - return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); -} - -bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) -{ - struct kvm *kvm = vcpu->kvm; - bool is_direct = kvm_is_addr_direct(kvm, gpa); - hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : - vcpu->arch.mmu->mirror_root_hpa; - u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; - int leaf; - - lockdep_assert_held(&kvm->mmu_lock); - rcu_read_lock(); - leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); - rcu_read_unlock(); - if (leaf < 0) - return false; - - spte = sptes[leaf]; - return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped); - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no From 6de2fb089bb27c2c03ca838f2478a659c826d686 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:30 -0700 Subject: [PATCH 064/260] KVM: x86/mmu: Rename kvm_tdp_map_page() to kvm_tdp_page_prefault() Rename kvm_tdp_map_page() to kvm_tdp_page_prefault() now that it's used only by kvm_arch_vcpu_pre_fault_memory(). No functional change intended. Reviewed-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 559c80c841b9..f5fb12ea04c1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4924,8 +4924,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, - u8 *level) +static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa, + u64 error_code, u8 *level) { int r; @@ -5002,7 +5002,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. 
 */
-	r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
+	r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level);
	if (r < 0)
		return r;

From ce7b5695397b54677ed827e0d52058daaecad2a1 Mon Sep 17 00:00:00 2001
From: Yan Zhao 
Date: Thu, 30 Oct 2025 13:09:31 -0700
Subject: [PATCH 065/260] KVM: TDX: Drop superfluous page pinning in S-EPT management

Don't explicitly pin pages when mapping pages into the S-EPT; guest_memfd
doesn't support page migration in any capacity, i.e. there are no migrate
callbacks because guest_memfd pages *can't* be migrated.  See the WARN in
kvm_gmem_migrate_folio().

Eliminating TDX's explicit pinning will also enable guest_memfd to support
in-place conversion between shared and private memory[1][2].  Because KVM
cannot distinguish between speculative/transient refcounts and the
intentional refcount for TDX on private pages[3], failing to release a
private page's refcount in TDX could cause guest_memfd to wait
indefinitely for the refcount to drop when splitting a folio.

Under normal conditions, not holding an extra page refcount in TDX is safe
because guest_memfd ensures pages are retained until its invalidation
notification to KVM MMU is completed.  However, if there are bugs in KVM
or the TDX module, not holding an extra refcount when a page is mapped in
the S-EPT could result in a page being released from guest_memfd while
still mapped in the S-EPT.  But, doing work to make a fatal error slightly
less fatal is a net negative when that extra work adds complexity and
confusion.

Several approaches were considered to address the refcount issue,
including
  - Attempting to modify the KVM unmap operation to return a failure,
    which was deemed too complex and potentially incorrect[4].
  - Increasing the folio reference count only upon S-EPT zapping
    failure[5].
  - Using page flags or page_ext to indicate a page is still used by
    TDX[6], which does not work for HVO (HugeTLB Vmemmap Optimization).
  - Setting the HWPOISON bit or leveraging folio_set_hugetlb_hwpoison()[7].

Due to the complexity or inappropriateness of these approaches, and the
fact that S-EPT zapping failure is currently only possible when there are
bugs in the KVM or TDX module, which is very rare in a production kernel,
a straightforward approach of simply not holding the page reference count
in TDX was chosen[8].

When S-EPT zapping errors occur, KVM_BUG_ON() is invoked to kick off all
vCPUs and mark the VM as dead.  Although there is a potential window in
which a private page mapped in the S-EPT could be reallocated and used
outside the VM, the loud warning from KVM_BUG_ON() should provide
sufficient debug information.  To be robust against bugs, the user can
enable panic_on_warn as normal.
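For reference, the retention guarantee comes from guest_memfd's
invalidation flow, which roughly looks like the below (a simplified
sketch of the punch-hole path in virt/kvm/guest_memfd.c; arguments
elided): pages are freed only after KVM has zapped the corresponding
mappings.

	filemap_invalidate_lock(inode->i_mapping);

	kvm_gmem_invalidate_begin(...);		/* zap S-EPT/mirror SPTEs */
	truncate_inode_pages_range(...);	/* pages may be freed only now */
	kvm_gmem_invalidate_end(...);

	filemap_invalidate_unlock(inode->i_mapping);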
Link: https://lore.kernel.org/all/cover.1747264138.git.ackerleytng@google.com [1] Link: https://youtu.be/UnBKahkAon4 [2] Link: https://lore.kernel.org/all/CAGtprH_ypohFy9TOJ8Emm_roT4XbQUtLKZNFcM6Fr+fhTFkE0Q@mail.gmail.com [3] Link: https://lore.kernel.org/all/aEEEJbTzlncbRaRA@yzhao56-desk.sh.intel.com [4] Link: https://lore.kernel.org/all/aE%2Fq9VKkmaCcuwpU@yzhao56-desk.sh.intel.com [5] Link: https://lore.kernel.org/all/aFkeBtuNBN1RrDAJ@yzhao56-desk.sh.intel.com [6] Link: https://lore.kernel.org/all/diqzy0tikran.fsf@ackerleytng-ctop.c.googlers.com [7] Link: https://lore.kernel.org/all/53ea5239f8ef9d8df9af593647243c10435fd219.camel@intel.com [8] Suggested-by: Vishal Annapurve Suggested-by: Ackerley Tng Suggested-by: Rick Edgecombe Signed-off-by: Yan Zhao Reviewed-by: Ira Weiny Reviewed-by: Kai Huang [sean: extract out of hugepage series, massage changelog accordingly] Reviewed-by: Binbin Wu Reviewed-by: Rick Edgecombe Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 4e712e11e00c..0935c3aaaea0 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1583,29 +1583,22 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); } -static void tdx_unpin(struct kvm *kvm, struct page *page) -{ - put_page(page); -} - static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) + enum pg_level level, kvm_pfn_t pfn) { int tdx_level = pg_level_to_tdx_sept_level(level); struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct page *page = pfn_to_page(pfn); gpa_t gpa = gfn_to_gpa(gfn); u64 entry, level_state; u64 err; err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); - if (unlikely(tdx_operand_busy(err))) { - tdx_unpin(kvm, page); + if (unlikely(tdx_operand_busy(err))) return -EBUSY; - } if (KVM_BUG_ON(err, kvm)) { pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); - tdx_unpin(kvm, page); return -EIO; } @@ -1639,29 +1632,18 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level, kvm_pfn_t pfn) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - struct page *page = pfn_to_page(pfn); /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) return -EINVAL; - /* - * Because guest_memfd doesn't support page migration with - * a_ops->migrate_folio (yet), no callback is triggered for KVM on page - * migration. Until guest_memfd supports page migration, prevent page - * migration. - * TODO: Once guest_memfd introduces callback on page migration, - * implement it and remove get_page/put_page(). - */ - get_page(page); - /* * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching * barrier in tdx_td_finalize(). 
*/ smp_rmb(); if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) - return tdx_mem_page_aug(kvm, gfn, level, page); + return tdx_mem_page_aug(kvm, gfn, level, pfn); return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); } @@ -1712,7 +1694,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, return -EIO; } tdx_quirk_reset_page(page); - tdx_unpin(kvm, page); return 0; } @@ -1792,7 +1773,6 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { atomic64_dec(&kvm_tdx->nr_premapped); - tdx_unpin(kvm, page); return 0; } From e6348c90dda9cb7beeda1ce83eed6177e46830d2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:32 -0700 Subject: [PATCH 066/260] KVM: TDX: Return -EIO, not -EINVAL, on a KVM_BUG_ON() condition Return -EIO when a KVM_BUG_ON() is tripped, as KVM's ABI is to return -EIO when a VM has been killed due to a KVM bug, not -EINVAL. Note, many (all?) of the affected paths never propagate the error code to userspace, i.e. this is about internal consistency more than anything else. Reviewed-by: Rick Edgecombe Reviewed-by: Ira Weiny Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 0935c3aaaea0..052a086335f9 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1621,7 +1621,7 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) - return -EINVAL; + return -EIO; /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ atomic64_inc(&kvm_tdx->nr_premapped); @@ -1635,7 +1635,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return -EINVAL; + return -EIO; /* * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching @@ -1658,10 +1658,10 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return -EINVAL; + return -EIO; if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) - return -EINVAL; + return -EIO; /* * When zapping private page, write lock is held. So no race condition @@ -1846,7 +1846,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, * and slot move/deletion. */ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) - return -EINVAL; + return -EIO; /* * The HKID assigned to this TD was already freed and cache was @@ -1867,7 +1867,7 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, * there can't be anything populated in the private EPT. 
*/ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) - return -EINVAL; + return -EIO; ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); if (ret <= 0) From b836503300dcc0d60f7ffc359f0768cd545d8ef9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:33 -0700 Subject: [PATCH 067/260] KVM: TDX: Fold tdx_sept_drop_private_spte() into tdx_sept_remove_private_spte() Fold tdx_sept_drop_private_spte() into tdx_sept_remove_private_spte() as a step towards having "remove" be the one and only function that deals with removing/zapping/dropping a SPTE, e.g. to avoid having to differentiate between "zap", "drop", and "remove". Eliminating the "drop" helper also gets rid of what is effectively dead code due to redundant checks, e.g. on an HKID being assigned. No functional change intended. Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 90 +++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 052a086335f9..667cd089eec8 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1648,55 +1648,6 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); } -static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) -{ - int tdx_level = pg_level_to_tdx_sept_level(level); - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - gpa_t gpa = gfn_to_gpa(gfn); - u64 err, entry, level_state; - - /* TODO: handle large pages. */ - if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return -EIO; - - if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) - return -EIO; - - /* - * When zapping private page, write lock is held. So no race condition - * with other vcpu sept operation. - * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. - */ - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, - &level_state); - - if (unlikely(tdx_operand_busy(err))) { - /* - * The second retry is expected to succeed after kicking off all - * other vCPUs and prevent them from invoking TDH.VP.ENTER. - */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, - &level_state); - tdx_no_vcpus_enter_stop(kvm); - } - - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); - return -EIO; - } - - err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); - - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); - return -EIO; - } - tdx_quirk_reset_page(page); - return 0; -} - static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level level, void *private_spt) { @@ -1858,7 +1809,11 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level, kvm_pfn_t pfn) { + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct page *page = pfn_to_page(pfn); + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; int ret; /* @@ -1869,6 +1824,10 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) return -EIO; + /* TODO: handle large pages. 
+	 */
+	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+		return -EIO;
+
 	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
 	if (ret <= 0)
 		return ret;
@@ -1879,7 +1838,38 @@
 	 */
 	tdx_track(kvm);
 
-	return tdx_sept_drop_private_spte(kvm, gfn, level, page);
+	/*
+	 * When zapping private page, write lock is held. So no race condition
+	 * with other vcpu sept operation.
+	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+	 */
+	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
+				  &level_state);
+
+	if (unlikely(tdx_operand_busy(err))) {
+		/*
+		 * The second retry is expected to succeed after kicking off all
+		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
+		 */
+		tdx_no_vcpus_enter_start(kvm);
+		err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
+					  &level_state);
+		tdx_no_vcpus_enter_stop(kvm);
+	}
+
+	if (KVM_BUG_ON(err, kvm)) {
+		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
+		return -EIO;
+	}
+
+	err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+	if (KVM_BUG_ON(err, kvm)) {
+		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+		return -EIO;
+	}
+
+	tdx_quirk_reset_page(page);
+	return 0;
 }
 
 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,

From 7139c860650535cf517a9344212ce56a21ba37f5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 30 Oct 2025 13:09:34 -0700
Subject: [PATCH 068/260] KVM: x86/mmu: Drop the return code from
 kvm_x86_ops.remove_external_spte()

Drop the return code from kvm_x86_ops.remove_external_spte(), a.k.a.
tdx_sept_remove_private_spte(), as KVM simply does a KVM_BUG_ON() on
failure, and that KVM_BUG_ON() is redundant since all error paths in TDX
also do a KVM_BUG_ON().

Opportunistically pass the spte instead of the pfn, as the API is clearly
about removing an spte.

Suggested-by: Rick Edgecombe
Reviewed-by: Binbin Wu
Reviewed-by: Kai Huang
Reviewed-by: Yan Zhao
Tested-by: Yan Zhao
Tested-by: Kai Huang
Link: https://patch.msgid.link/20251030200951.3402865-12-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/include/asm/kvm_host.h |  4 ++--
 arch/x86/kvm/mmu/tdp_mmu.c      |  8 ++------
 arch/x86/kvm/vmx/tdx.c          | 17 ++++++++---------
 3 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f..b5867f8fe6ce 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1855,8 +1855,8 @@ struct kvm_x86_ops {
 				 void *external_spt);
 
 	/* Update external page table from spte getting removed, and flush TLB. */
-	int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
-				    kvm_pfn_t pfn_for_gfn);
+	void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+				     u64 mirror_spte);
 
 	bool (*has_wbinvd_exit)(void);
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index e735d2f8367b..e1a96e9ea1bb 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -362,9 +362,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
 				 int level)
 {
-	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
-	int ret;
-
 	/*
 	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
 	 * PTs are removed in a special order, involving free_external_spt().
@@ -377,9 +374,8 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, /* Zapping leaf spte is allowed only when write lock is held. */ lockdep_assert_held_write(&kvm->mmu_lock); - /* Because write lock is held, operation should success. */ - ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); - KVM_BUG_ON(ret, kvm); + + kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte); } /** diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 667cd089eec8..247c35164565 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1806,12 +1806,12 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, return tdx_reclaim_page(virt_to_page(private_spt)); } -static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) +static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, u64 mirror_spte) { + struct page *page = pfn_to_page(spte_to_pfn(mirror_spte)); int tdx_level = pg_level_to_tdx_sept_level(level); struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - struct page *page = pfn_to_page(pfn); gpa_t gpa = gfn_to_gpa(gfn); u64 err, entry, level_state; int ret; @@ -1822,15 +1822,15 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, * there can't be anything populated in the private EPT. */ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) - return -EIO; + return; /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return -EIO; + return; ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); if (ret <= 0) - return ret; + return; /* * TDX requires TLB tracking before dropping private page. Do @@ -1859,17 +1859,16 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, if (KVM_BUG_ON(err, kvm)) { pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); - return -EIO; + return; } err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); if (KVM_BUG_ON(err, kvm)) { pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); - return -EIO; + return; } tdx_quirk_reset_page(page); - return 0; } void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, From b9d5cf6de0b6bd3bdd8957b5fea5a243e43c928f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:35 -0700 Subject: [PATCH 069/260] KVM: TDX: WARN if mirror SPTE doesn't have full RWX when creating S-EPT mapping Pass in the mirror_spte to kvm_x86_ops.set_external_spte() to provide symmetry with .remove_external_spte(), and assert in TDX that the mirror SPTE is shadow-present with full RWX permissions (the TDX-Module doesn't allow the hypervisor to control protections). Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 3 +-- arch/x86/kvm/vmx/tdx.c | 6 +++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b5867f8fe6ce..87a5f5100b1d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1848,7 +1848,7 @@ struct kvm_x86_ops { void *external_spt); /* Update the external page table from spte getting set. 
*/ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - kvm_pfn_t pfn_for_gfn); + u64 mirror_spte); /* Update external page tables for page table about to be freed. */ int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e1a96e9ea1bb..9c26038f6b77 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -515,7 +515,6 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); bool is_leaf = is_present && is_last_spte(new_spte, level); - kvm_pfn_t new_pfn = spte_to_pfn(new_spte); int ret = 0; KVM_BUG_ON(was_present, kvm); @@ -534,7 +533,7 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp * external page table, or leaf. */ if (is_leaf) { - ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte); } else { void *external_spt = get_external_spt(gfn, new_spte, level); diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 247c35164565..1315fd9fdd6e 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1629,14 +1629,18 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, } static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) + enum pg_level level, u64 mirror_spte) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + kvm_pfn_t pfn = spte_to_pfn(mirror_spte); /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) return -EIO; + WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) || + (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK); + /* * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching * barrier in tdx_td_finalize(). From 24adff39705223f06607926f2717e6f8dec12d28 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:36 -0700 Subject: [PATCH 070/260] KVM: TDX: Avoid a double-KVM_BUG_ON() in tdx_sept_zap_private_spte() Return -EIO immediately from tdx_sept_zap_private_spte() if the number of to-be-added pages underflows, so that the following "KVM_BUG_ON(err, kvm)" isn't also triggered. Isolating the check from the "is premap error" if-statement will also allow adding a lockdep assertion that premap errors are encountered if and only if slots_lock is held. 
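For illustration, the control-flow change can be modeled in a stand-alone
C sketch, with KVM_BUG_ON() and the premap counter stubbed out (the
harness below is hypothetical, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for KVM_BUG_ON(): report the bug, return the condition. */
static bool bug_on(bool cond, const char *msg)
{
	if (cond)
		fprintf(stderr, "BUG: %s\n", msg);
	return cond;
}

static long nr_premapped;

static int zap_private_spte(bool is_premap_err, unsigned long long err)
{
	/*
	 * Isolate the underflow check and return -EIO immediately, so that
	 * the generic bug_on(err) below can't fire a second time for the
	 * same failure (the old code folded the check into the outer
	 * if-condition and fell through when it tripped).
	 */
	if (is_premap_err) {
		if (bug_on(nr_premapped == 0, "premap count underflow"))
			return -5; /* -EIO */
		nr_premapped--;
		return 0;
	}

	return bug_on(err != 0, "unexpected zap error") ? -5 : 1;
}

int main(void)
{
	/* A premap error with a zero count now trips exactly one bug. */
	printf("ret = %d\n", zap_private_spte(true, 0));
	return 0;
}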
Reviewed-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 1315fd9fdd6e..4426146d403e 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1725,8 +1725,10 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); tdx_no_vcpus_enter_stop(kvm); } - if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && - !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { + if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level)) { + if (KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) + return -EIO; + atomic64_dec(&kvm_tdx->nr_premapped); return 0; } From af96d5452e5e88019bed9c081d4be1c489d7c081 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:37 -0700 Subject: [PATCH 071/260] KVM: TDX: Use atomic64_dec_return() instead of a poor equivalent Use atomic64_dec_return() when decrementing the number of "pre-mapped" S-EPT pages to ensure that the count can't go negative without KVM noticing. In theory, checking for '0' and then decrementing in a separate operation could miss a 0=>-1 transition. In practice, such a condition is impossible because nr_premapped is protected by slots_lock, i.e. doesn't actually need to be an atomic (that wart will be addressed shortly). Don't bother trying to keep the count non-negative, as the KVM_BUG_ON() ensures the VM is dead, i.e. there's no point in trying to limp along. 
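The difference between the two shapes is easy to see with plain C11
atomics; a minimal, hypothetical user-space sketch (not KVM code):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long long nr_premapped;

int main(void)
{
	long long new_val;

	/*
	 * Racy shape: the check and the decrement are two separate atomic
	 * operations, so another thread can decrement in between and the
	 * 0 => -1 transition goes unnoticed by both.
	 */
	if (atomic_load(&nr_premapped) != 0)
		atomic_fetch_sub(&nr_premapped, 1);

	/*
	 * Atomic shape: fetch_sub() returns the old value, so the new value
	 * (old - 1) can be checked in the same operation; this is the C11
	 * analogue of "atomic64_dec_return(...) < 0".
	 */
	new_val = atomic_fetch_sub(&nr_premapped, 1) - 1;
	if (new_val < 0)
		fprintf(stderr, "underflow detected: %lld\n", new_val);

	return 0;
}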
Reviewed-by: Rick Edgecombe Reviewed-by: Ira Weiny Reviewed-by: Binbin Wu Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 4426146d403e..8482eadbbb8f 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1726,10 +1726,9 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, tdx_no_vcpus_enter_stop(kvm); } if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level)) { - if (KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) + if (KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm)) return -EIO; - atomic64_dec(&kvm_tdx->nr_premapped); return 0; } @@ -3171,8 +3170,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, goto out; } - if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) - atomic64_dec(&kvm_tdx->nr_premapped); + KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm); if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { From b4b2b6eda5afa92f42fd1cd67f2729acf7e95f5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:38 -0700 Subject: [PATCH 072/260] KVM: TDX: Fold tdx_mem_page_record_premap_cnt() into its sole caller Fold tdx_mem_page_record_premap_cnt() into tdx_sept_set_private_spte() as providing a one-off helper for effectively three lines of code is at best a wash, and splitting the code makes the comment for smp_rmb() _extremely_ confusing as the comment talks about reading kvm->arch.pre_fault_allowed before kvm_tdx->state, but the immediately visible code does the exact opposite. Opportunistically rewrite the comments to more explicitly explain who is checking what, as well as _why_ the ordering matters. No functional change intended. Reviewed-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 49 ++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 8482eadbbb8f..addd3678d03d 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1605,29 +1605,6 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, return 0; } -/* - * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the - * callback tdx_gmem_post_populate() then maps pages into private memory. - * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the - * private EPT structures for the page to have been built before, which is - * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that - * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). - * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there - * are no half-initialized shared EPT pages. - */ -static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) - return -EIO; - - /* nr_premapped will be decreased when tdh_mem_page_add() is called. 
*/ - atomic64_inc(&kvm_tdx->nr_premapped); - return 0; -} - static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level, u64 mirror_spte) { @@ -1642,14 +1619,30 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK); /* - * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching - * barrier in tdx_td_finalize(). + * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory() + * before kvm_tdx->state. Userspace must not be allowed to pre-fault + * arbitrary memory until the initial memory image is finalized. Pairs + * with the smp_wmb() in tdx_td_finalize(). */ smp_rmb(); - if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) - return tdx_mem_page_aug(kvm, gfn, level, pfn); - return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); + /* + * If the TD isn't finalized/runnable, then userspace is initializing + * the VM image via KVM_TDX_INIT_MEM_REGION. Increment the number of + * pages that need to be mapped and initialized via TDH.MEM.PAGE.ADD. + * KVM_TDX_FINALIZE_VM checks the counter to ensure all pre-mapped + * pages have been added to the image, to prevent running the TD with a + * valid mapping in the mirror EPT, but not in the S-EPT. + */ + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) { + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) + return -EIO; + + atomic64_inc(&kvm_tdx->nr_premapped); + return 0; + } + + return tdx_mem_page_aug(kvm, gfn, level, pfn); } static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, From 6b5b71ffabf9b0eb250ae6d4e0ab7fbac622f0f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:39 -0700 Subject: [PATCH 073/260] KVM: TDX: ADD pages to the TD image while populating mirror EPT entries When populating the initial memory image for a TDX guest, ADD pages to the TD as part of establishing the mappings in the mirror EPT, as opposed to creating the mappings and then doing ADD after the fact. Doing ADD in the S-EPT callbacks eliminates the need to track "premapped" pages, as the mirror EPT (M-EPT) and S-EPT are always synchronized, e.g. if ADD fails, KVM reverts to the previous M-EPT entry (guaranteed to be !PRESENT). Eliminating the hole where the M-EPT can have a mapping that doesn't exist in the S-EPT in turn obviates the need to handle errors that are unique to encountering a missing S-EPT entry (see tdx_is_sept_zap_err_due_to_premap()). Keeping the M-EPT and S-EPT synchronized also eliminates the need to check for unconsumed "premap" entries during tdx_td_finalize(), as there simply can't be any such entries. Dropping that check in particular reduces the overall cognitive load, as the management of nr_premapped with respect to removal of S-EPT is _very_ subtle. E.g. successful removal of an S-EPT entry after it completed ADD doesn't adjust nr_premapped, but it's not clear why that's "ok" but having half-baked entries is not (it's not truly "ok" in that removing pages from the image will likely prevent the guest from booting, but from KVM's perspective it's "ok"). Doing ADD in the S-EPT path requires passing an argument via a scratch field, but the current approach of tracking the number of "premapped" pages effectively does the same. And the "premapped" counter is much more dangerous, as it doesn't have a singular lock to protect its usage, since nr_premapped can be modified as soon as mmu_lock is dropped, at least in theory. I.e. nr_premapped is guarded by slots_lock, but only for "happy" paths. 
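The scratch-field pattern itself is simple; a minimal stand-alone sketch
with a pthread mutex standing in for slots_lock (all names below are
invented for illustration, not KVM code):

#include <pthread.h>
#include <stdio.h>

struct vm {
	pthread_mutex_t slots_lock;
	void *page_add_src;	/* scratch: non-NULL only while populating */
};

/* Deep in the mapping path: consume the argument stashed by the caller. */
static int page_add(struct vm *vm)
{
	if (!vm->page_add_src)	/* would be a KVM_BUG_ON() in the kernel */
		return -5;	/* -EIO */
	printf("adding page from source %p\n", vm->page_add_src);
	return 0;
}

static int populate(struct vm *vm, void *src)
{
	int ret;

	pthread_mutex_lock(&vm->slots_lock);
	/*
	 * Set, use, and clear the scratch field under a single lock, so its
	 * lifetime is fully auditable, unlike a counter that can be touched
	 * whenever mmu_lock is dropped.
	 */
	vm->page_add_src = src;
	ret = page_add(vm);	/* in KVM, the S-EPT mapping path */
	vm->page_add_src = NULL;
	pthread_mutex_unlock(&vm->slots_lock);

	return ret;
}

int main(void)
{
	struct vm vm = { .slots_lock = PTHREAD_MUTEX_INITIALIZER };
	char buf[16];

	return populate(&vm, buf);
}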
Note, this approach was used/tried at various points in TDX development, but was ultimately discarded due to a desire to avoid stashing temporary state in kvm_tdx. But as above, KVM ended up with such state anyways, and fully committing to using temporary state provides better access rules (100% guarded by slots_lock), and makes several edge cases flat out impossible. Note #2, continue to extend the measurement outside of mmu_lock, as it's a slow operation (typically 16 SEAMCALLs per page whose data is included in the measurement), and doesn't *need* to be done under mmu_lock, e.g. for consistency purposes. However, MR.EXTEND isn't _that_ slow, e.g. ~1ms latency to measure a full page, so if it needs to be done under mmu_lock in the future, e.g. because KVM gains a flow that can remove S-EPT entries during KVM_TDX_INIT_MEM_REGION, then extending the measurement can also be moved into the S-EPT mapping path (again, only if absolutely necessary). P.S. _If_ MR.EXTEND is moved into the S-EPT path, take care not to return an error up the stack if TDH_MR_EXTEND fails, as removing the M-EPT entry but not the S-EPT entry would result in inconsistent state! Reviewed-by: Rick Edgecombe Reviewed-by: Kai Huang Reviewed-by: Binbin Wu Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 106 ++++++++++++++--------------------------- arch/x86/kvm/vmx/tdx.h | 8 +++- 2 files changed, 43 insertions(+), 71 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index addd3678d03d..14cadd0d71c0 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1583,6 +1583,32 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); } +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level, + kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err, entry, level_state; + gpa_t gpa = gfn_to_gpa(gfn); + + lockdep_assert_held(&kvm->slots_lock); + + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) || + KVM_BUG_ON(!kvm_tdx->page_add_src, kvm)) + return -EIO; + + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), + kvm_tdx->page_add_src, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_ADD, err, entry, level_state); + return -EIO; + } + + return 0; +} + static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, enum pg_level level, kvm_pfn_t pfn) { @@ -1628,19 +1654,10 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, /* * If the TD isn't finalized/runnable, then userspace is initializing - * the VM image via KVM_TDX_INIT_MEM_REGION. Increment the number of - * pages that need to be mapped and initialized via TDH.MEM.PAGE.ADD. - * KVM_TDX_FINALIZE_VM checks the counter to ensure all pre-mapped - * pages have been added to the image, to prevent running the TD with a - * valid mapping in the mirror EPT, but not in the S-EPT. + * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD. 
*/ - if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) { - if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) - return -EIO; - - atomic64_inc(&kvm_tdx->nr_premapped); - return 0; - } + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) + return tdx_mem_page_add(kvm, gfn, level, pfn); return tdx_mem_page_aug(kvm, gfn, level, pfn); } @@ -1666,39 +1683,6 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, return 0; } -/* - * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is - * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called - * successfully. - * - * Since tdh_mem_sept_add() must have been invoked successfully before a - * non-leaf entry present in the mirrored page table, the SEPT ZAP related - * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead - * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the - * SEPT. - * - * Further check if the returned entry from SEPT walking is with RWX permissions - * to filter out anything unexpected. - * - * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from - * level_state returned from a SEAMCALL error is the same as that passed into - * the SEAMCALL. - */ -static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, - u64 entry, int level) -{ - if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) - return false; - - if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) - return false; - - if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) - return false; - - return true; -} - static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level, struct page *page) { @@ -1718,12 +1702,6 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); tdx_no_vcpus_enter_stop(kvm); } - if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level)) { - if (KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm)) - return -EIO; - - return 0; - } if (KVM_BUG_ON(err, kvm)) { pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); @@ -2839,12 +2817,6 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; - /* - * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue - * TDH.MEM.PAGE.ADD(). - */ - if (atomic64_read(&kvm_tdx->nr_premapped)) - return -EINVAL; cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); if (tdx_operand_busy(cmd->hw_error)) @@ -3141,6 +3113,9 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, struct page *src_page; int ret, i; + if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm)) + return -EIO; + /* * Get the source page if it has been faulted in. Return failure if the * source page has been swapped out or unmapped in primary memory. @@ -3151,19 +3126,14 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, if (ret != 1) return -ENOMEM; + kvm_tdx->page_add_src = src_page; ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); - if (ret < 0) - goto out; + kvm_tdx->page_add_src = NULL; - ret = 0; - err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), - src_page, &entry, &level_state); - if (err) { - ret = unlikely(tdx_operand_busy(err)) ? 
-EBUSY : -EIO; - goto out; - } + put_page(src_page); - KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm); + if (ret) + return ret; if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { @@ -3176,8 +3146,6 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, } } -out: - put_page(src_page); return ret; } diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index ca39a9391db1..1239d9845093 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -36,8 +36,12 @@ struct kvm_tdx { struct tdx_td td; - /* For KVM_TDX_INIT_MEM_REGION. */ - atomic64_t nr_premapped; + /* + * Scratch pointer used to pass the source page to tdx_mem_page_add(). + * Protected by slots_lock, and non-NULL only when mapping a private + * pfn via tdx_gmem_post_populate(). + */ + struct page *page_add_src; /* * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do From 14c9938619bee7743eb2f2443b7d5f412a80838b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:40 -0700 Subject: [PATCH 074/260] KVM: TDX: Fold tdx_sept_zap_private_spte() into tdx_sept_remove_private_spte() Do TDH_MEM_RANGE_BLOCK directly in tdx_sept_remove_private_spte() instead of using a one-off helper now that the nr_premapped tracking is gone. Opportunistically drop the WARN on hugepages, which was dead code (see the KVM_BUG_ON() in tdx_sept_remove_private_spte()). No functional change intended. Reviewed-by: Rick Edgecombe Reviewed-by: Kai Huang Reviewed-by: Binbin Wu Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 14cadd0d71c0..797771918b2a 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1683,33 +1683,6 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, return 0; } -static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) -{ - int tdx_level = pg_level_to_tdx_sept_level(level); - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); - u64 err, entry, level_state; - - /* For now large page isn't supported yet. */ - WARN_ON_ONCE(level != PG_LEVEL_4K); - - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); - - if (unlikely(tdx_operand_busy(err))) { - /* After no vCPUs enter, the second retry is expected to succeed */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); - tdx_no_vcpus_enter_stop(kvm); - } - - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); - return -EIO; - } - return 1; -} - /* * Ensure shared and private EPTs to be flushed on all vCPUs. * tdh_mem_track() is the only caller that increases TD epoch. 
An increase in @@ -1790,7 +1763,6 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); gpa_t gpa = gfn_to_gpa(gfn); u64 err, entry, level_state; - int ret; /* * HKID is released after all private pages have been removed, and set @@ -1804,9 +1776,18 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) return; - ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); - if (ret <= 0) + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) { + /* After no vCPUs enter, the second retry is expected to succeed */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + tdx_no_vcpus_enter_stop(kvm); + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); return; + } /* * TDX requires TLB tracking before dropping private page. Do From 597d7068702fc3120b6a37bb8c5cd6c1bab898e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:41 -0700 Subject: [PATCH 075/260] KVM: TDX: Combine KVM_BUG_ON + pr_tdx_error() into TDX_BUG_ON() Add TDX_BUG_ON() macros (with varying numbers of arguments) to deduplicate the myriad flows that do KVM_BUG_ON()/WARN_ON_ONCE() followed by a call to pr_tdx_error(). In addition to reducing boilerplate copy+paste code, this also helps ensure that KVM provides consistent handling of SEAMCALL errors. Opportunistically convert a handful of bare WARN_ON_ONCE() paths to the equivalent of KVM_BUG_ON(), i.e. have them terminate the VM. If a SEAMCALL error is fatal enough to WARN on, it's fatal enough to terminate the TD. Reviewed-by: Rick Edgecombe Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 110 +++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 64 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 797771918b2a..5a30d77375cc 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -24,20 +24,32 @@ #undef pr_fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define pr_tdx_error(__fn, __err) \ - pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) +#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \ +({ \ + struct kvm *_kvm = (__kvm); \ + bool __ret = !!(__err); \ + \ + if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \ + if (_kvm) \ + kvm_vm_bugged(_kvm); \ + pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\ + __err, __args); \ + } \ + unlikely(__ret); \ +}) -#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) 
\ - pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) +#define TDX_BUG_ON(__err, __fn, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "") -#define pr_tdx_error_1(__fn, __err, __rcx) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) +#define TDX_BUG_ON_1(__err, __fn, __rcx, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", rcx 0x%llx", __rcx) -#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) +#define TDX_BUG_ON_2(__err, __fn, __rcx, __rdx, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", rcx 0x%llx, rdx 0x%llx", __rcx, __rdx) + +#define TDX_BUG_ON_3(__err, __fn, __rcx, __rdx, __r8, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", rcx 0x%llx, rdx 0x%llx, r8 0x%llx", __rcx, __rdx, __r8) -#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) bool enable_tdx __ro_after_init; module_param_named(tdx, enable_tdx, bool, 0444); @@ -313,10 +325,9 @@ static int __tdx_reclaim_page(struct page *page) * before the HKID is released and control pages have also been * released at this point, so there is no possibility of contention. */ - if (WARN_ON_ONCE(err)) { - pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); + if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL)) return -EIO; - } + return 0; } @@ -404,8 +415,8 @@ static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) return; smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); - if (KVM_BUG_ON(arg.err, vcpu->kvm)) - pr_tdx_error(TDH_VP_FLUSH, arg.err); + + TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm); } void tdx_disable_virtualization_cpu(void) @@ -464,8 +475,7 @@ static void smp_func_do_phymem_cache_wb(void *unused) } out: - if (WARN_ON_ONCE(err)) - pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); + TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL); } void tdx_mmu_release_hkid(struct kvm *kvm) @@ -504,8 +514,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) err = tdh_mng_vpflushdone(&kvm_tdx->td); if (err == TDX_FLUSHVP_NOT_DONE) goto out; - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); + if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) { pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", kvm_tdx->hkid); goto out; @@ -528,8 +537,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) * tdh_mng_key_freeid() will fail. */ err = tdh_mng_key_freeid(&kvm_tdx->td); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_MNG_KEY_FREEID, err); + if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) { pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", kvm_tdx->hkid); } else { @@ -580,10 +588,9 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm) * when it is reclaiming TDCS). 
*/ err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) return; - } + tdx_quirk_reset_page(kvm_tdx->td.tdr_page); __free_page(kvm_tdx->td.tdr_page); @@ -606,11 +613,8 @@ static int tdx_do_tdh_mng_key_config(void *param) /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ err = tdh_mng_key_config(&kvm_tdx->td); - - if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { - pr_tdx_error(TDH_MNG_KEY_CONFIG, err); + if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm)) return -EIO; - } return 0; } @@ -1601,10 +1605,8 @@ static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level, if (unlikely(tdx_operand_busy(err))) return -EBUSY; - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_PAGE_ADD, err, entry, level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm)) return -EIO; - } return 0; } @@ -1623,10 +1625,8 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, if (unlikely(tdx_operand_busy(err))) return -EBUSY; - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm)) return -EIO; - } return 0; } @@ -1675,10 +1675,8 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, if (unlikely(tdx_operand_busy(err))) return -EBUSY; - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm)) return -EIO; - } return 0; } @@ -1726,8 +1724,7 @@ static void tdx_track(struct kvm *kvm) tdx_no_vcpus_enter_stop(kvm); } - if (KVM_BUG_ON(err, kvm)) - pr_tdx_error(TDH_MEM_TRACK, err); + TDX_BUG_ON(err, TDH_MEM_TRACK, kvm); kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); } @@ -1784,10 +1781,8 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, tdx_no_vcpus_enter_stop(kvm); } - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm)) return; - } /* * TDX requires TLB tracking before dropping private page. 
Do @@ -1814,16 +1809,12 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, tdx_no_vcpus_enter_stop(kvm); } - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm)) return; - } err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) return; - } tdx_quirk_reset_page(page); } @@ -2463,8 +2454,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, goto free_packages; } - if (WARN_ON_ONCE(err)) { - pr_tdx_error(TDH_MNG_CREATE, err); + if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) { ret = -EIO; goto free_packages; } @@ -2505,8 +2495,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, ret = -EAGAIN; goto teardown; } - if (WARN_ON_ONCE(err)) { - pr_tdx_error(TDH_MNG_ADDCX, err); + if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) { ret = -EIO; goto teardown; } @@ -2523,8 +2512,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, *seamcall_err = err; ret = -EINVAL; goto teardown; - } else if (WARN_ON_ONCE(err)) { - pr_tdx_error_1(TDH_MNG_INIT, err, rcx); + } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) { ret = -EIO; goto teardown; } @@ -2802,10 +2790,8 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); if (tdx_operand_busy(cmd->hw_error)) return -EBUSY; - if (KVM_BUG_ON(cmd->hw_error, kvm)) { - pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); + if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm)) return -EIO; - } kvm_tdx->state = TD_STATE_RUNNABLE; /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ @@ -2892,16 +2878,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) } err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); - if (KVM_BUG_ON(err, vcpu->kvm)) { + if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) { ret = -EIO; - pr_tdx_error(TDH_VP_CREATE, err); goto free_tdcx; } for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); - if (KVM_BUG_ON(err, vcpu->kvm)) { - pr_tdx_error(TDH_VP_ADDCX, err); + if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) { /* * Pages already added are reclaimed by the vcpu_free * method, but the rest are freed here. @@ -2915,10 +2899,8 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) } err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); - if (KVM_BUG_ON(err, vcpu->kvm)) { - pr_tdx_error(TDH_VP_INIT, err); + if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm)) return -EIO; - } vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; From 55560b6be5bc39384917ff456d1c9ba0a3790277 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:42 -0700 Subject: [PATCH 076/260] KVM: TDX: Derive error argument names from the local variable names When printing SEAMCALL errors, use the name of the variable holding an error parameter instead of the register from whence it came, so that flows which use descriptive variable names will similarly print descriptive error messages. 
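This relies on nothing more exotic than the preprocessor's stringify
operator; a stand-alone demo (the macro name here is hypothetical):

#include <stdio.h>

/* #a1 expands to the literal spelling of the caller's argument. */
#define REPORT_ERR_1(err, a1) \
	fprintf(stderr, "SEAMCALL failed: 0x%llx, " #a1 " 0x%llx\n", \
		(unsigned long long)(err), (unsigned long long)(a1))

int main(void)
{
	unsigned long long entry = 0xdead, level_state = 0xbeef;

	/* Prints "... entry 0xdead", not a hard-coded register name. */
	REPORT_ERR_1(1ULL, entry);

	/* Prints "... level_state 0xbeef". */
	REPORT_ERR_1(1ULL, level_state);

	return 0;
}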
Suggested-by: Rick Edgecombe
Reviewed-by: Binbin Wu
Reviewed-by: Yan Zhao
Tested-by: Yan Zhao
Tested-by: Kai Huang
Link: https://patch.msgid.link/20251030200951.3402865-20-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 5a30d77375cc..a9d1aabbefbf 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -41,14 +41,15 @@
 #define TDX_BUG_ON(__err, __fn, __kvm)					\
 	__TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")
 
-#define TDX_BUG_ON_1(__err, __fn, __rcx, __kvm)				\
-	__TDX_BUG_ON(__err, #__fn, __kvm, ", rcx 0x%llx", __rcx)
+#define TDX_BUG_ON_1(__err, __fn, a1, __kvm)				\
+	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)
 
-#define TDX_BUG_ON_2(__err, __fn, __rcx, __rdx, __kvm)			\
-	__TDX_BUG_ON(__err, #__fn, __kvm, ", rcx 0x%llx, rdx 0x%llx", __rcx, __rdx)
+#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm)			\
+	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)
 
-#define TDX_BUG_ON_3(__err, __fn, __rcx, __rdx, __r8, __kvm)		\
-	__TDX_BUG_ON(__err, #__fn, __kvm, ", rcx 0x%llx, rdx 0x%llx, r8 0x%llx", __rcx, __rdx, __r8)
+#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm)			\
+	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx, " #a3 " 0x%llx", \
+		     a1, a2, a3)
 
 bool enable_tdx __ro_after_init;

From 2ff14116982c663066f3cdb4e2af5dfa7a812caa Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 30 Oct 2025 13:09:43 -0700
Subject: [PATCH 077/260] KVM: TDX: Assert that mmu_lock is held for write
 when removing S-EPT entries

Unconditionally assert that mmu_lock is held for write when removing
S-EPT entries, not just when removing S-EPT entries triggers certain
conditions, e.g. needs to do TDH_MEM_TRACK or kick vCPUs out of the
guest. Conditionally asserting implies that it's safe to hold mmu_lock
for read when those paths aren't hit, which is simply not true, as KVM
doesn't support removing S-EPT entries under read-lock.

Only two paths lead to remove_external_spte(), and both paths assert
that mmu_lock is held for write (tdp_mmu_set_spte() via lockdep, and
handle_removed_pt() via KVM_BUG_ON()).

Deliberately leave lockdep assertions in the "no vCPUs" helpers to
document that wait_for_sept_zap is guarded by holding mmu_lock for
write, and keep the conditional assert in tdx_track() as well, but with
a comment to help explain why holding mmu_lock for write matters (above
and beyond tdx_sept_remove_private_spte()'s requirements).

Reviewed-by: Binbin Wu
Reviewed-by: Kai Huang
Reviewed-by: Yan Zhao
Tested-by: Yan Zhao
Tested-by: Kai Huang
Link: https://patch.msgid.link/20251030200951.3402865-21-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a9d1aabbefbf..ee17c8aacfa4 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1715,6 +1715,11 @@ static void tdx_track(struct kvm *kvm)
 	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
 		return;
 
+	/*
+	 * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
+	 * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
+	 * tracking epoch hasn't completed.
+ */ lockdep_assert_held_write(&kvm->mmu_lock); err = tdh_mem_track(&kvm_tdx->td); @@ -1762,6 +1767,8 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, gpa_t gpa = gfn_to_gpa(gfn); u64 err, entry, level_state; + lockdep_assert_held_write(&kvm->mmu_lock); + /* * HKID is released after all private pages have been removed, and set * before any might be populated. Warn if zapping is attempted when From 3d626ce5a8cca64eb6d317eef4b24b70a0e5b27b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:44 -0700 Subject: [PATCH 078/260] KVM: TDX: Add macro to retry SEAMCALLs when forcing vCPUs out of guest Add a macro to handle kicking vCPUs out of the guest and retrying SEAMCALLs on TDX_OPERAND_BUSY instead of providing small helpers to be used by each SEAMCALL. Wrapping the SEAMCALLs in a macro makes it a little harder to tease out which SEAMCALL is being made, but significantly reduces the amount of copy+paste code, and makes it all but impossible to leave an elevated wait_for_sept_zap. Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-22-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 82 +++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 49 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index ee17c8aacfa4..88c85fd82a0e 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -294,25 +294,34 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -static void tdx_no_vcpus_enter_start(struct kvm *kvm) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - lockdep_assert_held_write(&kvm->mmu_lock); - - WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); - - kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); -} - -static void tdx_no_vcpus_enter_stop(struct kvm *kvm) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - lockdep_assert_held_write(&kvm->mmu_lock); - - WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); -} +/* + * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single + * retry (if necessary) after forcing vCPUs to exit and wait for the operation + * to complete. All flows that remove/block S-EPT entries run with mmu_lock + * held for write, i.e. are mutually exclusive with each other, but they aren't + * mutually exclusive with running vCPUs, and so can fail with "operand busy" + * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL. + * + * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs. + */ +#define tdh_do_no_vcpus(tdh_func, kvm, args...) \ +({ \ + struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \ + u64 __err; \ + \ + lockdep_assert_held_write(&kvm->mmu_lock); \ + \ + __err = tdh_func(args); \ + if (unlikely(tdx_operand_busy(__err))) { \ + WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \ + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \ + \ + __err = tdh_func(args); \ + \ + WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \ + } \ + __err; \ +}) /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. 
*/ static int __tdx_reclaim_page(struct page *page) @@ -1722,14 +1731,7 @@ static void tdx_track(struct kvm *kvm) */ lockdep_assert_held_write(&kvm->mmu_lock); - err = tdh_mem_track(&kvm_tdx->td); - if (unlikely(tdx_operand_busy(err))) { - /* After no vCPUs enter, the second retry is expected to succeed */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_track(&kvm_tdx->td); - tdx_no_vcpus_enter_stop(kvm); - } - + err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td); TDX_BUG_ON(err, TDH_MEM_TRACK, kvm); kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); @@ -1781,14 +1783,8 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) return; - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); - if (unlikely(tdx_operand_busy(err))) { - /* After no vCPUs enter, the second retry is expected to succeed */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); - tdx_no_vcpus_enter_stop(kvm); - } - + err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa, + tdx_level, &entry, &level_state); if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm)) return; @@ -1803,20 +1799,8 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, * with other vcpu sept operation. * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. */ - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, - &level_state); - - if (unlikely(tdx_operand_busy(err))) { - /* - * The second retry is expected to succeed after kicking off all - * other vCPUs and prevent them from invoking TDH.VP.ENTER. - */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, - &level_state); - tdx_no_vcpus_enter_stop(kvm); - } - + err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa, + tdx_level, &entry, &level_state); if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm)) return; From 59d5c1ed6df222791ab7263e5a0c95eea9d83363 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:45 -0700 Subject: [PATCH 079/260] KVM: TDX: Add tdx_get_cmd() helper to get and validate sub-ioctl command Add a helper to copy a kvm_tdx_cmd structure from userspace and verify that must-be-zero fields are indeed zero. No functional change intended. Reviewed-by: Rick Edgecombe Reviewed-by: Kai Huang Reviewed-by: Binbin Wu Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 88c85fd82a0e..f42cc955d49c 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2792,20 +2792,29 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) return 0; } +static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd) +{ + if (copy_from_user(cmd, argp, sizeof(*cmd))) + return -EFAULT; + + /* + * Userspace should never set hw_error. KVM writes hw_error to report + * hardware-defined error back to userspace. 
+ */ + if (cmd->hw_error) + return -EINVAL; + + return 0; +} + int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_tdx_cmd tdx_cmd; int r; - if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) - return -EFAULT; - - /* - * Userspace should never set hw_error. It is used to fill - * hardware-defined error by the kernel. - */ - if (tdx_cmd.hw_error) - return -EINVAL; + r = tdx_get_cmd(argp, &tdx_cmd); + if (r) + return r; mutex_lock(&kvm->lock); @@ -3181,11 +3190,9 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; - if (copy_from_user(&cmd, argp, sizeof(cmd))) - return -EFAULT; - - if (cmd.hw_error) - return -EINVAL; + ret = tdx_get_cmd(argp, &cmd); + if (ret) + return ret; switch (cmd.id) { case KVM_TDX_INIT_VCPU: From 94428e3ba3258fc2862db3f9999e548d5a2d2a2a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:46 -0700 Subject: [PATCH 080/260] KVM: TDX: Convert INIT_MEM_REGION and INIT_VCPU to "unlocked" vCPU ioctl Handle the KVM_TDX_INIT_MEM_REGION and KVM_TDX_INIT_VCPU vCPU sub-ioctls in the unlocked variant, i.e. outside of vcpu->mutex, in anticipation of taking kvm->lock along with all other vCPU mutexes, at which point the sub-ioctls _must_ start without vcpu->mutex held. No functional change intended. Reviewed-by: Kai Huang Co-developed-by: Yan Zhao Signed-off-by: Yan Zhao Reviewed-by: Binbin Wu Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-24-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/main.c | 9 +++++++ arch/x86/kvm/vmx/tdx.c | 42 +++++++++++++++++++++++++----- arch/x86/kvm/vmx/x86_ops.h | 1 + arch/x86/kvm/x86.c | 7 +++++ 6 files changed, 55 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index fdf178443f85..de709fb5bd76 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -128,6 +128,7 @@ KVM_X86_OP(enable_smi_window) KVM_X86_OP_OPTIONAL(dev_get_attr) KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_unlocked_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 87a5f5100b1d..2bfae1cfa514 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1914,6 +1914,7 @@ struct kvm_x86_ops { int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); + int (*vcpu_mem_enc_unlocked_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0eb2773b2ae2..a46ccd670785 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -831,6 +831,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return tdx_vcpu_ioctl(vcpu, argp); } +static int 
vt_vcpu_mem_enc_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + if (!is_td_vcpu(vcpu)) + return -EINVAL; + + return tdx_vcpu_unlocked_ioctl(vcpu, argp); +} + static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { @@ -1005,6 +1013,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), + .vcpu_mem_enc_unlocked_ioctl = vt_op_tdx_only(vcpu_mem_enc_unlocked_ioctl), .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level) }; diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index f42cc955d49c..3b4d62a3bf31 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3181,6 +3181,42 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c return ret; } +int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm_tdx_cmd cmd; + int r; + + r = tdx_get_cmd(argp, &cmd); + if (r) + return r; + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; + + vcpu_load(vcpu); + + switch (cmd.id) { + case KVM_TDX_INIT_MEM_REGION: + r = tdx_vcpu_init_mem_region(vcpu, &cmd); + break; + case KVM_TDX_INIT_VCPU: + r = tdx_vcpu_init(vcpu, &cmd); + break; + default: + r = -ENOIOCTLCMD; + break; + } + + vcpu_put(vcpu); + + mutex_unlock(&vcpu->mutex); + return r; +} + int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); @@ -3195,12 +3231,6 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return ret; switch (cmd.id) { - case KVM_TDX_INIT_VCPU: - ret = tdx_vcpu_init(vcpu, &cmd); - break; - case KVM_TDX_INIT_MEM_REGION: - ret = tdx_vcpu_init_mem_region(vcpu, &cmd); - break; case KVM_TDX_GET_CPUID: ret = tdx_vcpu_get_cpuid(vcpu, &cmd); break; diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 9697368d65b3..a7a870919580 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -149,6 +149,7 @@ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); +int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp); void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b85cb213a336..593fccc9cf1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7243,6 +7243,13 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + + if (ioctl == KVM_MEMORY_ENCRYPT_OP && + kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl) + return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp); + return -ENOIOCTLCMD; } From 0b76e827b29db17a37870a71814e347970394c78 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:47 -0700 Subject: [PATCH 081/260] KVM: TDX: Use guard() to acquire kvm->lock in tdx_vm_ioctl() Use guard() in tdx_vm_ioctl() to tidy up the code a small amount, but more importantly to minimize the diff of a future change, which will use guard-like semantics to acquire and release multiple locks. 
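The kernel's guard() is built on scope-based cleanup; a roughly
equivalent user-space sketch using the same GCC/Clang cleanup attribute
(names below are hypothetical):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

/* Lock now, unlock automatically when the enclosing scope is left. */
#define guard_mutex(m) \
	pthread_mutex_t *_guard __attribute__((cleanup(unlock_cleanup))) = \
		(pthread_mutex_lock(m), (m))

static int handle_cmd(int cmd)
{
	guard_mutex(&lock);

	/*
	 * Every return path drops the lock, so the "goto out" unlock
	 * boilerplate (and the bugs that come with it) disappears.
	 */
	if (cmd < 0)
		return -22; /* -EINVAL */

	printf("handled cmd %d\n", cmd);
	return 0;
}

int main(void)
{
	return handle_cmd(1);
}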
No functional change intended.

Reviewed-by: Rick Edgecombe
Reviewed-by: Kai Huang
Reviewed-by: Binbin Wu
Reviewed-by: Yan Zhao
Tested-by: Yan Zhao
Tested-by: Kai Huang
Link: https://patch.msgid.link/20251030200951.3402865-25-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 3b4d62a3bf31..a4818d9acbd5 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2816,7 +2816,7 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	if (r)
 		return r;
 
-	mutex_lock(&kvm->lock);
+	guard(mutex)(&kvm->lock);
 
 	switch (tdx_cmd.id) {
 	case KVM_TDX_CAPABILITIES:
@@ -2829,15 +2829,12 @@
 		r = tdx_td_finalize(kvm, &tdx_cmd);
 		break;
 	default:
-		r = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
-		r = -EFAULT;
+		return -EFAULT;
 
-out:
-	mutex_unlock(&kvm->lock);
 	return r;
 }

From 0b76e827b29db17a37870a71814e347970394c78 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 30 Oct 2025 13:09:48 -0700
Subject: [PATCH 082/260] KVM: TDX: Don't copy "cmd" back to userspace for
 KVM_TDX_CAPABILITIES

Don't copy the kvm_tdx_cmd structure back to userspace when handling
KVM_TDX_CAPABILITIES, as tdx_get_capabilities() doesn't modify hw_error
or any other fields.

Opportunistically hoist the call to tdx_get_capabilities() outside of
the kvm->lock critical section, as getting the capabilities doesn't
touch the VM in any way, e.g. doesn't even take @kvm.

Suggested-by: Kai Huang
Reviewed-by: Kai Huang
Reviewed-by: Yan Zhao
Tested-by: Yan Zhao
Tested-by: Kai Huang
Link: https://patch.msgid.link/20251030200951.3402865-26-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a4818d9acbd5..73842b762e5f 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2816,12 +2816,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	if (r)
 		return r;
 
+	if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
+		return tdx_get_capabilities(&tdx_cmd);
+
 	guard(mutex)(&kvm->lock);
 
 	switch (tdx_cmd.id) {
-	case KVM_TDX_CAPABILITIES:
-		r = tdx_get_capabilities(&tdx_cmd);
-		break;
 	case KVM_TDX_INIT_VM:
 		r = tdx_td_init(kvm, &tdx_cmd);
 		break;

From 15945e9ec1951d8cf7c1a7ec8c441017969d5aa1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 30 Oct 2025 13:09:49 -0700
Subject: [PATCH 083/260] KVM: TDX: Guard VM state transitions with "all" the
 locks

Acquire kvm->lock, kvm->slots_lock, and all vcpu->mutex locks when
servicing ioctls that (a) transition the TD to a new state, i.e. when
doing INIT or FINALIZE or (b) are only valid if the TD is in a specific
state, i.e. when initializing a vCPU or memory region.

Acquiring "all" the locks fixes several KVM_BUG_ON() situations where a
SEAMCALL can fail due to racing actions, e.g. if tdh_vp_create() contends
with either tdh_mr_extend() or tdh_mr_finalize().

For all intents and purposes, the paths in question are fully serialized,
i.e. there's no reason to try and allow anything remotely interesting to
happen. Smack 'em with a big hammer instead of trying to be "nice".

Acquire kvm->lock to prevent VM-wide things from happening, slots_lock to
prevent kvm_mmu_zap_all_fast(), and _all_ vCPU mutexes to prevent vCPUs
from interfering.
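The acquisition order and the unwind on failure can be modeled with plain
mutexes; a hypothetical stand-alone sketch (not the kernel
implementation):

#include <pthread.h>

#define NR_VCPUS 4

static pthread_mutex_t vm_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t slots_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t vcpu_lock[NR_VCPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Order: vm_lock, then every vcpu mutex, then slots_lock last. */
static int acquire_vm_state_locks(void)
{
	int i;

	pthread_mutex_lock(&vm_lock);

	for (i = 0; i < NR_VCPUS; i++) {
		if (pthread_mutex_trylock(&vcpu_lock[i])) {
			/* Unwind everything acquired so far, in reverse. */
			while (i--)
				pthread_mutex_unlock(&vcpu_lock[i]);
			pthread_mutex_unlock(&vm_lock);
			return -16; /* -EBUSY */
		}
	}

	pthread_mutex_lock(&slots_lock);
	return 0;
}

static void release_vm_state_locks(void)
{
	int i;

	pthread_mutex_unlock(&slots_lock);
	for (i = NR_VCPUS; i--; )
		pthread_mutex_unlock(&vcpu_lock[i]);
	pthread_mutex_unlock(&vm_lock);
}

int main(void)
{
	if (acquire_vm_state_locks())
		return 1;
	/* ... a fully serialized VM state transition would run here ... */
	release_vm_state_locks();
	return 0;
}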
Use the recently-renamed kvm_arch_vcpu_unlocked_ioctl() to service the vCPU-scoped ioctls to avoid a lock inversion problem, e.g. due to taking vcpu->mutex outside kvm->lock. See also commit ecf371f8b02d ("KVM: SVM: Reject SEV{-ES} intra host migration if vCPU creation is in-flight"), which fixed a similar bug with SEV intra-host migration where an in-flight vCPU creation could race with a VM-wide state transition. Define a fancy new CLASS to handle the lock+check => unlock logic with guard()-like syntax: CLASS(tdx_vm_state_guard, guard)(kvm); if (IS_ERR(guard)) return PTR_ERR(guard); to simplify juggling the many locks. Note! Take kvm->slots_lock *after* all vcpu->mutex locks, as per KVM's soon-to-be-documented lock ordering rules[1]. Link: https://lore.kernel.org/all/20251016235538.171962-1-seanjc@google.com [1] Reported-by: Yan Zhao Closes: https://lore.kernel.org/all/aLFiPq1smdzN3Ary@yzhao56-desk.sh.intel.com Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-27-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 59 +++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 73842b762e5f..ab62aee2aaa4 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2653,6 +2653,46 @@ static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, return -EIO; } +typedef void *tdx_vm_state_guard_t; + +static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm) +{ + int r; + + mutex_lock(&kvm->lock); + + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) { + r = -EBUSY; + goto out_err; + } + + r = kvm_lock_all_vcpus(kvm); + if (r) + goto out_err; + + /* + * Note the unintuitive ordering! vcpu->mutex must be taken outside + * kvm->slots_lock! + */ + mutex_lock(&kvm->slots_lock); + return kvm; + +out_err: + mutex_unlock(&kvm->lock); + return ERR_PTR(r); +} + +static void tdx_release_vm_state_locks(struct kvm *kvm) +{ + mutex_unlock(&kvm->slots_lock); + kvm_unlock_all_vcpus(kvm); + mutex_unlock(&kvm->lock); +} + +DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t, + if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T), + tdx_acquire_vm_state_locks(kvm), struct kvm *kvm); + static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); @@ -2774,8 +2814,6 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - guard(mutex)(&kvm->slots_lock); - if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; @@ -2819,7 +2857,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) if (tdx_cmd.id == KVM_TDX_CAPABILITIES) return tdx_get_capabilities(&tdx_cmd); - guard(mutex)(&kvm->lock); + CLASS(tdx_vm_state_guard, guard)(kvm); + if (IS_ERR(guard)) + return PTR_ERR(guard); switch (tdx_cmd.id) { case KVM_TDX_INIT_VM: @@ -3123,8 +3163,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c if (tdx->state != VCPU_TD_STATE_INITIALIZED) return -EINVAL; - guard(mutex)(&kvm->slots_lock); - /* Once TD is finalized, the initial guest memory is fixed. 
*/ if (kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; @@ -3180,7 +3218,8 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { - struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm *kvm = vcpu->kvm; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct kvm_tdx_cmd cmd; int r; @@ -3188,12 +3227,13 @@ int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) if (r) return r; + CLASS(tdx_vm_state_guard, guard)(kvm); + if (IS_ERR(guard)) + return PTR_ERR(guard); + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; - if (mutex_lock_killable(&vcpu->mutex)) - return -EINTR; - vcpu_load(vcpu); switch (cmd.id) { @@ -3210,7 +3250,6 @@ int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) vcpu_put(vcpu); - mutex_unlock(&vcpu->mutex); return r; } From ad44aa4c5d3f40f4254a25a2ebab8cb9be0e37a3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:50 -0700 Subject: [PATCH 084/260] KVM: TDX: Bug the VM if extending the initial measurement fails WARN and terminate the VM if TDH_MR_EXTEND fails, as extending the measurement should fail if and only if there is a KVM bug, or if the S-EPT mapping is invalid. Now that KVM makes all state transitions mutually exclusive via tdx_vm_state_guard, it should be impossible for S-EPT mappings to be removed between kvm_tdp_mmu_map_private_pfn() and tdh_mr_extend(). Holding slots_lock prevents zaps due to memslot updates, filemap_invalidate_lock() prevents zaps due to guest_memfd PUNCH_HOLE, vcpu->mutex locks prevents updates from other vCPUs, kvm->lock prevents VM-scoped ioctls from creating havoc (e.g. by creating new vCPUs), and all usage of kvm_zap_gfn_range() is mutually exclusive with S-EPT entries that can be used for the initial image. For kvm_zap_gfn_range(), the call from sev.c is obviously mutually exclusive, TDX disallows KVM_X86_QUIRK_IGNORE_GUEST_PAT so the same goes for kvm_noncoherent_dma_assignment_start_or_stop(), and __kvm_set_or_clear_apicv_inhibit() is blocked by virtue of holding all VM and vCPU mutexes (and the APIC page has its own KVM-internal memslot that is never created for TDX VMs, and so can't possibly be used for the initial image, which means that too is mutually exclusive irrespective of locking). Opportunistically return early if the region doesn't need to be measured in order to reduce line lengths and avoid wraps. Similarly, immediately and explicitly return if TDH_MR_EXTEND fails to make it clear that KVM needs to bail entirely if extending the measurement fails. 
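For a sense of scale (assuming TDX_EXTENDMR_CHUNKSIZE is 256 bytes, i.e.
the MR.EXTEND granularity defined by the TDX-Module ABI), measuring one
4KiB page costs PAGE_SIZE / TDX_EXTENDMR_CHUNKSIZE = 4096 / 256 = 16
TDH.MR.EXTEND SEAMCALLs, so bailing on the first failure also avoids up
to 15 pointless follow-up SEAMCALLs for the page.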
Reviewed-by: Binbin Wu
Reviewed-by: Kai Huang
Reviewed-by: Yan Zhao
Tested-by: Yan Zhao
Tested-by: Kai Huang
Link: https://patch.msgid.link/20251030200951.3402865-28-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ab62aee2aaa4..977cc41c96b4 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3133,21 +3133,23 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 
 	put_page(src_page);
 
-	if (ret)
+	if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
 		return ret;
 
-	if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
-		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
-			err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
-					    &level_state);
-			if (err) {
-				ret = -EIO;
-				break;
-			}
-		}
+	/*
+	 * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
+	 * between mapping the pfn and now, but slots_lock prevents memslot
+	 * updates, filemap_invalidate_lock() prevents guest_memfd updates,
+	 * mmu_notifier events can't reach S-EPT entries, and KVM's internal
+	 * zapping flows are mutually exclusive with S-EPT mappings.
+	 */
+	for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+		err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
+		if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
+			return -EIO;
 	}
 
-	return ret;
+	return 0;
 }
 
 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)

From 1e3a825c9ec90f91ef61dc9cbda610d9ec887db8 Mon Sep 17 00:00:00 2001
From: Yan Zhao
Date: Thu, 30 Oct 2025 13:09:51 -0700
Subject: [PATCH 085/260] KVM: TDX: Fix list_add corruption during vcpu_load()

During vCPU creation, a vCPU may be destroyed immediately after
kvm_arch_vcpu_create() (e.g., due to a vCPU ID conflict). However, the
vcpu_load() inside kvm_arch_vcpu_create() may have associated the vCPU
with a pCPU via "list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus,
cpu))" before invoking tdx_vcpu_free().

Though there's no need to invoke tdh_vp_flush() on the vCPU, failing to
dissociate the vCPU from the pCPU (i.e.,
"list_del(&to_tdx(vcpu)->cpu_list)") will cause list corruption of the
per-pCPU list associated_tdvcpus. Then, a later list_add() during
vcpu_load() would detect the list corruption and print a calltrace as
shown below.

Dissociate a vCPU from its associated pCPU in tdx_vcpu_free() for the
vCPUs destroyed immediately after creation, which must be in
VCPU_TD_STATE_UNINITIALIZED state.

kernel BUG at lib/list_debug.c:29!
Oops: invalid opcode: 0000 [#2] SMP NOPTI RIP: 0010:__list_add_valid_or_report+0x82/0xd0 Call Trace: tdx_vcpu_load+0xa8/0x120 vt_vcpu_load+0x25/0x30 kvm_arch_vcpu_load+0x81/0x300 vcpu_load+0x55/0x90 kvm_arch_vcpu_create+0x24f/0x330 kvm_vm_ioctl_create_vcpu+0x1b1/0x53 kvm_vm_ioctl+0xc2/0xa60 __x64_sys_ioctl+0x9a/0xf0 x64_sys_call+0x10ee/0x20d0 do_syscall_64+0xc3/0x470 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: d789fa6efac9 ("KVM: TDX: Handle vCPU dissociation") Signed-off-by: Yan Zhao Reviewed-by: Kai Huang Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-29-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 43 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 977cc41c96b4..865d18b34988 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -843,19 +843,52 @@ void tdx_vcpu_put(struct kvm_vcpu *vcpu) tdx_prepare_switch_to_host(vcpu); } +/* + * Life cycles for a TD and a vCPU: + * 1. KVM_CREATE_VM ioctl. + * TD state is TD_STATE_UNINITIALIZED. + * hkid is not assigned at this stage. + * 2. KVM_TDX_INIT_VM ioctl. + * TD transitions to TD_STATE_INITIALIZED. + * hkid is assigned after this stage. + * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED). + * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED. + * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create(). + * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create() + * kvm_arch_vcpu_destroy() --> tdx_vcpu_free(). + * 4. KVM_TDX_INIT_VCPU ioctl. + * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED. + * vCPU control structures are allocated at this stage. + * 5. kvm_destroy_vm(). + * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs. + * (2) puts hkid to !assigned state. + * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free(): + * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state. + * 5.3 tdx_vm_destroy() + * transitions TD to TD_STATE_UNINITIALIZED state. + * + * tdx_vcpu_free() can be invoked only at 3.3 or 5.2. + * - If at 3.3, hkid is still assigned, but the vCPU must be in + * VCPU_TD_STATE_UNINITIALIZED state. + * - if at 5.2, hkid must be !assigned and all vCPUs must be in + * VCPU_TD_STATE_INITIALIZED state and have been dissociated. + */ void tdx_vcpu_free(struct kvm_vcpu *vcpu) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); struct vcpu_tdx *tdx = to_tdx(vcpu); int i; + if (vcpu->cpu != -1) { + KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm); + tdx_flush_vp_on_cpu(vcpu); + return; + } + /* * It is not possible to reclaim pages while hkid is assigned. It might - * be assigned if: - * 1. the TD VM is being destroyed but freeing hkid failed, in which - * case the pages are leaked - * 2. TD VCPU creation failed and this on the error path, in which case - * there is nothing to do anyway + * be assigned if the TD VM is being destroyed but freeing hkid failed, + * in which case the pages are leaked. */ if (is_hkid_assigned(kvm_tdx)) return; From 3d31bdf9cc79a3752bd1b6ba91af4e5ba37c47a8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 Oct 2025 19:29:16 +0000 Subject: [PATCH 086/260] KVM: nSVM: Remove redundant cases in nested_svm_intercept() Both the CRx and DRx cases are doing exactly what the default case is doing, remove them. No functional change intended. 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251024192918.3191141-2-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 83de3456df70..71664d54d8b2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1438,16 +1438,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm) case SVM_EXIT_IOIO: vmexit = nested_svm_intercept_ioio(svm); break; - case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) - vmexit = NESTED_EXIT_DONE; - break; - } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { /* * Host-intercepted exceptions have been checked already in From 5674a76db0213f9db1e4d08e847ff649b46889c0 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 Oct 2025 19:29:17 +0000 Subject: [PATCH 087/260] KVM: nSVM: Propagate SVM_EXIT_CR0_SEL_WRITE correctly for LMSW emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When emulating L2 instructions, svm_check_intercept() checks whether a write to CR0 should trigger a synthesized #VMEXIT with SVM_EXIT_CR0_SEL_WRITE. For MOV-to-CR0, SVM_EXIT_CR0_SEL_WRITE is only triggered if any bit other than CR0.MP and CR0.TS is updated. However, according to the APM (24593—Rev. 3.42—March 2024, Table 15-7): The LMSW instruction treats the selective CR0-write intercept as a non-selective intercept (i.e., it intercepts regardless of the value being written). Skip checking the changed bits for x86_intercept_lmsw and always inject SVM_EXIT_CR0_SEL_WRITE. Fixes: cfec82cb7d31 ("KVM: SVM: Add intercept check for emulated cr accesses") Cc: stable@vger.kernel.org Reported-by: Matteo Rizzo Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251024192918.3191141-3-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f14709a511aa..bd8df212a59d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4546,20 +4546,20 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, INTERCEPT_SELECTIVE_CR0))) break; - cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; - val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; - + /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */ if (info->intercept == x86_intercept_lmsw) { - cr0 &= 0xfUL; - val &= 0xfUL; - /* lmsw can't clear PE - catch this here */ - if (cr0 & X86_CR0_PE) - val |= X86_CR0_PE; + icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + break; } + /* + * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit + * other than SVM_CR0_SELECTIVE_MASK is changed. 
+ */ + cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; + val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; if (cr0 ^ val) icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; - break; } case SVM_EXIT_READ_DR0: From 3d80f4c93d3d26d0f9a0dd2844961a632eeea634 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 Oct 2025 19:29:18 +0000 Subject: [PATCH 088/260] KVM: nSVM: Avoid incorrect injection of SVM_EXIT_CR0_SEL_WRITE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When emulating L2 instructions, svm_check_intercept() checks whether a write to CR0 should trigger a synthesized #VMEXIT with SVM_EXIT_CR0_SEL_WRITE. However, it does not check whether L1 enabled the intercept for SVM_EXIT_WRITE_CR0, which has higher priority according to the APM (24593—Rev. 3.42—March 2024, Table 15-7): When both selective and non-selective CR0-write intercepts are active at the same time, the non-selective intercept takes priority. With respect to exceptions, the priority of this intercept is the same as the generic CR0-write intercept. Make sure L1 does NOT intercept SVM_EXIT_WRITE_CR0 before checking if SVM_EXIT_CR0_SEL_WRITE needs to be injected. Opportunistically tweak the "not CR0" logic to explicitly bail early so that it's more obvious that only CR0 has a selective intercept, and that modifying icpt_info.exit_code is functionally necessary so that the call to nested_svm_exit_handled() checks the correct exit code. Fixes: cfec82cb7d31 ("KVM: SVM: Add intercept check for emulated cr accesses") Cc: stable@vger.kernel.org Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251024192918.3191141-4-yosry.ahmed@linux.dev [sean: isolate non-CR0 write logic, tweak comments accordingly] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bd8df212a59d..1ae7b3c5a7c5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4535,15 +4535,29 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, case SVM_EXIT_WRITE_CR0: { unsigned long cr0, val; - if (info->intercept == x86_intercept_cr_write) + /* + * Adjust the exit code accordingly if a CR other than CR0 is + * being written, and skip straight to the common handling as + * only CR0 has an additional selective intercept. + */ + if (info->intercept == x86_intercept_cr_write && info->modrm_reg) { icpt_info.exit_code += info->modrm_reg; + break; + } - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || - info->intercept == x86_intercept_clts) + /* + * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a + * selective CR0 intercept is triggered (the common logic will + * treat the selective intercept as being enabled). Note, the + * unconditional intercept has higher priority, i.e. this is + * only relevant if *only* the selective intercept is enabled. + */ + if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) || + !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))) break; - if (!(vmcb12_is_intercept(&svm->nested.ctl, - INTERCEPT_SELECTIVE_CR0))) + /* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */ + if (info->intercept == x86_intercept_clts) break; /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */ From 32ed0bc2f0f8ce411a822531c71b49fa93608b37 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 20 Aug 2025 11:59:54 +0200 Subject: [PATCH 089/260] KVM: VMX: Ensure guest's SPEC_CTRL[63:32] is loaded on VM-Enter SPEC_CTRL is an MSR, i.e. 
a 64-bit value, but the assembly code that loads the guest's value assumes bits 63:32 are always zero. The bug is _currently_ benign because neither KVM nor the kernel support setting any of bits 63:32, but it's still a bug that needs to be fixed. Note, the host's value is restored in C code and is unaffected. Fixes: 07853adc29a0 ("KVM: VMX: Prevent RSB underflow before vmenter") Suggested-by: Sean Christopherson Signed-off-by: Uros Bizjak Cc: Sean Christopherson Cc: Paolo Bonzini Cc: Josh Poimboeuf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Link: https://patch.msgid.link/20250820100007.356761-1-ubizjak@gmail.com [sean: call out that only the guest's value is affected] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmenter.S | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bc255d709d8a..574159a84ee9 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -118,13 +118,23 @@ SYM_FUNC_START(__vmx_vcpu_run) * and vmentry. */ mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI - movl VMX_spec_ctrl(%_ASM_DI), %edi - movl PER_CPU_VAR(x86_spec_ctrl_current), %esi - cmp %edi, %esi +#ifdef CONFIG_X86_64 + mov VMX_spec_ctrl(%rdi), %rdx + cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx je .Lspec_ctrl_done + movl %edx, %eax + shr $32, %rdx +#else + mov VMX_spec_ctrl(%edi), %eax + mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx + xor %eax, %ecx + mov VMX_spec_ctrl + 4(%edi), %edx + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi + xor %edx, %edi + or %edi, %ecx + je .Lspec_ctrl_done +#endif mov $MSR_IA32_SPEC_CTRL, %ecx - xor %edx, %edx - mov %edi, %eax wrmsr .Lspec_ctrl_done: From c331b400e291a510eb9a0dbdc783b38e6f8321f0 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 6 Nov 2025 11:12:30 -0800 Subject: [PATCH 090/260] KVM: SVM: Ensure SPEC_CTRL[63:32] is context switched between guest and host SPEC_CTRL is an MSR, i.e. a 64-bit value, but the VMRUN assembly code assumes bits 63:32 are always zero. The bug is _currently_ benign because neither KVM nor the kernel support setting any of bits 63:32, but it's still a bug that needs to be fixed. Signed-off-by: Uros Bizjak Suggested-by: Sean Christopherson Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251106191230.182393-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 47 ++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 235c4af6b692..98bfa2e00d88 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -52,11 +52,23 @@ * there must not be any returns or indirect branches between this code * and vmentry. 
*/ - movl SVM_spec_ctrl(%_ASM_DI), %eax - cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax +#ifdef CONFIG_X86_64 + mov SVM_spec_ctrl(%rdi), %rdx + cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx je 801b + movl %edx, %eax + shr $32, %rdx +#else + mov SVM_spec_ctrl(%edi), %eax + mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx + xor %eax, %ecx + mov SVM_spec_ctrl + 4(%edi), %edx + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi + xor %edx, %esi + or %esi, %ecx + je 801b +#endif mov $MSR_IA32_SPEC_CTRL, %ecx - xor %edx, %edx wrmsr jmp 801b .endm @@ -81,13 +93,25 @@ jnz 998f rdmsr movl %eax, SVM_spec_ctrl(%_ASM_DI) + movl %edx, SVM_spec_ctrl + 4(%_ASM_DI) 998: - /* Now restore the host value of the MSR if different from the guest's. */ - movl PER_CPU_VAR(x86_spec_ctrl_current), %eax - cmp SVM_spec_ctrl(%_ASM_DI), %eax +#ifdef CONFIG_X86_64 + mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx + cmp SVM_spec_ctrl(%rdi), %rdx je 901b - xor %edx, %edx + movl %edx, %eax + shr $32, %rdx +#else + mov PER_CPU_VAR(x86_spec_ctrl_current), %eax + mov SVM_spec_ctrl(%edi), %esi + xor %eax, %esi + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx + mov SVM_spec_ctrl + 4(%edi), %edi + xor %edx, %edi + or %edi, %esi + je 901b +#endif wrmsr jmp 901b .endm @@ -134,7 +158,7 @@ SYM_FUNC_START(__svm_vcpu_run) mov %_ASM_ARG1, %_ASM_DI .endif - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */ RESTORE_GUEST_SPEC_CTRL /* @@ -211,7 +235,10 @@ SYM_FUNC_START(__svm_vcpu_run) /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT - /* Clobbers RAX, RCX, RDX. */ + /* + * Clobbers RAX, RCX, RDX (and ESI, EDI on 32-bit), consumes RDI (@svm) + * and RSP (pointer to @spec_ctrl_intercepted). + */ RESTORE_HOST_SPEC_CTRL /* @@ -331,7 +358,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov %rdi, SEV_ES_RDI (%rdx) mov %rsi, SEV_ES_RSI (%rdx) - /* Clobbers RAX, RCX, RDX (@hostsa). */ + /* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ From 68c35f89d016dd0ebcc4a0298e63aa7981fca9e0 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 14 Oct 2025 23:32:57 -0400 Subject: [PATCH 091/260] KVM: x86: Fix a semi theoretical bug in kvm_arch_async_page_present_queued() Fix a semi theoretical race condition related to a lack of memory barriers when dealing with vcpu->arch.apf.pageready_pending. In theory, the "ready" side could see a stale pageready_pending and neglect to kick the vCPU, and thus allow the vCPU to enter the guest with a pending KVM_REQ_APF_READY and no kick/IPI on the way, in which case the KVM would fail to deliver a completed async #PF event to the guest in a timely manner as the request would be recognized only on the next (coincidental) VM-Exit. kvm_arch_async_page_present_queued() running in workqueue context: kvm_make_request(KVM_REQ_APF_READY, vcpu); /* memory barrier is missing here*/ if (!vcpu->arch.apf.pageready_pending) kvm_vcpu_kick(vcpu); kvm_set_msr_common() running in task context: vcpu->arch.apf.pageready_pending = false; /* memory barrier is missing here*/ And later, vcpu_enter_guest() running in task context: if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu) Add missing full memory barriers in both cases to avoid theoretical case of not kicking the vCPU thread. 
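Viewed as a classic store-buffering litmus test, the fix is the standard
"full barrier between my store and my load" on both sides (a sketch in
pseudocode, not verbatim KVM code):

	workqueue ("ready" side)          vCPU (MSR write, then entry)
	------------------------          ----------------------------
	set KVM_REQ_APF_READY             pageready_pending = false
	smp_mb__after_atomic()            smp_mb()  /* via smp_store_mb() */
	r0 = pageready_pending            r1 = KVM_REQ_APF_READY set?

Without a full barrier on both sides, the outcome r0 == true &&
r1 == false is permitted, i.e. the workqueue skips the kick *and* the
vCPU misses the pending request.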
Note that the bug is mostly theoretical because kvm_make_request() uses an atomic operation, which is always serializing on x86, requiring only for documentation purposes the smp_mb__after_atomic() after it (smp_mb__after_atomic() is a NOP on x86). The second missing barrier, between kvm_set_msr_common() and vcpu_enter_guest(), isn't strictly needed because KVM executes several barriers in between calling these functions, however it still makes sense to have an explicit barrier to be on the safe side and to document the ordering dependencies. Finally, also use READ_ONCE/WRITE_ONCE. Thanks a lot to Paolo for the help with this patch. Link: https://lore.kernel.org/all/7c7a5a75-a786-4a05-a836-4368582ca4c2@redhat.com Suggested-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Link: https://patch.msgid.link/20251015033258.50974-3-mlevitsk@redhat.com [sean: explain the race and its impact in more detail] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c826cd05228a..57ade075bae3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4183,7 +4183,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { - vcpu->arch.apf.pageready_pending = false; + /* + * Pairs with the smp_mb__after_atomic() in + * kvm_arch_async_page_present_queued(). + */ + smp_store_mb(vcpu->arch.apf.pageready_pending, false); + kvm_check_async_pf_completion(vcpu); } break; @@ -13890,7 +13895,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, if ((work->wakeup_all || work->notpresent_injected) && kvm_pv_async_pf_enabled(vcpu) && !apf_put_user_ready(vcpu, work->arch.token)) { - vcpu->arch.apf.pageready_pending = true; + WRITE_ONCE(vcpu->arch.apf.pageready_pending, true); kvm_apic_set_irq(vcpu, &irq, NULL); } @@ -13901,7 +13906,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) { kvm_make_request(KVM_REQ_APF_READY, vcpu); - if (!vcpu->arch.apf.pageready_pending) + + /* Pairs with smp_store_mb() in kvm_set_msr_common(). */ + smp_mb__after_atomic(); + + if (!READ_ONCE(vcpu->arch.apf.pageready_pending)) kvm_vcpu_kick(vcpu); } From ab4e41eb9fabd4607304fa7cfe8ec9c0bd8e1552 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 14 Oct 2025 23:32:58 -0400 Subject: [PATCH 092/260] KVM: x86: Don't clear async #PF queue when CR0.PG is disabled (e.g. on #SMI) Fix an interaction between SMM and PV asynchronous #PFs where an #SMI can cause KVM to drop an async #PF ready event, and thus result in guest tasks becoming permanently stuck due to the task that encountered the #PF never being resumed. Specifically, don't clear the completion queue when paging is disabled, and re-check for completed async #PFs if/when paging is enabled. Prior to commit 2635b5c4a0e4 ("KVM: x86: interrupt based APF 'page ready' event delivery"), flushing the APF queue without notifying the guest of completed APF requests when paging is disabled was "necessary", in that delivering a #PF to the guest when paging is disabled would likely confuse and/or crash the guest. And presumably the original async #PF development assumed that a guest would only disable paging when there was no intent to ever re-enable paging. That assumption fails in several scenarios, most visibly on an emulated SMI, as entering SMM always disables CR0.PG (i.e. 
initially runs with paging disabled).  When the SMM handler eventually
executes RSM, the interrupted paging-enabled state is restored, and the
async #PF event is lost.  Similarly, invoking firmware, e.g. via EFI
runtime calls, might require a transition through paging modes and thus
also disable paging with valid entries in the completion queue.

To avoid dropping completion events, drop the "clear" entirely, and
handle paging-enable transitions in the same way KVM already handles
APIC enable/disable events: if a vCPU's APIC is disabled, APF completion
events are kept pending and not injected while the APIC is disabled.
Once a vCPU's APIC is re-enabled, KVM raises KVM_REQ_APF_READY so that
the vCPU recognizes any pending #APF ready events.

Signed-off-by: Maxim Levitsky
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20251015033258.50974-4-mlevitsk@redhat.com
[sean: rework changelog to call out #PF injection, drop "real mode"
 references, expand the code comment]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 57ade075bae3..877c8766c551 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1045,6 +1045,13 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr);
 
+static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+	u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+	return (vcpu->arch.apf.msr_en_val & mask) == mask;
+}
+
 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -1137,15 +1144,20 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
 	}
 
 	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
-		kvm_clear_async_pf_completion_queue(vcpu);
-		kvm_async_pf_hash_reset(vcpu);
-
 		/*
 		 * Clearing CR0.PG is defined to flush the TLB from the guest's
 		 * perspective.
 		 */
 		if (!(cr0 & X86_CR0_PG))
 			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+		/*
+		 * Check for async #PF completion events when enabling paging,
+		 * as the vCPU may have previously encountered async #PFs (it's
+		 * entirely legal for the guest to toggle paging on/off without
+		 * waiting for the async #PF queue to drain).
+		 */
+		else if (kvm_pv_async_pf_enabled(vcpu))
+			kvm_make_request(KVM_REQ_APF_READY, vcpu);
 	}
 
 	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
@@ -3650,13 +3662,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	return 0;
 }
 
-static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
-{
-	u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
-
-	return (vcpu->arch.apf.msr_en_val & mask) == mask;
-}
-
 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 {
 	gpa_t gpa = data & ~0x3f;

From c0711f8c610e1634ed54fb04da1e82252730306a Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 30 Oct 2025 12:15:25 -0700
Subject: [PATCH 093/260] KVM: TDX: Explicitly set user-return MSRs that *may*
 be clobbered by the TDX-Module

Set all user-return MSRs to their post-TD-exit value when preparing to
run a TDX vCPU to ensure the value that KVM expects to be loaded after
running the vCPU is indeed the value that's loaded in hardware.  If the
TDX-Module doesn't actually enter the guest, i.e. doesn't do VM-Enter,
then it won't "restore" VMM state, i.e.
won't clobber user-return MSRs to their expected post-run values, in which case simply updating KVM's "cached" value will effectively corrupt the cache due to hardware still holding the original value. In theory, KVM could conditionally update the current user-return value if and only if tdh_vp_enter() succeeds, but in practice "success" doesn't guarantee the TDX-Module actually entered the guest, e.g. if the TDX-Module synthesizes an EPT Violation because it suspects a zero-step attack. Force-load the expected values instead of trying to decipher whether or not the TDX-Module restored/clobbered MSRs, as the risk doesn't justify the benefits. Effectively avoiding four WRMSRs once per run loop (even if the vCPU is scheduled out, user-return MSRs only need to be reloaded if the CPU exits to userspace or runs a non-TDX vCPU) is likely in the noise when amortized over all entries, given the cost of running a TDX vCPU. E.g. the cost of the WRMSRs is somewhere between ~300 and ~500 cycles, whereas the cost of a _single_ roundtrip to/from a TDX guest is thousands of cycles. Fixes: e0b4f31a3c65 ("KVM: TDX: restore user ret MSRs") Cc: stable@vger.kernel.org Cc: Yan Zhao Cc: Xiaoyao Li Cc: Rick Edgecombe Reviewed-by: Xiaoyao Li Link: https://patch.msgid.link/20251030191528.3380553-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx/tdx.c | 56 ++++++++++++++------------------- arch/x86/kvm/vmx/tdx.h | 1 - arch/x86/kvm/x86.c | 9 ------ 4 files changed, 23 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4fbe4b7ce1da..a557c504c1a4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2379,7 +2379,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); -void kvm_user_return_msr_update_cache(unsigned int index, u64 val); u64 kvm_get_user_return_msr(unsigned int slot); static inline bool kvm_is_supported_user_return_msr(u32 msr) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 67c190ce8104..163f854a39f2 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -763,25 +763,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) return tdx_vcpu_state_details_intr_pending(vcpu_state_details); } -/* - * Compared to vmx_prepare_switch_to_guest(), there is not much to do - * as SEAMCALL/SEAMRET calls take care of most of save and restore. 
- */ -void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) -{ - struct vcpu_vt *vt = to_vt(vcpu); - - if (vt->guest_state_loaded) - return; - - if (likely(is_64bit_mm(current->mm))) - vt->msr_host_kernel_gs_base = current->thread.gsbase; - else - vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); - - vt->guest_state_loaded = true; -} - struct tdx_uret_msr { u32 msr; unsigned int slot; @@ -795,19 +776,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = { {.msr = MSR_TSC_AUX,}, }; -static void tdx_user_return_msr_update_cache(void) +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { + struct vcpu_vt *vt = to_vt(vcpu); int i; + if (vt->guest_state_loaded) + return; + + if (likely(is_64bit_mm(current->mm))) + vt->msr_host_kernel_gs_base = current->thread.gsbase; + else + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + + vt->guest_state_loaded = true; + + /* + * Explicitly set user-return MSRs that are clobbered by the TDX-Module + * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be + * written by the TDX-Module. Don't rely on the TDX-Module to actually + * clobber the MSRs, as the contract is poorly defined and not upheld. + * E.g. the TDX-Module will synthesize an EPT Violation without doing + * VM-Enter if it suspects a zero-step attack, and never "restore" VMM + * state. + */ for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) - kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, - tdx_uret_msrs[i].defval); + kvm_set_user_return_msr(tdx_uret_msrs[i].slot, + tdx_uret_msrs[i].defval, -1ull); } static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); - struct vcpu_tdx *tdx = to_tdx(vcpu); if (!vt->guest_state_loaded) return; @@ -815,11 +815,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) ++vcpu->stat.host_state_reload; wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); - if (tdx->guest_entered) { - tdx_user_return_msr_update_cache(); - tdx->guest_entered = false; - } - vt->guest_state_loaded = false; } @@ -1059,7 +1054,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) update_debugctlmsr(vcpu->arch.host_debugctl); tdx_load_host_xsave_state(vcpu); - tdx->guest_entered = true; vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; @@ -3443,10 +3437,6 @@ static int __init __tdx_bringup(void) /* * Check if MSRs (tdx_uret_msrs) can be saved/restored * before returning to user space. - * - * this_cpu_ptr(user_return_msrs)->registered isn't checked - * because the registration is done at vcpu runtime by - * tdx_user_return_msr_update_cache(). 
 	 */
 	tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
 	if (tdx_uret_msrs[i].slot == -1) {
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ca39a9391db1..7f258870dc41 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -67,7 +67,6 @@ struct vcpu_tdx {
 	u64 vp_enter_ret;
 
 	enum vcpu_tdx_state state;
-	bool guest_entered;
 
 	u64 map_gpa_next;
 	u64 map_gpa_end;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 877c8766c551..f4ce4292eb52 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -681,15 +681,6 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
 
-void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
-{
-	struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
-
-	msrs->values[slot].curr = value;
-	kvm_user_return_register_notifier(msrs);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache);
-
 u64 kvm_get_user_return_msr(unsigned int slot)
 {
 	return this_cpu_ptr(user_return_msrs)->values[slot].curr;

From b371174d2fa60bbbb730a1a5292c865d12036c2a Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 30 Oct 2025 12:15:26 -0700
Subject: [PATCH 094/260] KVM: x86: WARN if user-return MSR notifier is
 registered on exit

When freeing the per-CPU user-return MSRs structures, WARN if any CPU
has a registered notifier to help detect and/or debug potential
use-after-free issues.  The lifecycle of the notifiers is rather
convoluted, and has several non-obvious paths where notifiers are
unregistered, i.e. isn't exactly the most robust code possible.

The notifiers are registered on-demand in KVM, on the first WRMSR to a
tracked register.  _Usually_ the notifier is unregistered whenever the
CPU returns to userspace.  But because any given CPU isn't guaranteed to
return to userspace, e.g. the CPU could be offlined before doing so, KVM
also "drops", a.k.a. unregisters, the notifiers when virtualization is
disabled on the CPU.

Further complicating the unregister path is the fact that the calls to
disable virtualization come from common KVM, and the per-CPU calls are
guarded by a per-CPU flag (to harden _that_ code against bugs, e.g. due
to mishandling reboot).

Reboot/shutdown in particular is problematic, as KVM disables
virtualization via IPI function call, i.e. from IRQ context, instead of
using the cpuhp framework, which runs in task context.  I.e. on
reboot/shutdown, drop_user_return_notifiers() is called asynchronously.

Forced reboot/shutdown is the most problematic scenario, as userspace
tasks are not frozen before kvm_shutdown() is invoked, i.e. KVM could be
actively manipulating the user-return MSR lists and/or notifiers when
the IPI arrives.  To a certain extent, all bets are off when userspace
forces a reboot/shutdown, but KVM should at least avoid a
use-after-free, e.g. to avoid crashing the kernel when trying to reboot.
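For context, a rough sketch of the notifier lifecycle being hardened
(paths named after the existing helpers):

	first WRMSR to a tracked MSR -> kvm_set_user_return_msr()
	                                -> register per-CPU notifier
	CPU returns to userspace     -> kvm_on_user_return()
	                                -> restore host values, unregister
	virt disabled (cpuhp, or IPI
	on reboot/shutdown)          -> drop_user_return_notifiers()

The new WARN fires if any CPU still has a registered notifier when the
per-CPU structures are freed, i.e. if none of the unregister paths ran.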
Link: https://patch.msgid.link/20251030191528.3380553-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4ce4292eb52..4c089b11495a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -575,6 +575,27 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) vcpu->arch.apf.gfns[i] = ~0; } +static int kvm_init_user_return_msrs(void) +{ + user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); + if (!user_return_msrs) { + pr_err("failed to allocate percpu user_return_msrs\n"); + return -ENOMEM; + } + kvm_nr_uret_msrs = 0; + return 0; +} + +static void kvm_free_user_return_msrs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu_ptr(user_return_msrs, cpu)->registered); + + free_percpu(user_return_msrs); +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -10045,13 +10066,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -ENOMEM; } - user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); - if (!user_return_msrs) { - pr_err("failed to allocate percpu kvm_user_return_msrs\n"); - r = -ENOMEM; + r = kvm_init_user_return_msrs(); + if (r) goto out_free_x86_emulator_cache; - } - kvm_nr_uret_msrs = 0; r = kvm_mmu_vendor_module_init(); if (r) @@ -10154,7 +10171,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) out_mmu_exit: kvm_mmu_vendor_module_exit(); out_free_percpu: - free_percpu(user_return_msrs); + kvm_free_user_return_msrs(); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; @@ -10183,7 +10200,7 @@ void kvm_x86_vendor_exit(void) #endif kvm_x86_call(hardware_unsetup)(); kvm_mmu_vendor_module_exit(); - free_percpu(user_return_msrs); + kvm_free_user_return_msrs(); kmem_cache_destroy(x86_emulator_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); From 2baa33a8ddd61feb1347db95271f157fd9e9d53d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 12:15:27 -0700 Subject: [PATCH 095/260] KVM: x86: Leave user-return notifier registered on reboot/shutdown Leave KVM's user-return notifier registered in the unlikely case that the notifier is registered when disabling virtualization via IPI callback in response to reboot/shutdown. On reboot/shutdown, keeping the notifier registered is ok as far as MSR state is concerned (arguably better then restoring MSRs at an unknown point in time), as the callback will run cleanly and restore host MSRs if the CPU manages to return to userspace before the system goes down. The only wrinkle is that if kvm.ko module unload manages to race with reboot/shutdown, then leaving the notifier registered could lead to use-after-free due to calling into unloaded kvm.ko module code. But such a race is only possible on --forced reboot/shutdown, because otherwise userspace tasks would be frozen before kvm_shutdown() is called, i.e. on a "normal" reboot/shutdown, it should be impossible for the CPU to return to userspace after kvm_shutdown(). Furthermore, on a --forced reboot/shutdown, unregistering the user-return hook from IRQ context doesn't fully guard against use-after-free, because KVM could immediately re-register the hook, e.g. if the IRQ arrives before kvm_user_return_register_notifier() is called. 
Rather than trying to guard against the IPI in the "normal" user-return
code, which is difficult and noisy, simply leave the user-return
notifier registered on a reboot, and bump the kvm.ko module refcount to
defend against a use-after-free due to kvm.ko unload racing against
reboot.

Alternatively, KVM could allow kvm.ko to be unloaded and try to drop the
notifiers during kvm_x86_exit(), but that's also a can of worms as
registration is per-CPU, and so KVM would need to blast an IPI, and
doing so while a reboot/shutdown is in progress is far riskier than
preventing userspace from unloading KVM.

Link: https://patch.msgid.link/20251030191528.3380553-4-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c089b11495a..3bc3d527f442 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13108,7 +13108,21 @@ int kvm_arch_enable_virtualization_cpu(void)
 void kvm_arch_disable_virtualization_cpu(void)
 {
 	kvm_x86_call(disable_virtualization_cpu)();
-	drop_user_return_notifiers();
+
+	/*
+	 * Leave the user-return notifiers as-is when disabling virtualization
+	 * for reboot, i.e. when disabling via IPI function call, and instead
+	 * pin kvm.ko (if it's a module) to defend against use-after-free (in
+	 * the *very* unlikely scenario module unload is racing with reboot).
+	 * On a forced reboot, tasks aren't frozen before shutdown, and so KVM
+	 * could be actively modifying user-return MSR state when the IPI to
+	 * disable virtualization arrives.  Handle the extreme edge case here
+	 * instead of trying to account for it in the normal flows.
+	 */
+	if (in_task() || WARN_ON_ONCE(!kvm_rebooting))
+		drop_user_return_notifiers();
+	else
+		__module_get(THIS_MODULE);
 }
 
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)

From 995d504100cf66d846461133f8862b483295f995 Mon Sep 17 00:00:00 2001
From: Hou Wenlong
Date: Thu, 30 Oct 2025 12:15:28 -0700
Subject: [PATCH 096/260] KVM: x86: Don't disable IRQs when unregistering
 user-return notifier

Remove the code to disable IRQs when unregistering KVM's user-return
notifier now that KVM doesn't invoke kvm_on_user_return() when disabling
virtualization via IPI function call, i.e. now that there's no need to
guard against re-entrancy via IPI callback.

Note, disabling IRQs has largely been unnecessary since commit
a377ac1cd9d7b ("x86/entry: Move user return notifier out of loop") moved
fire_user_return_notifiers() into the section with IRQs disabled.  In
doing so, the commit somewhat inadvertently fixed the underlying issue
that was papered over by commit 1650b4ebc99d ("KVM: Disable irq while
unregistering user notifier").  I.e. in practice, the code and comment
have been stale since commit a377ac1cd9d7b.
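For reference, a simplified sketch of the exit-to-userspace path since
that commit (shape only, not a verbatim quote of the entry code):

	/* exit to userspace, generic entry (simplified sketch) */
	local_irq_disable();
	exit_to_user_mode_prepare(regs);
		/* -> arch hook fires user-return notifiers, IRQs off */

i.e. kvm_on_user_return() already runs with IRQs disabled on the normal
path, so KVM's local_irq_save()/local_irq_restore() pair was redundant
even before the IPI re-entrancy path went away.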
Signed-off-by: Hou Wenlong
[sean: rewrite changelog after rebasing, drop lockdep assert]
Reviewed-by: Kai Huang
Link: https://patch.msgid.link/20251030191528.3380553-5-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3bc3d527f442..97852f190720 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -602,18 +602,10 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
 	struct kvm_user_return_msrs *msrs
 		= container_of(urn, struct kvm_user_return_msrs, urn);
 	struct kvm_user_return_msr_values *values;
-	unsigned long flags;
 
-	/*
-	 * Disabling irqs at this point since the following code could be
-	 * interrupted and executed through kvm_arch_disable_virtualization_cpu()
-	 */
-	local_irq_save(flags);
-	if (msrs->registered) {
-		msrs->registered = false;
-		user_return_notifier_unregister(urn);
-	}
-	local_irq_restore(flags);
+	msrs->registered = false;
+	user_return_notifier_unregister(urn);
+
 	for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
 		values = &msrs->values[slot];
 		if (values->host != values->curr) {

From dfd1572a64c90770a2bddfab9bbb69932217b1da Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 6 Nov 2025 12:51:14 -0800
Subject: [PATCH 097/260] KVM: VMX: Make loaded_vmcs_clear() static in vmx.c

Make loaded_vmcs_clear() local to vmx.c as there are no longer any
external callers.

No functional change intended.

Link: https://patch.msgid.link/20251106205114.218226-1-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 arch/x86/kvm/vmx/vmx.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1021d3b65ea0..d484f60fbeba 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -752,7 +752,7 @@ static void __loaded_vmcs_clear(void *arg)
 	loaded_vmcs->launched = 0;
 }
 
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
 	int cpu = loaded_vmcs->cpu;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 6cb04a6afeef..bc3ed3145d7e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -680,7 +680,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
 void free_vmcs(struct vmcs *vmcs);
 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
 
 static inline struct vmcs *alloc_vmcs(bool shadow)
 {

From ad9c62bd8946621ed02ac94131a921222508a8bc Mon Sep 17 00:00:00 2001
From: Jiaqi Yan
Date: Mon, 13 Oct 2025 18:59:01 +0000
Subject: [PATCH 098/260] KVM: arm64: VM exit to userspace to handle SEA

When APEI fails to handle a stage-2 synchronous external abort (SEA),
today KVM injects an asynchronous SError to the VCPU then resumes it,
which usually results in an unpleasant guest kernel panic.

One major situation of guest SEA is when a vCPU consumes a recoverable
uncorrected memory error (UER). Although SError and guest kernel panic
effectively stop the propagation of corrupted memory, the guest may
re-use the corrupted memory if auto-rebooted; in the worst case, guest
boot may run into poisoned memory. So there is room to recover from a
UER in a more graceful manner.

Alternatively KVM can redirect the synchronous SEA event to VMM to
- Reduce blast radius if possible.
  VMM can inject a SEA to the VCPU via KVM's existing
  KVM_SET_VCPU_EVENTS API. If the memory poison consumption or fault is
  not from the guest kernel, the blast radius can be limited to the
  triggering thread in guest userspace, so the VM can keep running.
- Allow VMM to protect from future memory poison consumption by
  unmapping the page from stage-2, or to notify the guest of the
  poisoned page so the guest kernel can unmap it from its stage-1 page
  table.
- Allow VMM to track SEA events that VM customers care about, to
  restart the VM when a certain number of distinct poison events have
  happened, and to provide observability to customers in log management
  UI.

Introduce a userspace-visible feature that lets the VMM handle SEAs:
- KVM_CAP_ARM_SEA_TO_USER. As the alternative fallback behavior when
  host APEI fails to claim a SEA, userspace can opt in this new
  capability to let KVM exit to userspace during SEA if it is not owned
  by the host.
- KVM_EXIT_ARM_SEA. A new exit reason is introduced for this. KVM fills
  kvm_run.arm_sea with as much information as possible about the SEA,
  enabling VMM to emulate the SEA to the guest by itself.
  - Sanitized ESR_EL2. The general rule is to keep only the bits useful
    for userspace and relevant to guest memory.
  - Flags indicating if the faulting guest physical address is valid.
  - Faulting guest physical and virtual addresses, if valid.

Signed-off-by: Jiaqi Yan
Co-developed-by: Oliver Upton
Signed-off-by: Oliver Upton
Link: https://msgid.link/20251013185903.1372553-2-jiaqiyan@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/include/asm/kvm_host.h |  2 +
 arch/arm64/kvm/arm.c              |  5 +++
 arch/arm64/kvm/mmu.c              | 68 ++++++++++++++++++++++++++++++-
 include/uapi/linux/kvm.h          | 10 +++++
 4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 64302c438355..366bf337ef64 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -350,6 +350,8 @@ struct kvm_arch {
 #define KVM_ARCH_FLAG_GUEST_HAS_SVE			9
 	/* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */
 #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS		10
+	/* Unhandled SEAs are taken to userspace */
+#define KVM_ARCH_FLAG_EXIT_SEA				11
 	unsigned long flags;
 
 	/* VM-wide vCPU feature set */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 870953b4a8a7..511d2e8ef6c7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -132,6 +132,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		}
 		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_CAP_ARM_SEA_TO_USER:
+		r = 0;
+		set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags);
+		break;
 	default:
 		break;
 	}
@@ -327,6 +331,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IRQFD_RESAMPLE:
 	case KVM_CAP_COUNTER_OFFSET:
 	case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+	case KVM_CAP_ARM_SEA_TO_USER:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7cc964af8d30..58cb169727a6 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1899,8 +1899,48 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 	read_unlock(&vcpu->kvm->mmu_lock);
 }
 
+/*
+ * Returns true if the SEA should be handled locally within KVM if the abort
+ * is caused by a kernel memory allocation (e.g. stage-2 table memory).
+ */
+static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
+{
+	/*
+	 * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
+	 * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
+	 * stage-2 PTW).
+ */ + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return true; + + /* KVM owns the VNCR when the vCPU isn't in a nested context. */ + if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR)) + return true; + + /* + * Determining if an external abort during a table walk happened at + * stage-2 is only possible with S1PTW is set. Otherwise, since KVM + * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the + * PA of the stage-1 descriptor) can reach here and are reported + * with a TTW ESR value. + */ + return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW)); +} + int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; + struct kvm_run *run = vcpu->run; + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 esr_mask = ESR_ELx_EC_MASK | + ESR_ELx_IL | + ESR_ELx_FnV | + ESR_ELx_EA | + ESR_ELx_CM | + ESR_ELx_WNR | + ESR_ELx_FSC; + u64 ipa; + /* * Give APEI the opportunity to claim the abort before handling it * within KVM. apei_claim_sea() expects to be called with IRQs enabled. @@ -1909,7 +1949,33 @@ int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) if (apei_claim_sea(NULL) == 0) return 1; - return kvm_inject_serror(vcpu); + if (host_owns_sea(vcpu, esr) || + !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags)) + return kvm_inject_serror(vcpu); + + /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */ + if (kvm_has_ras(kvm)) + esr_mask |= ESR_ELx_SET_MASK; + + /* + * Exit to userspace, and provide faulting guest virtual and physical + * addresses in case userspace wants to emulate SEA to guest by + * writing to FAR_ELx and HPFAR_ELx registers. + */ + memset(&run->arm_sea, 0, sizeof(run->arm_sea)); + run->exit_reason = KVM_EXIT_ARM_SEA; + run->arm_sea.esr = esr & esr_mask; + + if (!(esr & ESR_ELx_FnV)) + run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); + + ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (ipa != INVALID_GPA) { + run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; + run->arm_sea.gpa = ipa; + } + + return 0; } /** diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52f6000ab020..1e541193e98d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -179,6 +179,7 @@ struct kvm_xen_exit { #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 +#define KVM_EXIT_ARM_SEA 41 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -473,6 +474,14 @@ struct kvm_run { } setup_event_notify; }; } tdx; + /* KVM_EXIT_ARM_SEA */ + struct { +#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID (1ULL << 0) + __u64 flags; + __u64 esr; + __u64 gva; + __u64 gpa; + } arm_sea; /* Fix the size of the union. */ char padding[256]; }; @@ -963,6 +972,7 @@ struct kvm_enable_cap { #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 #define KVM_CAP_GUEST_MEMFD_FLAGS 244 +#define KVM_CAP_ARM_SEA_TO_USER 245 struct kvm_irq_routing_irqchip { __u32 irqchip; From feee9ef7ac1648835c3b1e35ba4833cf7c8b3669 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Mon, 13 Oct 2025 18:59:02 +0000 Subject: [PATCH 099/260] KVM: selftests: Test for KVM_EXIT_ARM_SEA Test how KVM handles guest SEA when APEI is unable to claim it, and KVM_CAP_ARM_SEA_TO_USER is enabled. The behavior is triggered by consuming recoverable memory error (UER) injected via EINJ. The test asserts two major things: 1. KVM returns to userspace with KVM_EXIT_ARM_SEA exit reason, and has provided expected fault information, e.g. esr, flags, gva, gpa. 2. 
Userspace is able to handle KVM_EXIT_ARM_SEA by injecting a SEA into
   the guest, and KVM injects the expected SEA into the VCPU.

Tested on a data center server running a Siryn AmpereOne processor that
has RAS support. Several things to notice before attempting to run this
selftest:
- The test relies on EINJ support in both firmware and kernel to inject
  UER. Otherwise the test will be skipped.
- The under-test platform's APEI should be unable to claim the SEA.
  Otherwise the test will be skipped.
- Some platforms don't support notrigger in EINJ, which may cause APEI
  and GHES to offline the memory before the guest can consume the
  injected UER, making the test unable to trigger a SEA.

Signed-off-by: Jiaqi Yan
Link: https://msgid.link/20251013185903.1372553-3-jiaqiyan@google.com
Signed-off-by: Oliver Upton
---
 tools/arch/arm64/include/asm/esr.h            |   2 +
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../testing/selftests/kvm/arm64/sea_to_user.c | 331 ++++++++++++++++++
 tools/testing/selftests/kvm/lib/kvm_util.c    |   1 +
 4 files changed, 335 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/arm64/sea_to_user.c

diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h
index bd592ca81571..0fa17b3af1f7 100644
--- a/tools/arch/arm64/include/asm/esr.h
+++ b/tools/arch/arm64/include/asm/esr.h
@@ -141,6 +141,8 @@
 #define ESR_ELx_SF 			(UL(1) << ESR_ELx_SF_SHIFT)
 #define ESR_ELx_AR_SHIFT	(14)
 #define ESR_ELx_AR 		(UL(1) << ESR_ELx_AR_SHIFT)
+#define ESR_ELx_VNCR_SHIFT	(13)
+#define ESR_ELx_VNCR 	(UL(1) << ESR_ELx_VNCR_SHIFT)
 #define ESR_ELx_CM_SHIFT	(8)
 #define ESR_ELx_CM 		(UL(1) << ESR_ELx_CM_SHIFT)
 
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 148d427ff24b..02a7663c097b 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -163,6 +163,7 @@ TEST_GEN_PROGS_arm64 += arm64/hypercalls
 TEST_GEN_PROGS_arm64 += arm64/external_aborts
 TEST_GEN_PROGS_arm64 += arm64/page_fault_test
 TEST_GEN_PROGS_arm64 += arm64/psci_test
+TEST_GEN_PROGS_arm64 += arm64/sea_to_user
 TEST_GEN_PROGS_arm64 += arm64/set_id_regs
 TEST_GEN_PROGS_arm64 += arm64/smccc_filter
 TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config
diff --git a/tools/testing/selftests/kvm/arm64/sea_to_user.c b/tools/testing/selftests/kvm/arm64/sea_to_user.c
new file mode 100644
index 000000000000..573dd790aeb8
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/sea_to_user.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test KVM returns to userspace with KVM_EXIT_ARM_SEA if host APEI fails
+ * to handle SEA and userspace has opted in to KVM_CAP_ARM_SEA_TO_USER.
+ *
+ * After reaching userspace with expected arm_sea info, also test userspace
+ * injecting a synchronous external data abort into the guest.
+ *
+ * This test utilizes EINJ to generate a REAL synchronous external data
+ * abort by consuming a recoverable uncorrectable memory error. Therefore
+ * the device under test must support EINJ in both firmware and host kernel,
+ * including the notrigger feature. Otherwise the test will be skipped.
+ * The under-test platform's APEI should be unable to claim the SEA.
+ * Otherwise the test will also be skipped.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define PAGE_PRESENT		(1ULL << 63)
+#define PAGE_PHYSICAL		0x007fffffffffffffULL
+#define PAGE_ADDR_MASK		(~(0xfffULL))
+
+/* Group ISV and ISS[23:14].
+ */
+#define ESR_ELx_INST_SYNDROME	((ESR_ELx_ISV) | (ESR_ELx_SAS) | \
+				 (ESR_ELx_SSE) | (ESR_ELx_SRT_MASK) | \
+				 (ESR_ELx_SF) | (ESR_ELx_AR))
+
+#define EINJ_ETYPE	"/sys/kernel/debug/apei/einj/error_type"
+#define EINJ_ADDR	"/sys/kernel/debug/apei/einj/param1"
+#define EINJ_MASK	"/sys/kernel/debug/apei/einj/param2"
+#define EINJ_FLAGS	"/sys/kernel/debug/apei/einj/flags"
+#define EINJ_NOTRIGGER	"/sys/kernel/debug/apei/einj/notrigger"
+#define EINJ_DOIT	"/sys/kernel/debug/apei/einj/error_inject"
+/* Memory Uncorrectable non-fatal. */
+#define ERROR_TYPE_MEMORY_UER	0x10
+/* Memory address and mask valid (param1 and param2). */
+#define MASK_MEMORY_UER	0b10
+
+/* Guest virtual address region = [2G, 3G). */
+#define START_GVA	0x80000000UL
+#define VM_MEM_SIZE	0x40000000UL
+/* Note: EINJ_OFFSET must be < VM_MEM_SIZE. */
+#define EINJ_OFFSET	0x01234badUL
+#define EINJ_GVA	((START_GVA) + (EINJ_OFFSET))
+
+static vm_paddr_t einj_gpa;
+static void *einj_hva;
+static uint64_t einj_hpa;
+static bool far_invalid;
+
+static uint64_t translate_to_host_paddr(unsigned long vaddr)
+{
+	uint64_t pinfo;
+	int64_t offset = vaddr / getpagesize() * sizeof(pinfo);
+	int fd;
+	uint64_t page_addr;
+	uint64_t paddr;
+
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd < 0)
+		ksft_exit_fail_perror("Failed to open /proc/self/pagemap");
+	if (pread(fd, &pinfo, sizeof(pinfo), offset) != sizeof(pinfo)) {
+		close(fd);
+		ksft_exit_fail_perror("Failed to read /proc/self/pagemap");
+	}
+
+	close(fd);
+
+	if ((pinfo & PAGE_PRESENT) == 0)
+		ksft_exit_fail_perror("Page not present");
+
+	page_addr = (pinfo & PAGE_PHYSICAL) << MIN_PAGE_SHIFT;
+	paddr = page_addr + (vaddr & (getpagesize() - 1));
+	return paddr;
+}
+
+static void write_einj_entry(const char *einj_path, uint64_t val)
+{
+	char cmd[256] = {0};
+	FILE *cmdfile = NULL;
+
+	sprintf(cmd, "echo %#lx > %s", val, einj_path);
+	cmdfile = popen(cmd, "r");
+
+	if (pclose(cmdfile) == 0)
+		ksft_print_msg("echo %#lx > %s - done\n", val, einj_path);
+	else
+		ksft_exit_fail_perror("Failed to write EINJ entry");
+}
+
+static void inject_uer(uint64_t paddr)
+{
+	if (access("/sys/firmware/acpi/tables/EINJ", R_OK) == -1)
+		ksft_test_result_skip("EINJ table not available in firmware");
+
+	if (access(EINJ_ETYPE, R_OK | W_OK) == -1)
+		ksft_test_result_skip("EINJ module probably not loaded?");
+
+	write_einj_entry(EINJ_ETYPE, ERROR_TYPE_MEMORY_UER);
+	write_einj_entry(EINJ_FLAGS, MASK_MEMORY_UER);
+	write_einj_entry(EINJ_ADDR, paddr);
+	write_einj_entry(EINJ_MASK, ~0x0UL);
+	write_einj_entry(EINJ_NOTRIGGER, 1);
+	write_einj_entry(EINJ_DOIT, 1);
+}
+
+/*
+ * When the host APEI successfully claims the SEA caused by guest_code,
+ * the kernel sends a SIGBUS signal with BUS_MCEERR_AR to the test thread.
+ *
+ * We set up this SIGBUS handler to skip the test for that case.
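+ * (A SIGBUS with any other si_code is unexpected and fails the test.)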
+ */
+static void sigbus_signal_handler(int sig, siginfo_t *si, void *v)
+{
+	ksft_print_msg("SIGBUS (%d) received, dumping siginfo...\n", sig);
+	ksft_print_msg("si_signo=%d, si_errno=%d, si_code=%d, si_addr=%p\n",
+		       si->si_signo, si->si_errno, si->si_code, si->si_addr);
+	if (si->si_code == BUS_MCEERR_AR)
+		ksft_test_result_skip("SEA is claimed by host APEI\n");
+	else
+		ksft_test_result_fail("Exit with signal unhandled\n");
+
+	exit(0);
+}
+
+static void setup_sigbus_handler(void)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_sigaction = sigbus_signal_handler;
+	act.sa_flags = SA_SIGINFO;
+	TEST_ASSERT(sigaction(SIGBUS, &act, NULL) == 0,
+		    "Failed to setup SIGBUS handler");
+}
+
+static void guest_code(void)
+{
+	uint64_t guest_data;
+
+	/* Consuming the error will cause an SEA. */
+	guest_data = *(uint64_t *)EINJ_GVA;
+
+	GUEST_FAIL("Poison not protected by SEA: gva=%#lx, guest_data=%#lx\n",
+		   EINJ_GVA, guest_data);
+}
+
+static void expect_sea_handler(struct ex_regs *regs)
+{
+	u64 esr = read_sysreg(esr_el1);
+	u64 far = read_sysreg(far_el1);
+	bool expect_far_invalid = far_invalid;
+
+	GUEST_PRINTF("Handling Guest SEA\n");
+	GUEST_PRINTF("ESR_EL1=%#lx, FAR_EL1=%#lx\n", esr, far);
+
+	GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR);
+	GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT);
+
+	if (expect_far_invalid) {
+		GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, ESR_ELx_FnV);
+		GUEST_PRINTF("Guest observed garbage value in FAR\n");
+	} else {
+		GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, 0);
+		GUEST_ASSERT_EQ(far, EINJ_GVA);
+	}
+
+	GUEST_DONE();
+}
+
+static void vcpu_inject_sea(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_events events = {};
+
+	events.exception.ext_dabt_pending = true;
+	vcpu_events_set(vcpu, &events);
+}
+
+static void run_vm(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+	bool guest_done = false;
+	struct kvm_run *run = vcpu->run;
+	u64 esr;
+
+	/* Resume the vCPU after error injection to consume the error. */
+	vcpu_run(vcpu);
+
+	ksft_print_msg("Dump kvm_run info about KVM_EXIT_%s\n",
+		       exit_reason_str(run->exit_reason));
+	ksft_print_msg("kvm_run.arm_sea: esr=%#llx, flags=%#llx\n",
+		       run->arm_sea.esr, run->arm_sea.flags);
+	ksft_print_msg("kvm_run.arm_sea: gva=%#llx, gpa=%#llx\n",
+		       run->arm_sea.gva, run->arm_sea.gpa);
+
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_SEA);
+
+	esr = run->arm_sea.esr;
+	TEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_LOW);
+	TEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT);
+	TEST_ASSERT_EQ(ESR_ELx_ISS2(esr), 0);
+	TEST_ASSERT_EQ((esr & ESR_ELx_INST_SYNDROME), 0);
+	TEST_ASSERT_EQ(esr & ESR_ELx_VNCR, 0);
+
+	if (!(esr & ESR_ELx_FnV)) {
+		ksft_print_msg("Expect gva to match given FnV bit is 0\n");
+		TEST_ASSERT_EQ(run->arm_sea.gva, EINJ_GVA);
+	}
+
+	if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) {
+		ksft_print_msg("Expect gpa to match given KVM_EXIT_ARM_SEA_FLAG_GPA_VALID is set\n");
+		TEST_ASSERT_EQ(run->arm_sea.gpa, einj_gpa & PAGE_ADDR_MASK);
+	}
+
+	far_invalid = esr & ESR_ELx_FnV;
+
+	/* Inject an SEA into the guest and expect it to be handled in the SEA handler. */
+	vcpu_inject_sea(vcpu);
+
+	/* Expect the guest to reach GUEST_DONE gracefully.
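+	 * The ucall loop below forwards guest printf output to the host log
+	 * and terminates on either UCALL_DONE or UCALL_ABORT.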
+	 */
+	do {
+		vcpu_run(vcpu);
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_PRINTF:
+			ksft_print_msg("From guest: %s", uc.buffer);
+			break;
+		case UCALL_DONE:
+			ksft_print_msg("Guest done gracefully!\n");
+			guest_done = 1;
+			break;
+		case UCALL_ABORT:
+			ksft_print_msg("Guest aborted!\n");
+			guest_done = 1;
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		default:
+			TEST_FAIL("Unexpected ucall: %lu\n", uc.cmd);
+		}
+	} while (!guest_done);
+}
+
+static struct kvm_vm *vm_create_with_sea_handler(struct kvm_vcpu **vcpu)
+{
+	size_t backing_page_size;
+	size_t guest_page_size;
+	size_t alignment;
+	uint64_t num_guest_pages;
+	vm_paddr_t start_gpa;
+	enum vm_mem_backing_src_type src_type = VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB;
+	struct kvm_vm *vm;
+
+	backing_page_size = get_backing_src_pagesz(src_type);
+	guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+	alignment = max(backing_page_size, guest_page_size);
+	num_guest_pages = VM_MEM_SIZE / guest_page_size;
+
+	vm = __vm_create_with_one_vcpu(vcpu, num_guest_pages, guest_code);
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(*vcpu);
+
+	vm_install_sync_handler(vm,
+				/*vector=*/VECTOR_SYNC_CURRENT,
+				/*ec=*/ESR_ELx_EC_DABT_CUR,
+				/*handler=*/expect_sea_handler);
+
+	start_gpa = (vm->max_gfn - num_guest_pages) * guest_page_size;
+	start_gpa = align_down(start_gpa, alignment);
+
+	vm_userspace_mem_region_add(
+		/*vm=*/vm,
+		/*src_type=*/src_type,
+		/*guest_paddr=*/start_gpa,
+		/*slot=*/1,
+		/*npages=*/num_guest_pages,
+		/*flags=*/0);
+
+	virt_map(vm, START_GVA, start_gpa, num_guest_pages);
+
+	ksft_print_msg("Mapped %#lx pages: gva=%#lx to gpa=%#lx\n",
+		       num_guest_pages, START_GVA, start_gpa);
+	return vm;
+}
+
+static void vm_inject_memory_uer(struct kvm_vm *vm)
+{
+	uint64_t guest_data;
+
+	einj_gpa = addr_gva2gpa(vm, EINJ_GVA);
+	einj_hva = addr_gva2hva(vm, EINJ_GVA);
+
+	/* Populate certain data before injecting the UER. */
+	*(uint64_t *)einj_hva = 0xBAADCAFE;
+	guest_data = *(uint64_t *)einj_hva;
+	ksft_print_msg("Before EINJect: data=%#lx\n",
+		       guest_data);
+
+	einj_hpa = translate_to_host_paddr((unsigned long)einj_hva);
+
+	ksft_print_msg("EINJ_GVA=%#lx, einj_gpa=%#lx, einj_hva=%p, einj_hpa=%#lx\n",
+		       EINJ_GVA, einj_gpa, einj_hva, einj_hpa);
+
+	inject_uer(einj_hpa);
+	ksft_print_msg("Memory UER EINJected\n");
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	struct kvm_vcpu *vcpu;
+
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SEA_TO_USER));
+
+	setup_sigbus_handler();
+
+	vm = vm_create_with_sea_handler(&vcpu);
+	vm_enable_cap(vm, KVM_CAP_ARM_SEA_TO_USER, 0);
+	vm_inject_memory_uer(vm);
+	run_vm(vm, vcpu);
+	kvm_vm_free(vm);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 1a93d6361671..234b0b7bf6e8 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2025,6 +2025,7 @@ static struct exit_reason {
 	KVM_EXIT_STRING(NOTIFY),
 	KVM_EXIT_STRING(LOONGARCH_IOCSR),
 	KVM_EXIT_STRING(MEMORY_FAULT),
+	KVM_EXIT_STRING(ARM_SEA),
 };

 /*

From 4debb5e8952e43c06c183a2efe2dd7820c55f196 Mon Sep 17 00:00:00 2001
From: Jiaqi Yan
Date: Mon, 13 Oct 2025 18:59:03 +0000
Subject: [PATCH 100/260] Documentation: kvm: new UAPI for handling SEA

Document the new userspace-visible features and APIs for handling
synchronous external aborts (SEA):

- KVM_CAP_ARM_SEA_TO_USER: how userspace enables the new feature.

- KVM_EXIT_ARM_SEA: the exit userspace gets when it needs to handle an
  SEA, and the information userspace receives when taking the SEA.
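For illustration only, a VMM's run loop might handle the new exit along
the lines of the sketch below. The fd names and error handling are
hypothetical, but KVM_SET_VCPU_EVENTS with ext_dabt_pending is the same
mechanism the selftest uses to forward the SEA to the guest:

	struct kvm_run *run = ...;	/* mmap'ed vCPU run structure */

	if (run->exit_reason == KVM_EXIT_ARM_SEA) {
		struct kvm_vcpu_events events = {};

		/* Optionally log run->arm_sea.esr/flags/gva/gpa here. */
		events.exception.ext_dabt_pending = 1;
		if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0)
			err(1, "KVM_SET_VCPU_EVENTS");
	}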
Signed-off-by: Jiaqi Yan
Link: https://msgid.link/20251013185903.1372553-4-jiaqiyan@google.com
[ oliver: make documentation concise, remove implementation detail ]
Signed-off-by: Oliver Upton
---
 Documentation/virt/kvm/api.rst | 47 ++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 57061fa29e6a..27f726ff8fe0 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7286,6 +7286,41 @@ exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, it
 will enter with output fields already valid; in the common case, the
 ``unknown.ret`` field of the union will be
 ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``. Userspace need not do anything if it
 does not wish to support a TDVMCALL.
+
+::
+
+		/* KVM_EXIT_ARM_SEA */
+		struct {
+  #define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID	(1ULL << 0)
+			__u64 flags;
+			__u64 esr;
+			__u64 gva;
+			__u64 gpa;
+		} arm_sea;
+
+Used on arm64 systems. When the VM capability ``KVM_CAP_ARM_SEA_TO_USER`` is
+enabled, KVM exits to userspace if a guest access causes a synchronous
+external abort (SEA) and the host APEI fails to handle the SEA.
+
+``esr`` is set to a sanitized value of ESR_EL2 from the exception taken to KVM,
+consisting of the following fields:
+
+  - ``ESR_EL2.EC``
+  - ``ESR_EL2.IL``
+  - ``ESR_EL2.FnV``
+  - ``ESR_EL2.EA``
+  - ``ESR_EL2.CM``
+  - ``ESR_EL2.WNR``
+  - ``ESR_EL2.FSC``
+  - ``ESR_EL2.SET`` (when FEAT_RAS is implemented for the VM)
+
+``gva`` is set to the value of FAR_EL2 from the exception taken to KVM when
+``ESR_EL2.FnV == 0``. Otherwise, the value of ``gva`` is unknown.
+
+``gpa`` is set to the faulting IPA from the exception taken to KVM when
+the ``KVM_EXIT_ARM_SEA_FLAG_GPA_VALID`` flag is set. Otherwise, the value of
+``gpa`` is unknown.
+
 ::

 		/* Fix the size of the union. */
@@ -8703,6 +8738,18 @@ This capability indicate to the userspace whether a PFNMAP memory region
 can be safely mapped as cacheable. This relies on the presence of force write
 back (FWB) feature support on the hardware.

+7.45 KVM_CAP_ARM_SEA_TO_USER
+----------------------------
+
+:Architecture: arm64
+:Target: VM
+:Parameters: none
+:Returns: 0 on success, -EINVAL if unsupported.
+
+When this capability is enabled, KVM may exit to userspace for SEAs taken to
+EL2 resulting from a guest access. See ``KVM_EXIT_ARM_SEA`` for more
+information.
+
 8. Other capabilities.
 ======================

From 9a89894f30d5795c5ac46a9ee9c009d610306025 Mon Sep 17 00:00:00 2001
From: Rick Edgecombe
Date: Mon, 27 Oct 2025 17:28:24 -0700
Subject: [PATCH 101/260] KVM: TDX: Take MMU lock around tdh_vp_init()

Take MMU lock around tdh_vp_init() in KVM_TDX_INIT_VCPU to avoid
contention during retries in some no-fail MMU paths.

The TDX module takes various try-locks internally, which can cause
SEAMCALLs to return an error code when contention is met. Dealing with
an error in some of the MMU paths that make SEAMCALLs is not
straightforward, so KVM takes steps to ensure that these will meet no
contention during a single BUSY error retry. The whole scheme relies on
KVM to take appropriate steps to avoid making any SEAMCALLs that could
contend while the retry is happening.

Unfortunately, there is a case where contention could be met if
userspace does something unusual. Specifically, hole punching a gmem fd
while initializing the TD vCPU. The impact would be triggering a
KVM_BUG_ON().

The resource being contended is called the "TDR resource" in TDX docs
parlance.
tdh_vp_init() can take this resource as exclusive if the 'version'
passed is 1, which happens to be the version the kernel passes. The
various MMU operations (tdh_mem_range_block(), tdh_mem_track() and
tdh_mem_page_remove()) take it as shared.

There isn't a KVM lock that maps conceptually and in a lock-order
friendly way to the TDR lock. So to minimize infrastructure, just take
MMU lock around tdh_vp_init(). This makes the operations we care about
mutually exclusive. Since the other operations are under a write
mmu_lock, the code could just take the lock for read; however, this is
weirdly inverted from the actual underlying resource being contended.
Since this is covering an edge case that shouldn't be hit in normal
usage, be a little less weird and take the mmu_lock for write around
the call.

Fixes: 02ab57707bdb ("KVM: TDX: Implement hooks to propagate changes of TDP MMU mirror page table")
Reported-by: Yan Zhao
Suggested-by: Yan Zhao
Link: https://patch.msgid.link/20251028002824.1470939-1-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe
[sean: tweak comment and capture PUNCH_HOLE interaction]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 865d18b34988..b5087d40f50b 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2969,9 +2969,20 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
 		}
 	}

-	err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
-	if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
-		return -EIO;
+	/*
+	 * tdh_vp_init() can take an exclusive lock of the TDR resource inside
+	 * the TDX-Module. The TDR resource is also taken as shared in several
+	 * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
+	 * (TDX-Module locks are try-lock implementations with no slow path).
+	 * Take mmu_lock for write to reflect the nature of the lock taken by
+	 * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
+	 * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
+	 */
+	scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+		err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+		if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
+			return -EIO;
+	}

 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

From 228add34dc2f197539c497bf5573d75cd58f5c61 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Mon, 3 Nov 2025 15:44:37 -0800
Subject: [PATCH 102/260] KVM: TDX: Remove __user annotation from kernel
 pointer

Separate the __user pointer variable declaration from the kernel one.

There are two 'kvm_cpuid2' pointers involved here. There's an "input"
side, 'td_cpuid', which is a normal kernel pointer, and an "output"
side. The output here is userspace, and there is an attempt at properly
annotating the variable with __user:

	struct kvm_cpuid2 __user *output, *td_cpuid;

But, alas, this is wrong. The __user in the definition applies to both
'output' and 'td_cpuid'. Sparse notices the address space mismatch and
will complain about it.

Fix it up by completely separating the two definitions so that it is
obviously correct without even having to know what the C syntax rules
even are.

Signed-off-by: Dave Hansen
Fixes: 488808e682e7 ("KVM: x86: Introduce KVM_TDX_GET_CPUID")
Reviewed-by: Rick Edgecombe
Cc: Xiaoyao Li
Cc: Sean Christopherson
Cc: Paolo Bonzini
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: Borislav Petkov
Cc: x86@kernel.org
Cc: "H. Peter Anvin"
Cc: "Kirill A.
Shutemov" Cc: Rick Edgecombe Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Xiaoyao Li Acked-by: Kiryl Shutsemau Link: https://patch.msgid.link/20251103234437.A0532420@davehans-spike.ostc.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b5087d40f50b..3525e0e9d073 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3038,7 +3038,8 @@ static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_i static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) { - struct kvm_cpuid2 __user *output, *td_cpuid; + struct kvm_cpuid2 __user *output; + struct kvm_cpuid2 *td_cpuid; int r = 0, i = 0, leaf; u32 level; From 27376465e945c11ad13c1e1d877ed318be010062 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 3 Nov 2025 15:44:39 -0800 Subject: [PATCH 103/260] KVM: TDX: Fix sparse warnings from using 0 for NULL Stop using 0 for NULL. sparse moans: ... arch/x86/kvm/vmx/tdx.c:859:38: warning: Using plain integer as NULL pointer for several TDX pointer initializations. While I love a good ptr=0 now and then, it's good to have quiet sparse builds. Signed-off-by: Dave Hansen Fixes: a50f673f25e0 ("KVM: TDX: Do TDX specific vcpu initialization") Fixes: 8d032b683c29 ("KVM: TDX: create/destroy VM structure") Reviewed-by: Rick Edgecombe Cc: Xiaoyao Li Cc: Sean Christopherson Cc: Paolo Bonzini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: "Kirill A. Shutemov" Cc: Rick Edgecombe Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Xiaoyao Li Acked-by: Kiryl Shutsemau Link: https://patch.msgid.link/20251103234439.DC8227E4@davehans-spike.ostc.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/tdx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 3525e0e9d073..0ffca14c1071 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -903,7 +903,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu) } if (tdx->vp.tdvpr_page) { tdx_reclaim_control_page(tdx->vp.tdvpr_page); - tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_page = NULL; tdx->vp.tdvpr_pa = 0; } @@ -2581,7 +2581,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, free_tdr: if (tdr_page) __free_page(tdr_page); - kvm_tdx->td.tdr_page = 0; + kvm_tdx->td.tdr_page = NULL; free_hkid: tdx_hkid_free(kvm_tdx); @@ -3000,7 +3000,7 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) free_tdvpr: if (tdx->vp.tdvpr_page) __free_page(tdx->vp.tdvpr_page); - tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_page = NULL; tdx->vp.tdvpr_pa = 0; return ret; From 11b79f8318aefc7ddfd12668fd1d80bde1c9f7bc Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 17 Oct 2025 23:39:14 +0200 Subject: [PATCH 104/260] KVM: TDX: Check size of user's kvm_tdx_capabilities array before allocating When userspace is getting TDX capabilities, retrieve and check the number of user entries before allocating kernel scratch space to avoid having to unwind the allocation if get_user() fails or if 'user_caps' is too small to fit 'caps'. 
Signed-off-by: Thorsten Blum
Tested-by: Rick Edgecombe
Link: https://patch.msgid.link/20251017213914.167301-1-thorsten.blum@linux.dev
[sean: split to separate patch]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0ffca14c1071..cc751c088476 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2224,23 +2224,19 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
 	if (cmd->flags)
 		return -EINVAL;

+	user_caps = u64_to_user_ptr(cmd->data);
+	if (get_user(nr_user_entries, &user_caps->cpuid.nent))
+		return -EFAULT;
+
+	if (nr_user_entries < td_conf->num_cpuid_config)
+		return -E2BIG;
+
 	caps = kzalloc(sizeof(*caps) +
 		       sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
 		       GFP_KERNEL);
 	if (!caps)
 		return -ENOMEM;

-	user_caps = u64_to_user_ptr(cmd->data);
-	if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	if (nr_user_entries < td_conf->num_cpuid_config) {
-		ret = -E2BIG;
-		goto out;
-	}
-
 	ret = init_kvm_tdx_caps(td_conf, caps);
 	if (ret)
 		goto out;

From 398180f93cf3c7bb0ee3f512b139ad01843f3ddf Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Mon, 10 Nov 2025 13:24:53 -0800
Subject: [PATCH 105/260] KVM: TDX: Use struct_size to simplify
 tdx_get_capabilities()

Use struct_size() instead of manually calculating the number of bytes to
allocate for 'caps', including the nested flexible array, and copy all of
'caps' to user space with a single copy_to_user() call (thanks to the
full size being provided by struct_size()).

Signed-off-by: Thorsten Blum
Tested-by: Rick Edgecombe
Link: https://patch.msgid.link/20251017213914.167301-1-thorsten.blum@linux.dev
[sean: separate from swap of get_user() vs. kzalloc() ordering]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index cc751c088476..a940a54ca593 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2231,9 +2231,8 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
 	if (nr_user_entries < td_conf->num_cpuid_config)
 		return -E2BIG;

-	caps = kzalloc(sizeof(*caps) +
-		       sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
-		       GFP_KERNEL);
+	caps = kzalloc(struct_size(caps, cpuid.entries,
+				   td_conf->num_cpuid_config), GFP_KERNEL);
 	if (!caps)
 		return -ENOMEM;

@@ -2241,16 +2240,12 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
 	if (ret)
 		goto out;

-	if (copy_to_user(user_caps, caps, sizeof(*caps))) {
+	if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
+						      caps->cpuid.nent))) {
 		ret = -EFAULT;
 		goto out;
 	}

-	if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
-			 caps->cpuid.nent *
-			 sizeof(caps->cpuid.entries[0])))
-		ret = -EFAULT;
-
 out:
 	/* kfree() accepts NULL. */
 	kfree(caps);

From 4da3768e1820cf15cced390242d8789aed34f54d Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Tue, 4 Nov 2025 09:55:26 -0800
Subject: [PATCH 106/260] KVM: SVM: Don't skip unrelated instruction if
 INT3/INTO is replaced

When re-injecting a soft interrupt from an INT3, INTO, or (select) INTn
instruction, discard the exception and retry the instruction if the code
stream is changed (e.g. by a different vCPU) between when the CPU
executes the instruction and when KVM decodes the instruction to get the
next RIP.
As effectively predicted by commit 6ef88d6e36c2 ("KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction"), failure to verify that the correct INTn instruction was decoded can effectively clobber guest state due to decoding the wrong instruction and thus specifying the wrong next RIP. The bug most often manifests as "Oops: int3" panics on static branch checks in Linux guests. Enabling or disabling a static branch in Linux uses the kernel's "text poke" code patching mechanism. To modify code while other CPUs may be executing that code, Linux (temporarily) replaces the first byte of the original instruction with an int3 (opcode 0xcc), then patches in the new code stream except for the first byte, and finally replaces the int3 with the first byte of the new code stream. If a CPU hits the int3, i.e. executes the code while it's being modified, then the guest kernel must look up the RIP to determine how to handle the #BP, e.g. by emulating the new instruction. If the RIP is incorrect, then this lookup fails and the guest kernel panics. The bug reproduces almost instantly by hacking the guest kernel to repeatedly check a static branch[1] while running a drgn script[2] on the host to constantly swap out the memory containing the guest's TSS. [1]: https://gist.github.com/osandov/44d17c51c28c0ac998ea0334edf90b5a [2]: https://gist.github.com/osandov/10e45e45afa29b11e0c7209247afc00b Fixes: 6ef88d6e36c2 ("KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction") Cc: stable@vger.kernel.org Co-developed-by: Sean Christopherson Signed-off-by: Omar Sandoval Link: https://patch.msgid.link/1cc6dcdf36e3add7ee7c8d90ad58414eeb6c3d34.1762278762.git.osandov@fb.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 9 +++++++++ arch/x86/kvm/svm/svm.c | 24 +++++++++++++----------- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 48598d017d6f..974d64bf0a4d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2143,6 +2143,11 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); * the gfn, i.e. retrying the instruction will hit a * !PRESENT fault, which results in a new shadow page * and sends KVM back to square one. + * + * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip + * an instruction if it could generate a given software + * interrupt, which must be encoded via + * EMULTYPE_SET_SOFT_INT_VECTOR(). 
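+ *			  If the decoded instruction cannot generate the
+ *			  specified vector, e.g. because the code stream
+ *			  changed, the skip fails so that the event is
+ *			  discarded and the instruction is re-executed.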
*/ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) @@ -2153,6 +2158,10 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) #define EMULTYPE_WRITE_PF_TO_SP (1 << 8) +#define EMULTYPE_SKIP_SOFT_INT (1 << 9) + +#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16) +#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff) static inline bool kvm_can_emulate_event_vectoring(int emul_type) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1ae7b3c5a7c5..459db8154929 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -272,6 +272,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + int emul_type, bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); @@ -293,7 +294,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; - if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + if (!kvm_emulate_instruction(vcpu, emul_type)) return 0; if (unlikely(!commit_side_effects)) @@ -311,11 +312,13 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - return __svm_skip_emulated_instruction(vcpu, true); + return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true); } -static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector) { + const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT | + EMULTYPE_SET_SOFT_INT_VECTOR(vector); unsigned long rip, old_rip = kvm_rip_read(vcpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -331,7 +334,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) * in use, the skip must not commit any side effects such as clearing * the interrupt shadow or RFLAGS.RF. 
 	 */
-	if (!__svm_skip_emulated_instruction(vcpu, !nrips))
+	if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
 		return -EIO;

 	rip = kvm_rip_read(vcpu);
@@ -367,7 +370,7 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu)
 	kvm_deliver_exception_payload(vcpu, ex);

 	if (kvm_exception_is_soft(ex->vector) &&
-	    svm_update_soft_interrupt_rip(vcpu))
+	    svm_update_soft_interrupt_rip(vcpu, ex->vector))
 		return;

 	svm->vmcb->control.event_inj = ex->vector
@@ -3642,11 +3645,12 @@ static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)

 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
 {
+	struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u32 type;

-	if (vcpu->arch.interrupt.soft) {
-		if (svm_update_soft_interrupt_rip(vcpu))
+	if (intr->soft) {
+		if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
 			return;

 		type = SVM_EVTINJ_TYPE_SOFT;
@@ -3654,12 +3658,10 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
 		type = SVM_EVTINJ_TYPE_INTR;
 	}

-	trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
-			   vcpu->arch.interrupt.soft, reinjected);
+	trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
 	++vcpu->stat.irq_injections;

-	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
-				       SVM_EVTINJ_VALID | type;
+	svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
 }

 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42ecd093bb4c..8e14c0292809 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9338,6 +9338,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
 	return false;
 }

+static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt,
+				    int emulation_type)
+{
+	u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type);
+
+	switch (ctxt->b) {
+	case 0xcc:
+		return vector == BP_VECTOR;
+	case 0xcd:
+		return vector == ctxt->src.val;
+	case 0xce:
+		return vector == OF_VECTOR;
+	default:
+		return false;
+	}
+}
+
 /*
  * Decode an instruction for emulation. The caller is responsible for handling
  * code breakpoints. Note, manually detecting code breakpoints is unnecessary
@@ -9448,6 +9465,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	 * injecting single-step #DBs.
 	 */
 	if (emulation_type & EMULTYPE_SKIP) {
+		if (emulation_type & EMULTYPE_SKIP_SOFT_INT &&
+		    !is_soft_int_instruction(ctxt, emulation_type))
+			return 0;
+
 		if (ctxt->mode != X86EMUL_MODE_PROT64)
 			ctxt->eip = (u32)ctxt->_eip;
 		else

From 9f4ce4878878cb9694c4284f7a483984d52d4d9a Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Thu, 30 Oct 2025 22:37:57 +0000
Subject: [PATCH 107/260] KVM: x86: Document a virtualization gap for GIF on
 AMD CPUs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to the APM Volume #2, Section 15.17, Table 15-10
(24593—Rev. 3.42—March 2024), when GIF == 0, a "Debug exception or
trap, due to breakpoint register match" should be "Ignored and
discarded". KVM lacks any handling of this. Even when vGIF is enabled
and vGIF == 0, the CPU does not ignore #DBs and relies on the VMM to
do so.

Handling this is possible, but the complexity is unjustified given the
rarity of using HW breakpoints when GIF == 0 (e.g. near VMRUN). KVM
would need to intercept the #DB, temporarily disable the breakpoint,
single-step over the instruction (probably reusing NMI
single-stepping), and re-enable the breakpoint.

Instead, document this as an erratum.
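A rough sketch of that unimplemented flow, purely for illustration (all
helper names below are hypothetical, not existing KVM functions):

	/* Hypothetical handling of a breakpoint #DB while guest GIF == 0. */
	if (guest_gif_is_clear(vcpu) && db_is_breakpoint_match(vcpu)) {
		disable_guest_breakpoints(vcpu);	/* stash and clear DR7 */
		single_step_guest(vcpu);		/* e.g. reuse NMI single-stepping */
		restore_guest_breakpoints(vcpu);	/* re-arm DR7 */
		/* the #DB itself is discarded, matching Table 15-10 */
	}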
Signed-off-by: Yosry Ahmed
Reviewed-by: Bagas Sanjaya
Link: https://patch.msgid.link/20251030223757.2950309-1-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson
---
 Documentation/virt/kvm/x86/errata.rst | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/x86/errata.rst b/Documentation/virt/kvm/x86/errata.rst
index 37c79362a48f..a9cf0e004651 100644
--- a/Documentation/virt/kvm/x86/errata.rst
+++ b/Documentation/virt/kvm/x86/errata.rst
@@ -48,7 +48,14 @@ versus "has_error_code", i.e. KVM's ABI follows AMD behavior.
 Nested virtualization features
 ------------------------------

-TBD
+On AMD CPUs, when GIF is cleared, #DB exceptions or traps due to a breakpoint
+register match are ignored and discarded by the CPU. The CPU relies on the VMM
+to fully virtualize this behavior, even when vGIF is enabled for the guest
+(i.e. vGIF=0 does not cause the CPU to drop #DBs when the guest is running).
+KVM does not virtualize this behavior as the complexity is unjustified given
+the rarity of the use case. One way to handle this would be for KVM to
+intercept the #DB, temporarily disable the breakpoint, single-step over the
+instruction, then re-enable the breakpoint.

 x2APIC
 ------

From ce62118a2e4838bcef1050fff55001a0bf87f0cb Mon Sep 17 00:00:00 2001
From: Tom Lendacky
Date: Mon, 27 Oct 2025 14:33:49 -0500
Subject: [PATCH 108/260] KVM: SEV: Consolidate the SEV policy bits in a single
 header file

Consolidate SEV policy bit definitions into a single file. Use
include/linux/psp-sev.h to hold the definitions and remove the current
definitions from the arch/x86/kvm/svm/sev.c and arch/x86/kvm/svm/svm.h
files.

No functional change intended.

Signed-off-by: Tom Lendacky
Link: https://patch.msgid.link/d9639f88a0b521a1a67aeac77cc609fdea1f90bd.1761593632.git.thomas.lendacky@amd.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/sev.c  | 16 ++++------------
 arch/x86/kvm/svm/svm.h  |  3 ---
 include/linux/psp-sev.h | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0835c664fbfd..f04589ae76bb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -65,15 +65,7 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04
 #define AP_RESET_HOLD_NAE_EVENT	1
 #define AP_RESET_HOLD_MSR_PROTO	2

-/* As defined by SEV-SNP Firmware ABI, under "Guest Policy".
*/ -#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) -#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) -#define SNP_POLICY_MASK_SMT BIT_ULL(16) -#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) -#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) -#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) - -#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ +#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ SNP_POLICY_MASK_API_MAJOR | \ SNP_POLICY_MASK_SMT | \ SNP_POLICY_MASK_RSVD_MBO | \ @@ -2207,7 +2199,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.flags) return -EINVAL; - if (params.policy & ~SNP_POLICY_MASK_VALID) + if (params.policy & ~KVM_SNP_POLICY_MASK_VALID) return -EINVAL; /* Check for policy bits that must be set */ @@ -5085,10 +5077,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) /* Check if the SEV policy allows debugging */ if (sev_snp_guest(vcpu->kvm)) { - if (!(sev->policy & SNP_POLICY_DEBUG)) + if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) return NULL; } else { - if (sev->policy & SEV_POLICY_NODBG) + if (sev->policy & SEV_POLICY_MASK_NODBG) return NULL; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6765a5e433ce..a9f6c1ece63d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -117,9 +117,6 @@ struct kvm_sev_info { cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ }; -#define SEV_POLICY_NODBG BIT_ULL(0) -#define SNP_POLICY_DEBUG BIT_ULL(19) - struct kvm_svm { struct kvm kvm; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index e0dbcb4b4fd9..27c92543bf38 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -14,6 +14,25 @@ #include +/* As defined by SEV API, under "Guest Policy". */ +#define SEV_POLICY_MASK_NODBG BIT(0) +#define SEV_POLICY_MASK_NOKS BIT(1) +#define SEV_POLICY_MASK_ES BIT(2) +#define SEV_POLICY_MASK_NOSEND BIT(3) +#define SEV_POLICY_MASK_DOMAIN BIT(4) +#define SEV_POLICY_MASK_SEV BIT(5) +#define SEV_POLICY_MASK_API_MAJOR GENMASK(23, 16) +#define SEV_POLICY_MASK_API_MINOR GENMASK(31, 24) + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ /** From c9434e64e8b4d17511f514f7495008f573595e3e Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:50 -0500 Subject: [PATCH 109/260] crypto: ccp - Add an API to return the supported SEV-SNP policy bits Supported policy bits are dependent on the level of SEV firmware that is currently running. Create an API to return the supported policy bits for the current level of firmware. 
Signed-off-by: Tom Lendacky Acked-by: Herbert Xu Link: https://patch.msgid.link/e3f711366ddc22e3dd215c987fd2e28dc1c07f54.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- drivers/crypto/ccp/sev-dev.c | 37 ++++++++++++++++++++++++++++++++++++ include/linux/psp-sev.h | 18 ++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 0d13d47c164b..db7c7c50cebc 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2777,6 +2777,43 @@ void sev_platform_shutdown(void) } EXPORT_SYMBOL_GPL(sev_platform_shutdown); +u64 sev_get_snp_policy_bits(void) +{ + struct psp_device *psp = psp_master; + struct sev_device *sev; + u64 policy_bits; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + if (!psp || !psp->sev_data) + return 0; + + sev = psp->sev_data; + + policy_bits = SNP_POLICY_MASK_BASE; + + if (sev->snp_plat_status.feature_info) { + if (sev->snp_feat_info_0.ecx & SNP_RAPL_DISABLE_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_RAPL_DIS; + + if (sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM; + + if (sev->snp_feat_info_0.ecx & SNP_AES_256_XTS_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_MEM_AES_256_XTS; + + if (sev->snp_feat_info_0.ecx & SNP_CXL_ALLOW_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CXL_ALLOW; + + if (sev_version_greater_or_equal(1, 58)) + policy_bits |= SNP_POLICY_MASK_PAGE_SWAP_DISABLE; + } + + return policy_bits; +} +EXPORT_SYMBOL_GPL(sev_get_snp_policy_bits); + void sev_dev_destroy(struct psp_device *psp) { struct sev_device *sev = psp->sev_data; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 27c92543bf38..abcdee256c65 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -32,6 +32,20 @@ #define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) #define SNP_POLICY_MASK_DEBUG BIT_ULL(19) #define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) +#define SNP_POLICY_MASK_CXL_ALLOW BIT_ULL(21) +#define SNP_POLICY_MASK_MEM_AES_256_XTS BIT_ULL(22) +#define SNP_POLICY_MASK_RAPL_DIS BIT_ULL(23) +#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM BIT_ULL(24) +#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE BIT_ULL(25) + +/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */ +#define SNP_POLICY_MASK_BASE (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_MIGRATE_MA | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ @@ -868,7 +882,10 @@ struct snp_feature_info { u32 edx; } __packed; +#define SNP_RAPL_DISABLE_SUPPORTED BIT(2) #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) +#define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4) +#define SNP_CXL_ALLOW_POLICY_SUPPORTED BIT(5) #ifdef CONFIG_CRYPTO_DEV_SP_PSP @@ -1014,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); bool sev_is_snp_ciphertext_hiding_supported(void); +u64 sev_get_snp_policy_bits(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ From 7a61d61396b97fd6bb9b9bde321c68513034ad11 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:51 -0500 Subject: [PATCH 110/260] KVM: SEV: Publish supported SEV-SNP policy bits Define the set of policy bits that KVM currently knows as not requiring any implementation support within KVM. 
Provide this value to userspace via the KVM_GET_DEVICE_ATTR ioctl. Signed-off-by: Tom Lendacky Link: https://patch.msgid.link/c596f7529518f3f826a57970029451d9385949e5.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/svm/sev.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d420c9c066d4..7ceff6583652 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -502,6 +502,7 @@ struct kvm_sync_regs { /* vendor-specific groups and attributes for system fd */ #define KVM_X86_GRP_SEV 1 # define KVM_X86_SEV_VMSA_FEATURES 0 +# define KVM_X86_SNP_POLICY_BITS 1 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f04589ae76bb..a425674fe993 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -72,6 +72,8 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 SNP_POLICY_MASK_DEBUG | \ SNP_POLICY_MASK_SINGLE_SOCKET) +static u64 snp_supported_policy_bits __ro_after_init; + #define INITIAL_VMSA_GPA 0xFFFFFFFFF000 static u8 sev_enc_bit; @@ -2135,6 +2137,10 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val) *val = sev_supported_vmsa_features; return 0; + case KVM_X86_SNP_POLICY_BITS: + *val = snp_supported_policy_bits; + return 0; + default: return -ENXIO; } @@ -2199,7 +2205,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.flags) return -EINVAL; - if (params.policy & ~KVM_SNP_POLICY_MASK_VALID) + if (params.policy & ~snp_supported_policy_bits) return -EINVAL; /* Check for policy bits that must be set */ @@ -3092,8 +3098,11 @@ void __init sev_hardware_setup(void) else if (sev_snp_supported) sev_snp_supported = is_sev_snp_initialized(); - if (sev_snp_supported) + if (sev_snp_supported) { + snp_supported_policy_bits = sev_get_snp_policy_bits() & + KVM_SNP_POLICY_MASK_VALID; nr_ciphertext_hiding_asids = init_args.max_snp_asid; + } /* * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP From 275d6d1189e6d5f8e7c1da43ffd4b09d7089f174 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:52 -0500 Subject: [PATCH 111/260] KVM: SEV: Add known supported SEV-SNP policy bits Add to the known supported SEV-SNP policy bits that don't require any implementation support from KVM in order to successfully use them. At this time, this includes: - CXL_ALLOW - MEM_AES_256_XTS - RAPL_DIS - CIPHERTEXT_HIDING_DRAM - PAGE_SWAP_DISABLE Arguably, RAPL_DIS and CIPHERTEXT_HIDING_DRAM require KVM and the CCP driver to enable these features in order for the setting of the policy bits to be successfully handled. But, a guest owner may not wish their guest to run on a system that doesn't provide support for those features, so allowing the specification of these bits accomplishes that. Whether or not the bit is supported by SEV firmware, a system that doesn't support these features will either fail during the KVM validation of supported policy bits before issuing the LAUNCH_START or fail during the LAUNCH_START. 
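For illustration, userspace could probe the supported bits roughly as in
the sketch below ('kvm_fd' and the error handling are hypothetical; the
group/attribute values are the ones defined by this series):

	__u64 policy_bits = 0;
	struct kvm_device_attr attr = {
		.group = KVM_X86_GRP_SEV,
		.attr  = KVM_X86_SNP_POLICY_BITS,
		.addr  = (__u64)&policy_bits,
	};

	if (!ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr) &&
	    (policy_bits & SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM))
		/* safe to request ciphertext hiding in SNP_LAUNCH_START */;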
Signed-off-by: Tom Lendacky
Link: https://patch.msgid.link/ec040de9864099cf592a97c201dc4cc110b2b0cf.1761593632.git.thomas.lendacky@amd.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/sev.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index a425674fe993..f59c65abe3cf 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -65,12 +65,22 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04
 #define AP_RESET_HOLD_NAE_EVENT	1
 #define AP_RESET_HOLD_MSR_PROTO	2

-#define KVM_SNP_POLICY_MASK_VALID	(SNP_POLICY_MASK_API_MINOR |	\
-					 SNP_POLICY_MASK_API_MAJOR |	\
-					 SNP_POLICY_MASK_SMT |		\
-					 SNP_POLICY_MASK_RSVD_MBO |	\
-					 SNP_POLICY_MASK_DEBUG |	\
-					 SNP_POLICY_MASK_SINGLE_SOCKET)
+/*
+ * SEV-SNP policy bits that can be supported by KVM. These include policy bits
+ * that have implementation support within KVM or policy bits that do not
+ * require implementation support within KVM to enforce the policy.
+ */
+#define KVM_SNP_POLICY_MASK_VALID	(SNP_POLICY_MASK_API_MINOR |	\
+					 SNP_POLICY_MASK_API_MAJOR |	\
+					 SNP_POLICY_MASK_SMT |		\
+					 SNP_POLICY_MASK_RSVD_MBO |	\
+					 SNP_POLICY_MASK_DEBUG |	\
+					 SNP_POLICY_MASK_SINGLE_SOCKET |	\
+					 SNP_POLICY_MASK_CXL_ALLOW |	\
+					 SNP_POLICY_MASK_MEM_AES_256_XTS |	\
+					 SNP_POLICY_MASK_RAPL_DIS |	\
+					 SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM |	\
+					 SNP_POLICY_MASK_PAGE_SWAP_DISABLE)

 static u64 snp_supported_policy_bits __ro_after_init;

From b3e5b670c94968f237ce3f93d1345dfe41f54c15 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 6 Nov 2025 13:02:06 -0800
Subject: [PATCH 112/260] KVM: x86: Use "checked" versions of get_user() and
 put_user()

Use the normal, checked versions of get_user() and put_user() instead
of the double-underscore versions that omit range checks, as the checked
versions are actually measurably faster on modern CPUs (12%+ on Intel,
25%+ on AMD).

The performance hit on the unchecked versions is almost entirely due to
the added LFENCE on CPUs where LFENCE is serializing (which is
effectively all modern CPUs), which was added by commit 304ec1b05031
("x86/uaccess: Use __uaccess_begin_nospec() and uaccess_try_nospec").
The small optimizations done by commit b19b74bc99b1 ("x86/mm: Rework
address range check in get_user() and put_user()") likely shave a few
cycles off, but the bulk of the extra latency comes from the LFENCE.

Don't bother trying to open-code an equivalent for performance reasons,
as the loss of inlining (e.g. see commit ea6f043fc984 ("x86: Make
__get_user() generate an out-of-line call")) is largely a non-factor
(ignoring setups where RET is something entirely different).

As measured across tens of millions of calls of guest PTE reads in
FNAME(walk_addr_generic):

              __get_user()  get_user()  open-coded  open-coded, no LFENCE
  Intel (EMR)     75.1         67.6        75.3          65.5
  AMD (Turin)     68.1         51.1        67.5          49.3

Note, Hyper-V MSR emulation is not a remotely hot path, but convert it
anyways for consistency, and because there is a general desire to remove
__{get,put}_user() entirely.
Reported-by: Linus Torvalds
Closes: https://lore.kernel.org/all/CAHk-=wimh_3jM9Xe8Zx0rpuf8CPDu6DkRCGb44azk0Sz5yqSnw@mail.gmail.com
Cc: Borislav Petkov
Link: https://patch.msgid.link/20251106210206.221558-1-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/hyperv.c          | 2 +-
 arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 38595ecb990d..de92292eb1f5 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1568,7 +1568,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 		 * only, there can be valuable data in the rest which needs
 		 * to be preserved e.g. on migration.
 		 */
-		if (__put_user(0, (u32 __user *)addr))
+		if (put_user(0, (u32 __user *)addr))
 			return 1;
 		hv_vcpu->hv_vapic = data;
 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index ed762bb4b007..901cd2bd40b8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -402,7 +402,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 			goto error;

 		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
-		if (unlikely(__get_user(pte, ptep_user)))
+		if (unlikely(get_user(pte, ptep_user)))
 			goto error;
 		walker->ptep_user[walker->level - 1] = ptep_user;

From 0ea9494be9c931ddbc084ad5e11fda91b554cf47 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 13 Nov 2025 12:51:11 -0800
Subject: [PATCH 113/260] KVM: x86: WARN if hrtimer callback for periodic APIC
 timer fires with period=0

WARN and don't restart the hrtimer if KVM's callback runs with the
guest's APIC timer in periodic mode but with a period of '0', as not
advancing the hrtimer's deadline would put the CPU into an infinite
loop of hrtimer events.

Observing a period of '0' should be impossible, even when the hrtimer
is running on a different CPU than the vCPU, as KVM is supposed to
cancel the hrtimer before changing (or zeroing) the period, e.g. when
switching from periodic to one-shot.

Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20251113205114.1647493-2-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/lapic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0ae7f913d782..78b74ba17592 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2970,7 +2970,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)

 	apic_timer_expired(apic, true);

-	if (lapic_is_periodic(apic)) {
+	if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) {
 		advance_periodic_target_expiration(apic);
 		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
 		return HRTIMER_RESTART;

From 9633f180ce994ab293ce4924a9b7aaf4673aa114 Mon Sep 17 00:00:00 2001
From: fuqiang wang
Date: Thu, 13 Nov 2025 12:51:12 -0800
Subject: [PATCH 114/260] KVM: x86: Explicitly set new periodic hrtimer
 expiration in apic_timer_fn()

When restarting an hrtimer to emulate the guest's APIC timer in periodic
mode, explicitly set the expiration using the target expiration computed
by advance_periodic_target_expiration() instead of adding the period to
the existing timer. This will allow making adjustments to the
expiration, e.g. to deal with expirations far in the past, without
having to implement the same logic in both
advance_periodic_target_expiration() and apic_timer_fn().
Cc: stable@vger.kernel.org Signed-off-by: fuqiang wang [sean: split to separate patch, write changelog] Link: https://patch.msgid.link/20251113205114.1647493-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 78b74ba17592..a5c927e7bae6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2972,7 +2972,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) { advance_periodic_target_expiration(apic); - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration); return HRTIMER_RESTART; } else return HRTIMER_NORESTART; From 18ab3fc8e880791aa9f7c000261320fc812b5465 Mon Sep 17 00:00:00 2001 From: fuqiang wang Date: Thu, 13 Nov 2025 12:51:13 -0800 Subject: [PATCH 115/260] KVM: x86: Fix VM hard lockup after prolonged inactivity with periodic HV timer When advancing the target expiration for the guest's APIC timer in periodic mode, set the expiration to "now" if the target expiration is in the past (similar to what is done in update_target_expiration()). Blindly adding the period to the previous target expiration can result in KVM generating a practically unbounded number of hrtimer IRQs due to programming an expired timer over and over. In extreme scenarios, e.g. if userspace pauses/suspends a VM for an extended duration, this can even cause hard lockups in the host. Currently, the bug only affects Intel CPUs when using the hypervisor timer (HV timer), a.k.a. the VMX preemption timer. Unlike the software timer, a.k.a. hrtimer, which KVM keeps running even on exits to userspace, the HV timer only runs while the guest is active. As a result, if the vCPU does not run for an extended duration, there will be a huge gap between the target expiration and the current time the vCPU resumes running. Because the target expiration is incremented by only one period on each timer expiration, this leads to a series of timer expirations occurring rapidly after the vCPU/VM resumes. More critically, when the vCPU first triggers a periodic HV timer expiration after resuming, advancing the expiration by only one period will result in a target expiration in the past. As a result, the delta may be calculated as a negative value. When the delta is converted into an absolute value (tscdeadline is an unsigned u64), the resulting value can overflow what the HV timer is capable of programming. I.e. the large value will exceed the VMX Preemption Timer's maximum bit width of cpu_preemption_timer_multi + 32, and thus cause KVM to switch from the HV timer to the software timer (hrtimers). After switching to the software timer, periodic timer expiration callbacks may be executed consecutively within a single clock interrupt handler, because hrtimers honors KVM's request for an expiration in the past and immediately re-invokes KVM's callback after reprogramming. And because the interrupt handler runs with IRQs disabled, restarting KVM's hrtimer over and over until the target expiration is advanced to "now" can result in a hard lockup. E.g. the following hard lockup was triggered in the host when running a Windows VM (only relevant because it used the APIC timer in periodic mode) after resuming the VM from a long suspend (in the host). NMI watchdog: Watchdog detected hard LOCKUP on cpu 45 ... 
RIP: 0010:advance_periodic_target_expiration+0x4d/0x80 [kvm]
...
RSP: 0018:ff4f88f5d98d8ef0 EFLAGS: 00000046
RAX: fff0103f91be678e RBX: fff0103f91be678e RCX: 00843a7d9e127bcc
RDX: 0000000000000002 RSI: 0052ca4003697505 RDI: ff440d5bfbdbd500
RBP: ff440d5956f99200 R08: ff2ff2a42deb6a84 R09: 000000000002a6c0
R10: 0122d794016332b3 R11: 0000000000000000 R12: ff440db1af39cfc0
R13: ff440db1af39cfc0 R14: ffffffffc0d4a560 R15: ff440db1af39d0f8
FS:  00007f04a6ffd700(0000) GS:ff440db1af380000(0000) knlGS:000000e38a3b8000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000d5651feff8 CR3: 000000684e038002 CR4: 0000000000773ee0
PKRU: 55555554
Call Trace:
 apic_timer_fn+0x31/0x50 [kvm]
 __hrtimer_run_queues+0x100/0x280
 hrtimer_interrupt+0x100/0x210
 ? ttwu_do_wakeup+0x19/0x160
 smp_apic_timer_interrupt+0x6a/0x130
 apic_timer_interrupt+0xf/0x20

Moreover, if the suspend duration of the virtual machine is not long
enough to trigger a hard lockup in this scenario, then since commit
98c25ead5eda ("KVM: VMX: Move preemption timer <=> hrtimer dance to
common x86"), KVM will continue using the software timer until the
guest reprograms the APIC timer in some way. Since the periodic timer
does not require frequent APIC timer register programming, the guest
may continue to use the software timer in perpetuity.

Fixes: d8f2f498d9ed ("x86/kvm: fix LAPIC timer drift when guest uses periodic mode")
Cc: stable@vger.kernel.org
Signed-off-by: fuqiang wang
[sean: massage comments and changelog]
Link: https://patch.msgid.link/20251113205114.1647493-4-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/lapic.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a5c927e7bae6..8b6ec3304100 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2131,15 +2131,33 @@ static void advance_periodic_target_expiration(struct kvm_lapic *apic)
 	ktime_t delta;

 	/*
-	 * Synchronize both deadlines to the same time source or
-	 * differences in the periods (caused by differences in the
-	 * underlying clocks or numerical approximation errors) will
-	 * cause the two to drift apart over time as the errors
-	 * accumulate.
+	 * Use kernel time as the time source for both the hrtimer deadline and
+	 * TSC-based deadline so that they stay synchronized. Computing each
+	 * deadline independently will cause the two deadlines to drift apart
+	 * over time as differences in the periods accumulate, e.g. due to
+	 * differences in the underlying clocks or numerical approximation errors.
 	 */
 	apic->lapic_timer.target_expiration =
 		ktime_add_ns(apic->lapic_timer.target_expiration,
 			     apic->lapic_timer.period);
+
+	/*
+	 * If the new expiration is in the past, e.g. because userspace stopped
+	 * running the VM for an extended duration, then force the expiration
+	 * to "now" and don't try to play catch-up with the missed events. KVM
+	 * will only deliver a single interrupt regardless of how many events
+	 * are pending, i.e. restarting the timer with an expiration in the
+	 * past will do nothing more than waste host cycles, and can even lead
+	 * to a hard lockup in extreme cases.
+	 */
+	if (ktime_before(apic->lapic_timer.target_expiration, now))
+		apic->lapic_timer.target_expiration = now;
+
+	/*
+	 * Note, ensuring the expiration isn't in the past also prevents delta
+	 * from going negative, which could cause the TSC deadline to become
+	 * excessively large due to it being an unsigned value.
+ */ delta = ktime_sub(apic->lapic_timer.target_expiration, now); apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + nsec_to_cycles(apic->vcpu, delta); From a091fe60c2d3943b058132a64682a509d55bd325 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 12:51:14 -0800 Subject: [PATCH 116/260] KVM: x86: Grab lapic_timer in a local variable to cleanup periodic code Stash apic->lapic_timer in a local "ktimer" variable in advance_periodic_target_expiration() to eliminate a few unaligned wraps, and to make the code easier to read overall. No functional change intended. Link: https://patch.msgid.link/20251113205114.1647493-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8b6ec3304100..1597dd0b0cc6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2126,6 +2126,7 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) static void advance_periodic_target_expiration(struct kvm_lapic *apic) { + struct kvm_timer *ktimer = &apic->lapic_timer; ktime_t now = ktime_get(); u64 tscl = rdtsc(); ktime_t delta; @@ -2137,9 +2138,8 @@ static void advance_periodic_target_expiration(struct kvm_lapic *apic) * over time as differences in the periods accumulate, e.g. due to * differences in the underlying clocks or numerical approximation errors. */ - apic->lapic_timer.target_expiration = - ktime_add_ns(apic->lapic_timer.target_expiration, - apic->lapic_timer.period); + ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration, + ktimer->period); /* * If the new expiration is in the past, e.g. because userspace stopped @@ -2150,17 +2150,17 @@ static void advance_periodic_target_expiration(struct kvm_lapic *apic) * past will do nothing more than waste host cycles, and can even lead * to a hard lockup in extreme cases. */ - if (ktime_before(apic->lapic_timer.target_expiration, now)) - apic->lapic_timer.target_expiration = now; + if (ktime_before(ktimer->target_expiration, now)) + ktimer->target_expiration = now; /* * Note, ensuring the expiration isn't in the past also prevents delta * from going negative, which could cause the TSC deadline to become * excessively large due to it being an unsigned value. */ - delta = ktime_sub(apic->lapic_timer.target_expiration, now); - apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + - nsec_to_cycles(apic->vcpu, delta); + delta = ktime_sub(ktimer->target_expiration, now); + ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, delta); } static void start_sw_period(struct kvm_lapic *apic) From 43ddbf16edf5c1790684b32d5eb920a1b0eea285 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Tue, 19 Aug 2025 23:20:25 +0800 Subject: [PATCH 117/260] Revert "x86: kvm: introduce periodic global clock updates" This reverts commit 332967a3eac06f6379283cf155c84fe7cd0537c2. Commit 332967a3eac0 ("x86: kvm: introduce periodic global clock updates") introduced a 300s interval work to sync ntp corrections across all vcpus. Since commit 53fafdbb8b21 ("KVM: x86: switch KVMCLOCK base to monotonic raw clock"), kvmclock has been based on the monotonic raw clock, so NTP corrections can no longer be taken into consideration. 
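As a rough sketch (illustrative only, not part of the revert) of why NTP no longer matters here, the two clock bases are exposed by separate timekeeping accessors, and only the plain monotonic clock is NTP-corrected:

	u64 raw_ns  = ktime_get_raw_ns(); /* kvmclock basis; never NTP-adjusted */
	u64 mono_ns = ktime_get_ns();     /* NTP-corrected monotonic time */

Because every vCPU now derives its kvmclock from the same NTP-free time source, there is no cross-vCPU NTP drift left for a periodic sync work to correct.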
Signed-off-by: Lei Chen Link: https://patch.msgid.link/20250819152027.1687487-2-lei.chen@smartx.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 25 ------------------------- 2 files changed, 26 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a557c504c1a4..adc8ea9d391b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1457,7 +1457,6 @@ struct kvm_arch { u64 master_kernel_ns; u64 master_cycle_now; struct delayed_work kvmclock_update_work; - struct delayed_work kvmclock_sync_work; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 97852f190720..6c16067046bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -159,9 +159,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, 0644); -static bool __read_mostly kvmclock_periodic_sync = true; -module_param(kvmclock_periodic_sync, bool, 0444); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); @@ -3558,20 +3555,6 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) KVMCLOCK_UPDATE_DELAY); } -#define KVMCLOCK_SYNC_PERIOD (300 * HZ) - -static void kvmclock_sync_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_sync_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); - - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); -} - /* These helpers are safe iff @msr is known to be an MCx bank MSR. */ static bool is_mci_control_msr(u32 msr) { @@ -12757,8 +12740,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; - if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); @@ -12769,10 +12750,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.msr_kvm_poll_control = 1; mutex_unlock(&vcpu->mutex); - - if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -13187,7 +13164,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) #endif INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); - INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); @@ -13295,7 +13271,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) * is unsafe, i.e. will lead to use-after-free. The PIT also needs to * be stopped before IRQ routing is freed. */ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); #ifdef CONFIG_KVM_IOAPIC From 446fcce2a52b533c543dabba26777813c347577c Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Tue, 19 Aug 2025 23:20:26 +0800 Subject: [PATCH 118/260] Revert "x86: kvm: rate-limit global clock updates" This reverts commit 7e44e4495a398eb553ce561f29f9148f40a3448f. 
Commit 7e44e4495a39 ("x86: kvm: rate-limit global clock updates") intended to use kvmclock_update_work to sync the NTP correction across all vCPUs' kvmclocks, and is based on commit 0061d53daf26f ("KVM: x86: limit difference between kvmclock updates"). Since kvmclock has been switched to the monotonic raw clock, this commit can be reverted. Signed-off-by: Lei Chen Link: https://patch.msgid.link/20250819152027.1687487-3-lei.chen@smartx.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 29 ++++------------------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index adc8ea9d391b..692ec922945f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1456,7 +1456,6 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; - struct delayed_work kvmclock_update_work; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c16067046bd..33fcd79e33da 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3523,22 +3523,14 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm) * the others. * * So in those cases, request a kvmclock update for all vcpus. - * We need to rate-limit these requests though, as they can - * considerably slow guests that have a large number of vcpus. - * The time for a remote vcpu to update its kvmclock is bound - * by the delay we use to rate-limit the updates. + * The worst case for a remote vcpu to update its kvmclock + * is then bounded by maximum nohz sleep latency. */ - -#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) - -static void kvmclock_update_fn(struct work_struct *work) +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { unsigned long i; - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_update_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; + struct kvm *kvm = v->kvm; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -3546,15 +3538,6 @@ static void kvmclock_update_fn(struct work_struct *work) } } -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) -{ - struct kvm *kvm = v->kvm; - - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - schedule_delayed_work(&kvm->arch.kvmclock_update_work, - KVMCLOCK_UPDATE_DELAY); -} - /* These helpers are safe iff @msr is known to be an MCx bank MSR. */ static bool is_mci_control_msr(u32 msr) { @@ -13163,8 +13146,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.hv_root_tdp = INVALID_PAGE; #endif - INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); - kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); @@ -13271,8 +13252,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) * is unsafe, i.e. will lead to use-after-free. The PIT also needs to * be stopped before IRQ routing is freed. */ - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - #ifdef CONFIG_KVM_IOAPIC kvm_free_pit(kvm); #endif From e78fb96b41c6ac85c1a02c7e9610d1ebaa9b5d98 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Tue, 19 Aug 2025 23:20:27 +0800 Subject: [PATCH 119/260] KVM: x86: remove comment about ntp correction sync for kvm_gen_kvmclock_update() Since the vCPU-local clock is no longer affected by NTP, remove the comment about NTP correction sync from kvm_gen_kvmclock_update(). 
Signed-off-by: Lei Chen Link: https://patch.msgid.link/20250819152027.1687487-4-lei.chen@smartx.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33fcd79e33da..03b2121069c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3518,9 +3518,7 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm) /* * kvmclock updates which are isolated to a given vcpu, such as * vcpu->cpu migration, should not allow system_timestamp from - * the rest of the vcpus to remain static. Otherwise ntp frequency - * correction applies to one vcpu's system_timestamp but not - * the others. + * the rest of the vcpus to remain static. * * So in those cases, request a kvmclock update for all vcpus. * The worst case for a remote vcpu to update its kvmclock From 11d984633f7f3fc97dbbc551fe15429adf10a8d5 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 7 Nov 2025 17:36:01 -0800 Subject: [PATCH 120/260] KVM: x86: Allocate/free user_return_msrs at kvm.ko (un)loading time Move user_return_msrs allocation/free from vendor module (kvm-intel.ko and kvm-amd.ko) (un)loading time to kvm.ko (un)loading time, to make it less risky to access user_return_msrs in kvm.ko. Tying the lifetime of user_return_msrs to vendor modules makes every access to user_return_msrs prone to use-after-free issues, as vendor modules may be unloaded at any time. Opportunistically turn the per-CPU variable into a full struct, as there's no practical difference between statically allocating the memory and allocating it unconditionally during module_init(). Zero out kvm_nr_uret_msrs on vendor module exit to further minimize the chances of consuming stale data, and WARN on vendor module load if KVM thinks there are existing user-return MSRs. Note! The user-return MSRs also need to be "destroyed" if ops->hardware_setup() fails, as both SVM and VMX expect common KVM to clean up (because common code, not vendor code, is responsible for kvm_nr_uret_msrs). 
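As a rough sketch of the lifetime difference (simplified, using a made-up struct foo; the real code uses struct kvm_user_return_msrs):

	/* Dynamic per-CPU data: valid only between alloc_percpu() and free_percpu(). */
	struct foo __percpu *p = alloc_percpu(struct foo);
	this_cpu_ptr(p)->val = 1;
	free_percpu(p);			/* any later access is a use-after-free */

	/* Static per-CPU data: lives for as long as the defining module is loaded. */
	static DEFINE_PER_CPU(struct foo, f);
	this_cpu_ptr(&f)->val = 1;	/* no allocation and no free, so no UAF window */

Defining the variable in kvm.ko instead of allocating it from the vendor modules removes the window in which kvm.ko code could dereference memory that a vendor module has already freed.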
Signed-off-by: Chao Gao Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251108013601.902918-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03b2121069c4..be737d9645b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -209,7 +209,7 @@ struct kvm_user_return_msrs { u32 __read_mostly kvm_nr_uret_msrs; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; -static struct kvm_user_return_msrs __percpu *user_return_msrs; +static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ @@ -572,25 +572,14 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) vcpu->arch.apf.gfns[i] = ~0; } -static int kvm_init_user_return_msrs(void) -{ - user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); - if (!user_return_msrs) { - pr_err("failed to allocate percpu user_return_msrs\n"); - return -ENOMEM; - } - kvm_nr_uret_msrs = 0; - return 0; -} - -static void kvm_free_user_return_msrs(void) +static void kvm_destroy_user_return_msrs(void) { int cpu; for_each_possible_cpu(cpu) - WARN_ON_ONCE(per_cpu_ptr(user_return_msrs, cpu)->registered); + WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered); - free_percpu(user_return_msrs); + kvm_nr_uret_msrs = 0; } static void kvm_on_user_return(struct user_return_notifier *urn) @@ -653,7 +642,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); u64 value; int i; @@ -675,7 +664,7 @@ static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); @@ -693,13 +682,13 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); u64 kvm_get_user_return_msr(unsigned int slot) { - return this_cpu_ptr(user_return_msrs)->values[slot].curr; + return this_cpu_ptr(&user_return_msrs)->values[slot].curr; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); static void drop_user_return_notifiers(void) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); if (msrs->registered) kvm_on_user_return(&msrs->urn); @@ -10022,13 +10011,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -ENOMEM; } - r = kvm_init_user_return_msrs(); - if (r) - goto out_free_x86_emulator_cache; - r = kvm_mmu_vendor_module_init(); if (r) - goto out_free_percpu; + goto out_free_x86_emulator_cache; kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; @@ -10053,6 +10038,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); + WARN_ON_ONCE(kvm_nr_uret_msrs); + r = ops->hardware_setup(); if (r != 0) goto out_mmu_exit; @@ -10125,9 +10112,8 
@@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_x86_ops.enable_virtualization_cpu = NULL; kvm_x86_call(hardware_unsetup)(); out_mmu_exit: + kvm_destroy_user_return_msrs(); kvm_mmu_vendor_module_exit(); -out_free_percpu: - kvm_free_user_return_msrs(); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; @@ -10155,8 +10141,8 @@ void kvm_x86_vendor_exit(void) cancel_work_sync(&pvclock_gtod_work); #endif kvm_x86_call(hardware_unsetup)(); + kvm_destroy_user_return_msrs(); kvm_mmu_vendor_module_exit(); - kvm_free_user_return_msrs(); kmem_cache_destroy(x86_emulator_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); From 844afc1af3a9d98dc8c246855c3aac410be36bef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 15:37:38 -0800 Subject: [PATCH 121/260] KVM: VMX: Use on-stack copy of @flags in __vmx_vcpu_run() When testing for VMLAUNCH vs. VMRESUME, use the copy of @flags from the stack instead of first moving it to EBX, and then propagating VMX_RUN_VMRESUME to RFLAGS.CF (because RBX is clobbered with the guest value prior to the conditional branch to VMLAUNCH). Stashing information in RFLAGS is gross, especially with the writer and reader being bifurcated by yet more gnarly assembly code. Opportunistically drop the SHIFT macros as they existed purely to allow the VM-Enter flow to use Bit Test. Suggested-by: Borislav Petkov Acked-by: Borislav Petkov (AMD) Reviewed-by: Brendan Jackman Link: https://patch.msgid.link/20251113233746.1703361-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/run_flags.h | 10 +++------- arch/x86/kvm/vmx/vmenter.S | 13 ++++--------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h index 2f20fb170def..6a87a12135fb 100644 --- a/arch/x86/kvm/vmx/run_flags.h +++ b/arch/x86/kvm/vmx/run_flags.h @@ -2,12 +2,8 @@ #ifndef __KVM_X86_VMX_RUN_FLAGS_H #define __KVM_X86_VMX_RUN_FLAGS_H -#define VMX_RUN_VMRESUME_SHIFT 0 -#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 -#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2 - -#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) -#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) -#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT) +#define VMX_RUN_VMRESUME BIT(0) +#define VMX_RUN_SAVE_SPEC_CTRL BIT(1) +#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2) #endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bc255d709d8a..b59062a52a35 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -92,7 +92,7 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Save @vmx for SPEC_CTRL handling */ push %_ASM_ARG1 - /* Save @flags for SPEC_CTRL handling */ + /* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */ push %_ASM_ARG3 /* @@ -101,9 +101,6 @@ SYM_FUNC_START(__vmx_vcpu_run) */ push %_ASM_ARG2 - /* Copy @flags to EBX, _ASM_ARG3 is volatile. */ - mov %_ASM_ARG3L, %ebx - lea (%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp @@ -137,9 +134,6 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load @regs to RAX. */ mov (%_ASM_SP), %_ASM_AX - /* Check if vmlaunch or vmresume is needed */ - bt $VMX_RUN_VMRESUME_SHIFT, %ebx - /* Load guest registers. Don't clobber flags. 
*/ mov VCPU_RCX(%_ASM_AX), %_ASM_CX mov VCPU_RDX(%_ASM_AX), %_ASM_DX @@ -163,8 +157,9 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Clobbers EFLAGS.ZF */ CLEAR_CPU_BUFFERS - /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ - jnc .Lvmlaunch + /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */ + testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP) + jz .Lvmlaunch /* * After a successful VMRESUME/VMLAUNCH, control flow "magically" From aba7de6088be5a3b5d766c5f7fdb5d0790ff8f13 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 13 Nov 2025 15:37:39 -0800 Subject: [PATCH 122/260] x86/bugs: Use VM_CLEAR_CPU_BUFFERS in VMX as well The TSA mitigation, commit d8010d4ba43e ("x86/bugs: Add a Transient Scheduler Attacks mitigation"), introduced VM_CLEAR_CPU_BUFFERS for guests on AMD CPUs. Currently, on Intel, CLEAR_CPU_BUFFERS is used for guests, which has a much broader scope (kernel->user as well). Make the mitigations on Intel consistent with TSA. This will help handle guest-only mitigations better in the future. Signed-off-by: Pawan Gupta [sean: make CLEAR_CPU_BUF_VM mutually exclusive with the MMIO mitigation] Acked-by: Borislav Petkov (AMD) Reviewed-by: Brendan Jackman Link: https://patch.msgid.link/20251113233746.1703361-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kernel/cpu/bugs.c | 13 +++++++++---- arch/x86/kvm/vmx/vmenter.S | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6a526ae1fe99..2847e11fbab5 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -194,7 +194,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); /* * Controls CPU Fill buffer clear before VMenter. This is a subset of - * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only + * X86_FEATURE_CLEAR_CPU_BUF_VM, and should only be enabled when KVM-only * mitigation is required. */ DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); @@ -489,8 +489,8 @@ static enum rfds_mitigations rfds_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; /* - * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing - * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to + * userspace *and* on entry to KVM guests. */ static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; @@ -536,6 +536,7 @@ static void __init mds_apply_mitigation(void) if (mds_mitigation == MDS_MITIGATION_FULL || mds_mitigation == MDS_MITIGATION_VMWERV) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)) cpu_smt_disable(false); @@ -647,6 +648,7 @@ static void __init taa_apply_mitigation(void) * present on host, enable the mitigation for UCODE_NEEDED as well. 
*/ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON) cpu_smt_disable(false); @@ -748,6 +750,7 @@ static void __init mmio_apply_mitigation(void) */ if (verw_clear_cpu_buf_mitigation_selected) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); static_branch_disable(&cpu_buf_vm_clear); } else { static_branch_enable(&cpu_buf_vm_clear); @@ -839,8 +842,10 @@ static void __init rfds_update_mitigation(void) static void __init rfds_apply_mitigation(void) { - if (rfds_mitigation == RFDS_MITIGATION_VERW) + if (rfds_mitigation == RFDS_MITIGATION_VERW) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + } } static __init int rfds_parse_cmdline(char *str) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index b59062a52a35..b66b43c9b244 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -155,7 +155,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov VCPU_RAX(%_ASM_AX), %_ASM_AX /* Clobbers EFLAGS.ZF */ - CLEAR_CPU_BUFFERS + VM_CLEAR_CPU_BUFFERS /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */ testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP) From afb99ffbd5825bfbc88e8052bc5514e6ebdfec5e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 15:37:40 -0800 Subject: [PATCH 123/260] x86/bugs: Decouple ALTERNATIVE usage from VERW macro definition Decouple the use of ALTERNATIVE from the encoding of VERW to clear CPU buffers so that KVM can use ALTERNATIVE_2 to handle "always clear buffers" and "clear if guest can access host MMIO" in a single statement. No functional change intended. Reviewed-by: Brendan Jackman Reviewed-by: Pawan Gupta Link: https://patch.msgid.link/20251113233746.1703361-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/nospec-branch.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 08ed5a2e46a5..cb36a8ea00d3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -308,24 +308,29 @@ * CFLAGS.ZF. * Note: Only the memory operand variant of VERW clears the CPU buffers. */ -.macro __CLEAR_CPU_BUFFERS feature #ifdef CONFIG_X86_64 - ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature +#define VERW verw x86_verw_sel(%rip) #else - /* - * In 32bit mode, the memory operand must be a %cs reference. The data - * segments may not be usable (vm86 mode), and the stack segment may not - * be flat (ESPFIX32). - */ - ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature +/* + * In 32bit mode, the memory operand must be a %cs reference. The data segments + * may not be usable (vm86 mode), and the stack segment may not be flat (ESPFIX32). + */ +#define VERW verw %cs:x86_verw_sel #endif -.endm +/* + * Provide a stringified VERW macro for simple usage, and a non-stringified + * VERW macro for use in more elaborate sequences, e.g. to encode a conditional + * VERW within an ALTERNATIVE. + */ +#define __CLEAR_CPU_BUFFERS __stringify(VERW) + +/* If necessary, emit VERW on exit-to-userspace to clear CPU buffers. 
*/ #define CLEAR_CPU_BUFFERS \ - __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF #define VM_CLEAR_CPU_BUFFERS \ - __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM #ifdef CONFIG_X86_64 .macro CLEAR_BRANCH_HISTORY From f6106d41ec84e552a5e8adda1f8741cab96a5425 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 15:37:41 -0800 Subject: [PATCH 124/260] x86/bugs: Use an x86 feature to track the MMIO Stale Data mitigation Convert the MMIO Stale Data mitigation tracking from a static branch into an x86 feature flag so that it can be used via ALTERNATIVE_2 in KVM. No functional change intended. Reviewed-by: Pawan Gupta Reviewed-by: Brendan Jackman Link: https://patch.msgid.link/20251113233746.1703361-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 5 +++++ arch/x86/include/asm/nospec-branch.h | 2 -- arch/x86/kernel/cpu/bugs.c | 11 +---------- arch/x86/kvm/mmu/spte.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- 5 files changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4091a776e37a..fc5698844a0b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -499,6 +499,11 @@ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ +#define X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO (21*32+17) /* + * Clear CPU buffers before VM-Enter if the vCPU + * can access host MMIO (ignored for all intents + * and purposes if CLEAR_CPU_BUF_VM is set). + */ /* * BUG word(s) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index cb36a8ea00d3..afdcdf40e414 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -585,8 +585,6 @@ DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear); - extern u16 x86_verw_sel; #include diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2847e11fbab5..8391a20fe5a8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -192,14 +192,6 @@ EXPORT_SYMBOL_GPL(cpu_buf_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -/* - * Controls CPU Fill buffer clear before VMenter. This is a subset of - * X86_FEATURE_CLEAR_CPU_BUF_VM, and should only be enabled when KVM-only - * mitigation is required. 
- */ -DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); -EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); - #undef pr_fmt #define pr_fmt(fmt) "mitigations: " fmt @@ -751,9 +743,8 @@ static void __init mmio_apply_mitigation(void) if (verw_clear_cpu_buf_mitigation_selected) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); - static_branch_disable(&cpu_buf_vm_clear); } else { - static_branch_enable(&cpu_buf_vm_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO); } /* diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 37647afde7d3..85a0473809b0 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -292,7 +292,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); } - if (static_branch_unlikely(&cpu_buf_vm_clear) && + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && !kvm_vcpu_can_access_host_mmio(vcpu) && kvm_is_mmio_pfn(pfn, &is_host_mmio)) kvm_track_host_mmio_mapping(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d98107a7bdaa..67702609f68e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -903,7 +903,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; - if (static_branch_unlikely(&cpu_buf_vm_clear) && + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; @@ -7325,7 +7325,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&cpu_buf_vm_clear) && + else if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) x86_clear_cpu_buffers(); From e6ff1d61de51ec5fe94c5fb79544a93f494104eb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 15:37:42 -0800 Subject: [PATCH 125/260] KVM: VMX: Handle MMIO Stale Data in VM-Enter assembly via ALTERNATIVES_2 Rework the handling of the MMIO Stale Data mitigation to clear CPU buffers immediately prior to VM-Enter, i.e. in the same location that KVM emits a VERW for unconditional (at runtime) clearing. Co-locating the code and using a single ALTERNATIVES_2 makes it more obvious how VMX mitigates the various vulnerabilities. Deliberately order the alternatives as: 0. Do nothing 1. Clear if vCPU can access MMIO 2. Clear always since the last alternative wins in ALTERNATIVES_2(), i.e. so that KVM will honor the strictest mitigation (always clear CPU buffers) if multiple mitigations are selected. E.g. even if the kernel chooses to mitigate MMIO Stale Data via X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO, another mitigation may enable X86_FEATURE_CLEAR_CPU_BUF_VM, and that other thing needs to win. Note, decoupling the MMIO mitigation from the L1TF mitigation also fixes a mostly-benign flaw where KVM wouldn't do any clearing/flushing if the L1TF mitigation is configured to conditionally flush the L1D, and the MMIO mitigation but not any other "clear CPU buffers" mitigation is enabled. For that specific scenario, KVM would skip clearing CPU buffers for the MMIO mitigation even though the kernel requested a clear on every VM-Enter. Note #2, the flaw goes back to the introduction of the MDS mitigation. 
The MDS mitigation was inadvertently fixed by commit 43fb862de8f6 ("KVM/VMX: Move VERW closer to VMentry for MDS mitigation"), but previous kernels that flush CPU buffers in vmx_vcpu_enter_exit() are affected (though it's unlikely the flaw is meaningfully exploitable even on older kernels). Fixes: 650b68a0622f ("x86/kvm/vmx: Add MDS protection when L1D Flush is not active") Suggested-by: Pawan Gupta Reviewed-by: Pawan Gupta Reviewed-by: Brendan Jackman Link: https://patch.msgid.link/20251113233746.1703361-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmenter.S | 16 ++++++++++++++-- arch/x86/kvm/vmx/vmx.c | 13 ------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index b66b43c9b244..909be7bbe5bc 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -71,6 +71,7 @@ * @regs: unsigned long * (to guest registers) * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl + * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO * * Returns: * 0 on VM-Exit, 1 on VM-Fail @@ -154,8 +155,19 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Clobbers EFLAGS.ZF */ - VM_CLEAR_CPU_BUFFERS + /* + * Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is + * enabled, do VERW unconditionally. If CPU_BUF_VM_MMIO is enabled, + * check @flags to see if the vCPU has access to host MMIO, and if so, + * do VERW. Else, do nothing (no mitigations needed/enabled). + */ + ALTERNATIVE_2 "", \ + __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \ + jz .Lskip_mmio_verw; \ + VERW; \ + .Lskip_mmio_verw:), \ + X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO, \ + __stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */ testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 67702609f68e..4bb5408e68bf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7313,21 +7313,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_enter_irqoff(); - /* - * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW - * mitigation for MDS is done late in VMentry and is still - * executed in spite of L1D Flush. This is because an extra VERW - * should not matter much after the big hammer L1D Flush. - * - * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, - * and is affected by MMIO Stale Data. In such cases mitigation in only - * needed against an MMIO capable guest. - */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && - (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) - x86_clear_cpu_buffers(); vmx_disable_fb_clear(vmx); From fc704b578976ed6a937f419e611d5ae92c939826 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 15:37:43 -0800 Subject: [PATCH 126/260] x86/bugs: KVM: Move VM_CLEAR_CPU_BUFFERS into SVM as SVM_CLEAR_CPU_BUFFERS Now that VMX encodes its own sequence for clearing CPU buffers, move VM_CLEAR_CPU_BUFFERS into SVM to minimize the chances of KVM botching a mitigation in the future, e.g. using VM_CLEAR_CPU_BUFFERS instead of checking multiple mitigation flags. No functional change intended. 
Reviewed-by: Brendan Jackman Acked-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20251113233746.1703361-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/nospec-branch.h | 3 --- arch/x86/kvm/svm/vmenter.S | 6 ++++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index afdcdf40e414..a6526c5be5ca 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -329,9 +329,6 @@ #define CLEAR_CPU_BUFFERS \ ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF -#define VM_CLEAR_CPU_BUFFERS \ - ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM - #ifdef CONFIG_X86_64 .macro CLEAR_BRANCH_HISTORY ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 235c4af6b692..da5f481cb17e 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -92,6 +92,8 @@ jmp 901b .endm +#define SVM_CLEAR_CPU_BUFFERS \ + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode @@ -170,7 +172,7 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Clobbers EFLAGS.ZF */ - VM_CLEAR_CPU_BUFFERS + SVM_CLEAR_CPU_BUFFERS /* Enter guest mode */ 3: vmrun %_ASM_AX @@ -339,7 +341,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov KVM_VMCB_pa(%rax), %rax /* Clobbers EFLAGS.ZF */ - VM_CLEAR_CPU_BUFFERS + SVM_CLEAR_CPU_BUFFERS /* Enter guest mode */ 1: vmrun %rax From 0abd9610d6c6996317262f1712c959d74ed08de6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 15:37:44 -0800 Subject: [PATCH 127/260] KVM: VMX: Bundle all L1 data cache flush mitigation code together Move vmx_l1d_flush(), vmx_cleanup_l1d_flush(), and the vmentry_l1d_flush param code up in vmx.c so that all of the L1 data cache flushing code is bundled together. This will allow conditioning the mitigation code on CONFIG_CPU_MITIGATIONS=y with minimal #ifdefs. No functional change intended. Reviewed-by: Brendan Jackman Reviewed-by: Pawan Gupta Link: https://patch.msgid.link/20251113233746.1703361-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 174 ++++++++++++++++++++--------------------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4bb5408e68bf..d4de7ca2bc4b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -302,6 +302,16 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } +static void vmx_cleanup_l1d_flush(void) +{ + if (vmx_l1d_flush_pages) { + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); + vmx_l1d_flush_pages = NULL; + } + /* Restore state so sysfs ignores VMX */ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} + static int vmentry_l1d_flush_parse(const char *s) { unsigned int i; @@ -352,6 +362,83 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. 
This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + /* + * This code is only executed when the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + bool flush_l1d; + + /* + * Clear the per-vcpu flush bit, it gets set again if the vCPU + * is reloaded, i.e. if the vCPU is scheduled out or if KVM + * exits to userspace, or if KVM reaches one of the unsafe + * VMEXIT handlers, e.g. if KVM calls into the emulator. + */ + flush_l1d = vcpu->arch.l1tf_flush_l1d; + vcpu->arch.l1tf_flush_l1d = false; + + /* + * Clear the per-cpu flush bit, it gets set again from + * the interrupt handlers. + */ + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); + kvm_clear_cpu_l1tf_flush_l1d(); + + if (!flush_l1d) + return; + } + + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); +} + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) { u64 msr; @@ -404,12 +491,6 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) vmx->disable_fb_clear = false; } -static const struct kernel_param_ops vmentry_l1d_flush_ops = { - .set = vmentry_l1d_flush_set, - .get = vmentry_l1d_flush_get, -}; -module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); - static u32 vmx_segment_access_rights(struct kvm_segment *var); void vmx_vmexit(void); @@ -6646,77 +6727,6 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return ret; } -/* - * Software based L1D cache flush which is used when microcode providing - * the cache control MSR is not loaded. - * - * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to - * flush it is required to read in 64 KiB because the replacement algorithm - * is not exactly LRU. This could be sized at runtime via topology - * information but as all relevant affected CPUs have 32KiB L1D cache size - * there is no point in doing so. - */ -static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) -{ - int size = PAGE_SIZE << L1D_CACHE_ORDER; - - /* - * This code is only executed when the flush mode is 'cond' or - * 'always' - */ - if (static_branch_likely(&vmx_l1d_flush_cond)) { - bool flush_l1d; - - /* - * Clear the per-vcpu flush bit, it gets set again if the vCPU - * is reloaded, i.e. if the vCPU is scheduled out or if KVM - * exits to userspace, or if KVM reaches one of the unsafe - * VMEXIT handlers, e.g. if KVM calls into the emulator. 
- */ - flush_l1d = vcpu->arch.l1tf_flush_l1d; - vcpu->arch.l1tf_flush_l1d = false; - - /* - * Clear the per-cpu flush bit, it gets set again from - * the interrupt handlers. - */ - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); - kvm_clear_cpu_l1tf_flush_l1d(); - - if (!flush_l1d) - return; - } - - vcpu->stat.l1d_flush++; - - if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); - return; - } - - asm volatile( - /* First ensure the pages are in the TLB */ - "xorl %%eax, %%eax\n" - ".Lpopulate_tlb:\n\t" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $4096, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lpopulate_tlb\n\t" - "xorl %%eax, %%eax\n\t" - "cpuid\n\t" - /* Now fill the cache */ - "xorl %%eax, %%eax\n" - ".Lfill_cache:\n" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $64, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lfill_cache\n\t" - "lfence\n" - :: [flush_pages] "r" (vmx_l1d_flush_pages), - [size] "r" (size) - : "eax", "ebx", "ecx", "edx"); -} - void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -8651,16 +8661,6 @@ __init int vmx_hardware_setup(void) return r; } -static void vmx_cleanup_l1d_flush(void) -{ - if (vmx_l1d_flush_pages) { - free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); - vmx_l1d_flush_pages = NULL; - } - /* Restore state so sysfs ignores VMX */ - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; -} - void vmx_exit(void) { allow_smaller_maxphyaddr = false; From 05bd63959a9d682e5e765f950bb6b356d3d6d27a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 15:37:45 -0800 Subject: [PATCH 128/260] KVM: VMX: Disable L1TF L1 data cache flush if CONFIG_CPU_MITIGATIONS=n Disable support for flushing the L1 data cache to mitigate L1TF if CPU mitigations are disabled for the entire kernel. KVM's mitigation of L1TF is in no way special enough to justify ignoring CONFIG_CPU_MITIGATIONS=n. Deliberately use CPU_MITIGATIONS instead of the more precise MITIGATION_L1TF, as MITIGATION_L1TF only controls the default behavior, i.e. CONFIG_MITIGATION_L1TF=n doesn't completely disable L1TF mitigations in the kernel. Keep the vmentry_l1d_flush module param to avoid breaking existing setups, and leverage the .set path to alert the user to the fact that vmentry_l1d_flush will be ignored. Don't bother validating the incoming value; if an admin misconfigures vmentry_l1d_flush, the fact that the bad configuration won't be detected when running with CONFIG_CPU_MITIGATIONS=n is likely the least of their worries. 
Reviewed-by: Brendan Jackman Link: https://patch.msgid.link/20251113233746.1703361-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/hardirq.h | 4 +-- arch/x86/kvm/vmx/vmx.c | 56 ++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index f00c09ffe6a9..6b6d472baa0b 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -5,7 +5,7 @@ #include typedef struct { -#if IS_ENABLED(CONFIG_KVM_INTEL) +#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif unsigned int __nmi_count; /* arch dependent */ @@ -68,7 +68,7 @@ extern u64 arch_irq_stat(void); DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending); #define local_softirq_pending_ref __softirq_pending -#if IS_ENABLED(CONFIG_KVM_INTEL) +#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) /* * This function is called from noinstr interrupt contexts * and must be inlined to not get instrumentation. diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d4de7ca2bc4b..59d3f2671177 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -203,6 +203,7 @@ module_param(pt_mode, int, S_IRUGO); struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; +#ifdef CONFIG_CPU_MITIGATIONS static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -225,7 +226,7 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; -static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; unsigned int i; @@ -302,6 +303,16 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } +static int vmx_setup_l1d_flush(void) +{ + /* + * Hand the parameter mitigation value in which was stored in the pre + * module init parser. If no parameter was given, it will contain + * 'auto' which will be turned into the default 'cond' mitigation mode. 
+ */ + return __vmx_setup_l1d_flush(vmentry_l1d_flush_param); +} + static void vmx_cleanup_l1d_flush(void) { if (vmx_l1d_flush_pages) { @@ -349,7 +360,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) } mutex_lock(&vmx_l1d_flush_mutex); - ret = vmx_setup_l1d_flush(l1tf); + ret = __vmx_setup_l1d_flush(l1tf); mutex_unlock(&vmx_l1d_flush_mutex); return ret; } @@ -376,6 +387,9 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) { int size = PAGE_SIZE << L1D_CACHE_ORDER; + if (!static_branch_unlikely(&vmx_l1d_should_flush)) + return; + /* * This code is only executed when the flush mode is 'cond' or * 'always' @@ -433,6 +447,31 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) : "eax", "ebx", "ecx", "edx"); } +#else /* CONFIG_CPU_MITIGATIONS */ +static int vmx_setup_l1d_flush(void) +{ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER; + return 0; +} +static void vmx_cleanup_l1d_flush(void) +{ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} +static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + +} +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n"); + return 0; +} +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + return sysfs_emit(s, "never\n"); +} +#endif + static const struct kernel_param_ops vmentry_l1d_flush_ops = { .set = vmentry_l1d_flush_set, .get = vmentry_l1d_flush_get, @@ -7323,8 +7362,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_enter_irqoff(); - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); + vmx_l1d_flush(vcpu); vmx_disable_fb_clear(vmx); @@ -8696,14 +8734,8 @@ int __init vmx_init(void) if (r) return r; - /* - * Must be called after common x86 init so enable_ept is properly set - * up. Hand the parameter mitigation value in which was stored in - * the pre module init parser. If no parameter was given, it will - * contain 'auto' which will be turned into the default 'cond' - * mitigation mode. - */ - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + /* Must be called after common x86 init so enable_ept is setup. */ + r = vmx_setup_l1d_flush(); if (r) goto err_l1d_flush; From 38ee66cb1845dbf1e97c5e5d3db01ae4513f66a9 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 13 Nov 2025 15:37:46 -0800 Subject: [PATCH 129/260] KVM: x86: Unify L1TF flushing under per-CPU variable Currently, the need to flush L1D for L1TF is tracked by two bits: one per-CPU and one per-vCPU. The per-vCPU bit is always set when the vCPU shows up on a core, so there is no interesting state that's truly per-vCPU. Indeed, this is a requirement, since L1D is a part of the physical CPU. So simplify this by combining the two bits. The vCPU bit was being written from preemption-enabled regions. To play nice with those cases, wrap all calls from KVM and use a raw write so that requesting a flush with preemption enabled doesn't trigger what would effectively be DEBUG_PREEMPT false positives. Preemption doesn't need to be disabled, as kvm_arch_vcpu_load() will mark the new CPU as needing a flush if the vCPU task is migrated, or if userspace runs the vCPU on a different task. 
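For reference, a simplified summary of the per-CPU write flavors involved (semantics paraphrased from the kernel's percpu accessors; treat this as a sketch rather than authoritative documentation):

	/*
	 * this_cpu_write()   - preemption/IRQ safe by construction (a single
	 *                      instruction on x86), no debug checking.
	 * __this_cpu_write() - cheaper on some architectures, but complains
	 *                      under CONFIG_DEBUG_PREEMPT if the caller is
	 *                      preemptible.
	 * raw_cpu_write()    - no checking at all; the caller accepts that the
	 *                      task may migrate and update a "stale" CPU's copy.
	 */
	raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);

The raw flavor fits here because updating a stale CPU's flag is harmless: it can only cause a spurious flush, never a missed one, since kvm_arch_vcpu_load() re-arms the flag on the new CPU.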
Signed-off-by: Brendan Jackman [sean: put raw write in KVM instead of in a hardirq.h variant] Link: https://patch.msgid.link/20251113233746.1703361-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 20 +++++--------------- arch/x86/kvm/x86.c | 6 +++--- arch/x86/kvm/x86.h | 14 ++++++++++++++ 6 files changed, 24 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 692ec922945f..103af57e1060 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1055,9 +1055,6 @@ struct kvm_vcpu_arch { /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; - /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ - bool l1tf_flush_l1d; - /* Host CPU on which VM-entry was most recently attempted */ int last_vmentry_cpu; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 667d66cf76d5..a81637a98019 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4859,7 +4859,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, */ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 76271962cb70..256b51fc8c10 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3880,7 +3880,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ - vmx->vcpu.arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); /* * Must happen outside of nested_vmx_enter_non_root_mode() as it will diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 59d3f2671177..634f591d253e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -395,26 +395,16 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) * 'always' */ if (static_branch_likely(&vmx_l1d_flush_cond)) { - bool flush_l1d; - /* - * Clear the per-vcpu flush bit, it gets set again if the vCPU + * Clear the per-cpu flush bit, it gets set again if the vCPU * is reloaded, i.e. if the vCPU is scheduled out or if KVM * exits to userspace, or if KVM reaches one of the unsafe - * VMEXIT handlers, e.g. if KVM calls into the emulator. + * VMEXIT handlers, e.g. if KVM calls into the emulator, + * or from the interrupt handlers. */ - flush_l1d = vcpu->arch.l1tf_flush_l1d; - vcpu->arch.l1tf_flush_l1d = false; - - /* - * Clear the per-cpu flush bit, it gets set again from - * the interrupt handlers. 
- */ - flush_l1d = vcpu->arch.l1tf_flush_l1d; - vcpu->arch.l1tf_flush_l1d = false; - - /* - * Clear the per-cpu flush bit, it gets set again from - * the interrupt handlers. - */ - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); - kvm_clear_cpu_l1tf_flush_l1d(); - - if (!flush_l1d) + if (!kvm_get_cpu_l1tf_flush_l1d()) return; + kvm_clear_cpu_l1tf_flush_l1d(); } vcpu->stat.l1d_flush++; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be737d9645b9..6af37204bd97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5156,7 +5156,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); if (vcpu->scheduled_out && pmu->version && pmu->event_count) { pmu->need_cleanup = true; @@ -7966,7 +7966,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { /* kvm_write_guest_virt_system can pull in tons of pages. */ - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); @@ -9374,7 +9374,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return handle_emulation_failure(vcpu, emulation_type); } - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); if (!(emulation_type & EMULTYPE_NO_DECODE)) { kvm_clear_exception_queue(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f3dc77f006f9..cd67ccbb747f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -420,6 +420,20 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) return !(kvm->arch.disabled_quirks & quirk); } +static __always_inline void kvm_request_l1tf_flush_l1d(void) +{ +#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) + /* + * Use a raw write to set the per-CPU flag, as KVM will ensure a flush + * even if preemption is currently enabled. If the current vCPU task + * is migrated to a different CPU (or userspace runs the vCPU on a + * different task) before the next VM-Entry, then kvm_arch_vcpu_load() + * will request a flush on the new CPU. + */ + raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); +#endif +} + void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); From ebd1a336550096bf7543699bab2e89fa401deac6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Nov 2025 14:23:25 -0800 Subject: [PATCH 130/260] KVM: SVM: Handle #MCs in guest outside of fastpath Handle Machine Checks (#MC) that happen in the guest (by forwarding them to the host) outside of KVM's fastpath so that as much host state as possible is re-loaded before invoking the kernel's #MC handler. The only requirement is that KVM invokes the #MC handler before enabling IRQs (and even that could _probably_ be relaxed to handling #MCs before enabling preemption). Waiting to handle #MCs until "more" host state is loaded hardens KVM against flaws in the #MC handler, which has historically been quite brittle. E.g. prior to commit 5567d11c21a1 ("x86/mce: Send #MC singal from task work"), the #MC code could trigger a schedule() with IRQs and preemption disabled. That led to a KVM hack-a-fix in commit 1811d979c716 ("x86/kvm: move kvm_load/put_guest_xcr0 into atomic context"). Note, except for #MCs on VM-Enter, VMX already handles #MCs outside of the fastpath. 
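As a rough sketch of the resulting ordering in the exit path (simplified pseudo-flow, not the literal code):

	/* IRQs are disabled for this entire window. */
	vcpu_run();				/* VM-Enter/VM-Exit */
	/* ... reload segments, XCR0/XSS, debug registers, etc. ... */
	kvm_x86_call(handle_exit_irqoff)(vcpu);	/* #MC forwarded to the host here */
	local_irq_enable();

The only hard requirement is that the #MC is forwarded before IRQs are enabled; everything that can be reloaded before that point narrows the window in which a brittle #MC handler could observe half-switched state.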
Reviewed-by: Rick Edgecombe Reviewed-by: Jon Kohler Link: https://patch.msgid.link/20251118222328.2265758-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 59d896322855..ff6ddba16a26 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4325,14 +4325,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; - /* - * We need to handle MC intercepts here before the vcpu has a chance to - * change the physical cpu - */ - if (unlikely(svm->vmcb->control.exit_code == - SVM_EXIT_EXCP_BASE + MC_VECTOR)) - svm_handle_mce(vcpu); - trace_kvm_exit(vcpu, KVM_ISA_SVM); svm_complete_interrupts(vcpu); @@ -4621,8 +4613,16 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) + switch (to_svm(vcpu)->vmcb->control.exit_code) { + case SVM_EXIT_EXCP_BASE + MC_VECTOR: + svm_handle_mce(vcpu); + break; + case SVM_EXIT_INTR: vcpu->arch.at_instruction_boundary = true; + break; + default: + break; + } } static void svm_setup_mce(struct kvm_vcpu *vcpu) From 63669bd1d50f0b5cdb7bb390a0955b7b26821152 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Nov 2025 14:23:26 -0800 Subject: [PATCH 131/260] KVM: VMX: Handle #MCs on VM-Enter/TD-Enter outside of the fastpath Handle Machine Checks (#MC) that happen on VM-Enter (VMX or TDX) outside of KVM's fastpath so that as much host state as possible is re-loaded before invoking the kernel's #MC handler. The only requirement is that KVM invokes the #MC handler before enabling IRQs (and even that could _probably_ be relaxed to handling #MCs before enabling preemption). Waiting to handle #MCs until "more" host state is loaded hardens KVM against flaws in the #MC handler, which has historically been quite brittle. E.g. prior to commit 5567d11c21a1 ("x86/mce: Send #MC singal from task work"), the #MC code could trigger a schedule() with IRQs and preemption disabled. That led to a KVM hack-a-fix in commit 1811d979c716 ("x86/kvm: move kvm_load/put_guest_xcr0 into atomic context"). Note, vmx_handle_exit_irqoff() is common to VMX and TDX guests. 
Cc: Tony Lindgren
Cc: Rick Edgecombe
Cc: Jon Kohler
Reviewed-by: Tony Lindgren
Link: https://patch.msgid.link/20251118222328.2265758-3-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c |  3 ---
 arch/x86/kvm/vmx/vmx.c | 16 +++++++++++-----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 163f854a39f2..6d41d2fc8043 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1063,9 +1063,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
 		return EXIT_FASTPATH_NONE;
 
-	if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
-		kvm_machine_check();
-
 	trace_kvm_exit(vcpu, KVM_ISA_VMX);
 
 	if (unlikely(tdx_failed_vmentry(vcpu)))
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 634f591d253e..788c9e81f2ba 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7074,10 +7074,19 @@ void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 	if (to_vt(vcpu)->emulation_required)
 		return;
 
-	if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+	switch (vmx_get_exit_reason(vcpu).basic) {
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
 		handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
-	else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI)
+		break;
+	case EXIT_REASON_EXCEPTION_NMI:
 		handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
+		break;
+	case EXIT_REASON_MCE_DURING_VMENTRY:
+		kvm_machine_check();
+		break;
+	default:
+		break;
+	}
 }
 
 /*
@@ -7526,9 +7535,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	if (unlikely(vmx->fail))
 		return EXIT_FASTPATH_NONE;
 
-	if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
-		kvm_machine_check();
-
 	trace_kvm_exit(vcpu, KVM_ISA_VMX);
 
 	if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))

From 75c69c82f21176ef6780f0b82de1019f656946e1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 18 Nov 2025 14:23:27 -0800
Subject: [PATCH 132/260] KVM: x86: Load guest/host XCR0 and XSS outside of
 the fastpath run loop

Move KVM's swapping of XFEATURE masks, i.e. XCR0 and XSS, out of the
fastpath loop now that the guts of the #MC handler runs in task context,
i.e. won't invoke schedule() with preemption disabled and clobber state
(or crash the kernel) due to trying to context switch XSTATE with a mix
of host and guest state.

For all intents and purposes, this reverts commit 1811d979c716
("x86/kvm: move kvm_load/put_guest_xcr0 into atomic context"), which
papered over an egregious bug/flaw in the #MC handler where it would do
schedule() even though IRQs are disabled.  E.g. the call stack from the
commit:

  kvm_load_guest_xcr0
  ...
  kvm_x86_ops->run(vcpu)
    vmx_vcpu_run
      vmx_complete_atomic_exit
        kvm_machine_check
          do_machine_check
            do_memory_failure
              memory_failure
                lock_page

Commit 1811d979c716 "fixed" the immediate issue of XRSTORS exploding,
but completely ignored that scheduling out a vCPU task while IRQs and
preemption are disabled is wildly broken.  Thankfully, commit
5567d11c21a1 ("x86/mce: Send #MC singal from task work") (somewhat
incidentally?) fixed that flaw by pushing the meat of the work to the
user-return path, i.e. to task context.  KVM has also hardened itself
against #MC goofs by moving #MC forwarding to
kvm_x86_ops.handle_exit_irqoff(), i.e. out of the fastpath.
While that's by no means a robust fix, restoring as much state as
possible before handling the #MC will hopefully provide some measure of
protection in the event that #MC handling goes off the rails again.

Note, KVM always intercepts XCR0 writes for vCPUs without protected
state, e.g. there's no risk of consuming a stale XCR0 when determining
if a PKRU update is needed; kvm_load_host_xfeatures() only reads, and
never writes, vcpu->arch.xcr0.

Deferring the XCR0 and XSS loads shaves ~300 cycles off the fastpath for
Intel, and ~500 cycles for AMD.  E.g. using INVD in KVM-Unit-Test's
vmexit.c, with an extra hack to enable CR4.OSXSAVE, latency numbers for
AMD Turin go from ~2000 => ~1500, and for Intel Emerald Rapids, go from
~1300 => ~1000.

Cc: Jon Kohler
Reviewed-by: Rick Edgecombe
Reviewed-by: Jon Kohler
Link: https://patch.msgid.link/20251118222328.2265758-4-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6af37204bd97..f6af75bff9ad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1205,13 +1205,12 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
 
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_guest_xfeatures(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.guest_state_protected)
 		return;
 
 	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
 		if (vcpu->arch.xcr0 != kvm_host.xcr0)
 			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 
@@ -1219,6 +1218,27 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 		    vcpu->arch.ia32_xss != kvm_host.xss)
 			wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
 	}
+}
+
+static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.guest_state_protected)
+		return;
+
+	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+		if (vcpu->arch.xcr0 != kvm_host.xcr0)
+			xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+
+		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
+		    vcpu->arch.ia32_xss != kvm_host.xss)
+			wrmsrq(MSR_IA32_XSS, kvm_host.xss);
+	}
+}
+
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.guest_state_protected)
+		return;
 
 	if (cpu_feature_enabled(X86_FEATURE_PKU) &&
 	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
@@ -1240,17 +1260,6 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
 			wrpkru(vcpu->arch.host_pkru);
 	}
-
-	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
-		if (vcpu->arch.xcr0 != kvm_host.xcr0)
-			xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
-
-		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
-		    vcpu->arch.ia32_xss != kvm_host.xss)
-			wrmsrq(MSR_IA32_XSS, kvm_host.xss);
-	}
-
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
 
@@ -11264,6 +11273,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.guest_fpu.xfd_err)
 		wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
 
+	kvm_load_guest_xfeatures(vcpu);
+
 	if (unlikely(vcpu->arch.switch_db_regs &&
 		     !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
 		set_debugreg(DR7_FIXED_1, 7);
@@ -11350,6 +11361,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
+	kvm_load_host_xfeatures(vcpu);
+
 	/*
 	 * Sync xfd before calling handle_exit_irqoff() which may
 	 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
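[Note: the vmexit.c measurement cited in the two commits above boils
down to timing a trivially intercepted instruction with RDTSC. A minimal
sketch, assuming it runs inside a KVM guest where INVD is intercepted
rather than executed natively; the function names are illustrative, not
the actual KVM-Unit-Tests code:]

  #include <stdint.h>

  static inline uint64_t rdtsc_sketch(void)
  {
  	uint32_t lo, hi;

  	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
  	return ((uint64_t)hi << 32) | lo;
  }

  /* Average cycles per INVD-induced exit; guest context only! */
  static uint64_t time_invd(unsigned int iters)
  {
  	uint64_t start = rdtsc_sketch();

  	for (unsigned int i = 0; i < iters; i++)
  		asm volatile("invd" ::: "memory");

  	return (rdtsc_sketch() - start) / iters;
  }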
From 7649412af3eab700037feb14822d03eba484c576 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 18 Nov 2025 14:23:28 -0800
Subject: [PATCH 133/260] KVM: x86: Load guest/host PKRU outside of the
 fastpath run loop

Move KVM's swapping of PKRU outside of the fastpath loop, as there is no
KVM code anywhere in the fastpath that accesses guest/userspace memory,
i.e. that can consume protection keys.

As documented by commit 1be0e61c1f25 ("KVM, pkeys: save/restore PKRU
when guest/host switches"), KVM just needs to ensure the host's PKRU is
loaded when KVM (or the kernel at-large) may access userspace memory.
And at the time of commit 1be0e61c1f25, KVM didn't have a fastpath, and
PKU was strictly contained to VMX, i.e. there was no reason to swap PKRU
outside of vmx_vcpu_run().

Over time, the "need" to swap PKRU close to VM-Enter was likely falsely
solidified by the association with XFEATUREs in commit 37486135d3a7
("KVM: x86: Fix pkru save/restore when guest CR4.PKE=0, move it to
x86.c"), and XFEATURE swapping was in turn moved close to
VM-Enter/VM-Exit as a KVM hack-a-fix for an #MC handler bug by commit
1811d979c716 ("x86/kvm: move kvm_load/put_guest_xcr0 into atomic
context").

Deferring the PKRU loads shaves ~40 cycles off the fastpath for Intel,
and ~60 cycles for AMD.  E.g. using INVD in KVM-Unit-Test's vmexit.c,
with extra hacks to enable CR4.PKE and PKRU=(-1u & ~0x3), latency
numbers for AMD Turin go from ~1560 => ~1500, and for Intel Emerald
Rapids, go from ~810 => ~770.

Reviewed-by: Rick Edgecombe
Reviewed-by: Jon Kohler
Link: https://patch.msgid.link/20251118222328.2265758-5-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/svm.c |  2 --
 arch/x86/kvm/vmx/vmx.c |  4 ----
 arch/x86/kvm/x86.c     | 14 ++++++++++----
 arch/x86/kvm/x86.h     |  2 --
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ff6ddba16a26..c5b588d621c7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4250,7 +4250,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 		svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
 
 	clgi();
-	kvm_load_guest_xsave_state(vcpu);
 
 	/*
 	 * Hardware only context switches DEBUGCTL if LBR virtualization is
@@ -4293,7 +4292,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
 		update_debugctlmsr(vcpu->arch.host_debugctl);
 
-	kvm_load_host_xsave_state(vcpu);
 	stgi();
 
 	/* Any pending NMI will happen here */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 788c9e81f2ba..839b92b3d0db 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7473,8 +7473,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
-	kvm_load_guest_xsave_state(vcpu);
-
 	pt_guest_enter(vmx);
 
 	atomic_switch_perf_msrs(vmx);
@@ -7518,8 +7516,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
 	pt_guest_exit(vmx);
 
-	kvm_load_host_xsave_state(vcpu);
-
 	if (is_guest_mode(vcpu)) {
 		/*
 		 * Track VMLAUNCH/VMRESUME that have made past guest state
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f6af75bff9ad..1ef77a1be9b2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1235,7 +1235,7 @@ static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
 	}
 }
 
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu)
 {
 	if
(vcpu->arch.guest_state_protected) return; @@ -1246,9 +1246,8 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) wrpkru(vcpu->arch.pkru); } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state); -void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) +static void kvm_load_host_pkru(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_state_protected) return; @@ -1261,7 +1260,6 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) wrpkru(vcpu->arch.host_pkru); } } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state); #ifdef CONFIG_X86_64 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) @@ -11303,6 +11301,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) guest_timing_enter_irqoff(); + /* + * Swap PKRU with hardware breakpoints disabled to minimize the number + * of flows where non-KVM code can run with guest state loaded. + */ + kvm_load_guest_pkru(vcpu); + for (;;) { /* * Assert that vCPU vs. VM APICv state is consistent. An APICv @@ -11331,6 +11335,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; } + kvm_load_host_pkru(vcpu); + /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cd67ccbb747f..fdab0ad49098 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -636,8 +636,6 @@ static inline void kvm_machine_check(void) #endif } -void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); -void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); From 0b28f21ad46200fec9f8b8058692902d0de98221 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Mon, 10 Nov 2025 13:05:39 +0800 Subject: [PATCH 134/260] KVM: x86: Add a helper to dedup loading guest/host XCR0 and XSS Add and use a helper, kvm_load_xfeatures(), to dedup the code that loads guest/host xfeatures. Opportunistically return early if X86_CR4_OSXSAVE is not set to reduce indentations. No functional change intended. 
Suggested-by: Chao Gao Reviewed-by: Chao Gao Signed-off-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://patch.msgid.link/20251110050539.3398759-1-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ef77a1be9b2..aff32603a043 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1205,34 +1205,21 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); -static void kvm_load_guest_xfeatures(struct kvm_vcpu *vcpu) +static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest) { if (vcpu->arch.guest_state_protected) return; - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { - if (vcpu->arch.xcr0 != kvm_host.xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss); - } -} - -static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.guest_state_protected) + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) return; - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { - if (vcpu->arch.xcr0 != kvm_host.xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); + if (vcpu->arch.xcr0 != kvm_host.xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, + load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0); - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrq(MSR_IA32_XSS, kvm_host.xss); - } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && + vcpu->arch.ia32_xss != kvm_host.xss) + wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss); } static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu) @@ -11271,7 +11258,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_fpu.xfd_err) wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); - kvm_load_guest_xfeatures(vcpu); + kvm_load_xfeatures(vcpu, true); if (unlikely(vcpu->arch.switch_db_regs && !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { @@ -11367,7 +11354,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_load_host_xfeatures(vcpu); + kvm_load_xfeatures(vcpu, false); /* * Sync xfd before calling handle_exit_irqoff() which may From 297877069bc2fa079fb2a60ae91ca9abb481074a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Nov 2025 01:38:21 -0800 Subject: [PATCH 135/260] KVM: arm64: Drop useless __GFP_HIGHMEM from kvm struct allocation A recent change on the receiving end of vmalloc() started warning about unsupported GFP flags passed by the caller. Nathan reports that this warning fires in kvm_arch_alloc_vm(), owing to the fact that KVM is passing a meaningless __GFP_HIGHMEM. Do as the warning says and fix the code. 
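[Note: conceptually, the warning this fixes comes from vmalloc
validating the GFP subset it supports; a hedged sketch of that kind of
check, not the literal mm/vmalloc.c code, with an assumed flag mask:]

  /*
   * vmalloc pages are always mapped into the kernel's address space, so
   * __GFP_HIGHMEM is meaningless there and now trips a warning.
   */
  #define VMALLOC_SUPPORTED_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT)

  static void *vmalloc_checked_sketch(size_t size, gfp_t flags)
  {
  	WARN_ON_ONCE(flags & ~VMALLOC_SUPPORTED_GFP);

  	return __vmalloc(size, flags & VMALLOC_SUPPORTED_GFP);
  }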
Cc: Vishal Moola (Oracle)
Reported-by: Nathan Chancellor
Closes: https://lore.kernel.org/kvmarm/20251118224448.GA998046@ax162/
Acked-by: Vishal Moola (Oracle)
Reviewed-by: Marc Zyngier
Reviewed-by: Joey Gouly
Link: https://msgid.link/20251119093822.2513142-2-oupton@kernel.org
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/arm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 870953b4a8a7..e791fa52f874 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -440,7 +440,7 @@ struct kvm *kvm_arch_alloc_vm(void)
 	if (!has_vhe())
 		return kzalloc(sz, GFP_KERNEL_ACCOUNT);
 
-	return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
+	return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
 }
 
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)

From cb17d79ff51d41f656bcf7928330b2e9c0003583 Mon Sep 17 00:00:00 2001
From: Oliver Upton
Date: Wed, 19 Nov 2025 01:38:22 -0800
Subject: [PATCH 136/260] KVM: arm64: Use kvzalloc() for kvm struct allocation

A physically-contiguous allocation of the kvm struct isn't necessary in
VHE mode, as there's no need to share it with the hyp's address space.
Of course, there can still be a performance benefit from a contiguous
allocation. Use kvzalloc() for opportunistic physically-contiguous
allocations.

Acked-by: Vishal Moola (Oracle)
Reviewed-by: Marc Zyngier
Reviewed-by: Joey Gouly
Link: https://msgid.link/20251119093822.2513142-3-oupton@kernel.org
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/arm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e791fa52f874..ecbe2c8dc00c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -440,7 +440,7 @@ struct kvm *kvm_arch_alloc_vm(void)
 	if (!has_vhe())
 		return kzalloc(sz, GFP_KERNEL_ACCOUNT);
 
-	return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	return kvzalloc(sz, GFP_KERNEL_ACCOUNT);
 }
 
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)

From 31df012da496968d8d4368f693ad45dfcbfba40b Mon Sep 17 00:00:00 2001
From: Maximilian Dittgen
Date: Wed, 19 Nov 2025 14:57:43 +0100
Subject: [PATCH 137/260] KVM: selftests: Assert GICR_TYPER.Processor_Number
 matches selftest CPU number

The selftests GIC library and tests assume that the
GICR_TYPER.Processor_number associated with a given CPU is the same as
the CPU's selftest index. Since this assumption is not guaranteed by
specification, add an assert in gicv3_cpu_init() that validates this is
true.
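[Note: the check itself is cheap. A minimal sketch of the extraction,
assuming Processor_Number occupies GICR_TYPER[23:8] per the Arm GIC
architecture spec; the macro name mirrors the one the selftest uses:]

  #include <stdint.h>

  #define GICR_TYPER_CPU_NUMBER(t)	(((t) >> 8) & 0xffffULL)

  static inline int procnum_matches_cpu(uint64_t typer, unsigned int cpu)
  {
  	return GICR_TYPER_CPU_NUMBER(typer) == cpu;
  }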
Signed-off-by: Maximilian Dittgen Link: https://msgid.link/20251119135744.68552-1-mdittgen@amazon.de Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/lib/arm64/gic_v3.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c index 66d05506f78b..f81025cd32e2 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -298,12 +298,17 @@ static void gicv3_cpu_init(unsigned int cpu) volatile void *sgi_base; unsigned int i; volatile void *redist_base_cpu; + u64 typer; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); + /* Verify assumption that GICR_TYPER.Processor_number == cpu */ + typer = readq_relaxed(redist_base_cpu + GICR_TYPER); + GUEST_ASSERT_EQ(GICR_TYPER_CPU_NUMBER(typer), cpu); + gicv3_enable_redist(redist_base_cpu); /* From 85f329df293119d6ba1a26453d109556631081a4 Mon Sep 17 00:00:00 2001 From: Maximilian Dittgen Date: Wed, 19 Nov 2025 14:57:44 +0100 Subject: [PATCH 138/260] KVM: selftests: SYNC after guest ITS setup in vgic_lpi_stress vgic_lpi_stress sends MAPTI and MAPC commands during guest GIC setup to map interrupt events to ITT entries and collection IDs to redistributors, respectively. We have no guarantee that the ITS will finish handling these mapping commands before the selftest calls KVM_SIGNAL_MSI to inject LPIs to the guest. If LPIs are injected before ITS mapping completes, the ITS cannot properly pass the interrupt on to the redistributor. Fix by adding a SYNC command to the selftests ITS library, then calling SYNC after ITS mapping to ensure mapping completes before signal_lpi() writes to GITS_TRANSLATER. 
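[Note: a hedged sketch of the ordering this enforces, simplified from
the selftest; its_send_sync_cmd() is the helper added by the diff below,
and SYNC acts as a per-redistributor barrier for all previously queued
ITS commands:]

  static void guest_setup_sketch(void *cmdq, unsigned int nr_cpus)
  {
  	/* ... MAPD/MAPC/MAPTI commands queued here ... */

  	for (unsigned int cpu = 0; cpu < nr_cpus; cpu++)
  		its_send_sync_cmd(cmdq, cpu);

  	/* Only now may the host inject LPIs via GITS_TRANSLATER. */
  }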
Signed-off-by: Maximilian Dittgen Link: https://msgid.link/20251119135744.68552-2-mdittgen@amazon.de Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c | 4 ++++ tools/testing/selftests/kvm/include/arm64/gic_v3_its.h | 1 + tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c | 10 ++++++++++ 3 files changed, 15 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c index 687d04463983..e857a605f577 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -118,6 +118,10 @@ static void guest_setup_gic(void) guest_setup_its_mappings(); guest_invalidate_all_rdists(); + + /* SYNC to ensure ITS setup is complete */ + for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++) + its_send_sync_cmd(test_data.cmdq_base_va, cpuid); } static void guest_code(size_t nr_lpis) diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h index 3722ed9c8f96..58feef3eb386 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h +++ b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h @@ -15,5 +15,6 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, u32 collection_id, u32 intid); void its_send_invall_cmd(void *cmdq_base, u32 collection_id); +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id); #endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 09f270545646..aec1b69a4de3 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -246,3 +246,13 @@ void its_send_invall_cmd(void *cmdq_base, u32 collection_id) its_send_cmd(cmdq_base, &cmd); } + +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_SYNC); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); + + its_send_cmd(cmdq_base, &cmd); +} From 156f70afcfecfc45be5fdc2e4adebc5ea70a93b0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Nov 2025 14:11:50 -0800 Subject: [PATCH 139/260] KVM: arm64: Only drop references on empty tables in stage2_free_walker A subsequent change to the way KVM frees stage-2s will invoke the free walker on sub-ranges of the VM's IPA space, meaning there's potential for only partially visiting a table's PTEs. Split the leaf and table visitors and only drop references on a table when the page count reaches 1, implying there are no valid PTEs that need to be visited. Invalidate the table PTE to avoid traversing the stale reference. 
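[Note: the refcounting the walker relies on is worth spelling out. A toy
model (plain C, not kernel code) of why page_count() == 1 means "empty
table":]

  /*
   * A table page holds one self-reference, plus one reference per
   * counted PTE installed in it.  A partial walk only drops the PTE
   * references inside the visited sub-range, so the table itself may be
   * freed iff only the self-reference remains.
   */
  struct toy_table {
  	int refcount;	/* 1 (self) + number of counted PTEs */
  };

  static void toy_clear_pte(struct toy_table *t)
  {
  	t->refcount--;	/* leaf visitor: mm_ops->put_page(ctx->ptep) */
  }

  static int toy_table_is_empty(const struct toy_table *t)
  {
  	return t->refcount == 1;	/* page_count(childp) == 1 */
  }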
Link: https://msgid.link/20251113052452.975081-2-rananta@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/hyp/pgtable.c | 42 +++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c351b4abd5db..6d6a23f7dedb 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1535,20 +1535,46 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
 	return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
 }
 
-static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
-			      enum kvm_pgtable_walk_flags visit)
+static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx)
 {
 	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 
+	mm_ops->put_page(ctx->ptep);
+	return 0;
+}
+
+static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx)
+{
+	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+	kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
+
+	if (mm_ops->page_count(childp) != 1)
+		return 0;
+
+	/*
+	 * Drop references and clear the now stale PTE to avoid rewalking the
+	 * freed page table.
+	 */
+	mm_ops->put_page(ctx->ptep);
+	mm_ops->put_page(childp);
+	kvm_clear_pte(ctx->ptep);
+	return 0;
+}
+
+static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
+			      enum kvm_pgtable_walk_flags visit)
+{
 	if (!stage2_pte_is_counted(ctx->old))
 		return 0;
 
-	mm_ops->put_page(ctx->ptep);
-
-	if (kvm_pte_table(ctx->old, ctx->level))
-		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
-
-	return 0;
+	switch (visit) {
+	case KVM_PGTABLE_WALK_LEAF:
+		return stage2_free_leaf(ctx);
+	case KVM_PGTABLE_WALK_TABLE_POST:
+		return stage2_free_table_post(ctx);
+	default:
+		return -EINVAL;
+	}
 }
 
 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)

From d68d66e57e2be9541fa67bafabdfe8826bb73799 Mon Sep 17 00:00:00 2001
From: Raghavendra Rao Ananta
Date: Thu, 13 Nov 2025 05:24:51 +0000
Subject: [PATCH 140/260] KVM: arm64: Split kvm_pgtable_stage2_destroy()

Split kvm_pgtable_stage2_destroy() into two:
- kvm_pgtable_stage2_destroy_range(), that performs the page-table walk
  and frees the entries over a range of addresses.
- kvm_pgtable_stage2_destroy_pgd(), that frees the PGD.

This refactoring enables subsequent patches to free large page-tables in
chunks, calling cond_resched() between each chunk, to yield the CPU as
necessary. Existing callers of kvm_pgtable_stage2_destroy(), which
probably cannot take advantage of this (such as nVHE), will continue to
function as is.
Signed-off-by: Raghavendra Rao Ananta Suggested-by: Oliver Upton Link: https://msgid.link/20251113052452.975081-3-rananta@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 30 ++++++++++++++++++++++++++++ arch/arm64/include/asm/kvm_pkvm.h | 4 +++- arch/arm64/kvm/hyp/pgtable.c | 25 +++++++++++++++++++---- arch/arm64/kvm/mmu.c | 12 +++++++++-- arch/arm64/kvm/pkvm.c | 11 ++++++++-- 5 files changed, 73 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 2888b5d03757..1246216616b5 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -355,6 +355,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return pteref; } +static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) +{ + return pteref; +} + static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { /* @@ -384,6 +389,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } +static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) +{ + return rcu_dereference_raw(pteref); +} + static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) @@ -551,6 +561,26 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2 */ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); +/** + * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * @addr: Intermediate physical address at which to place the mapping. + * @size: Size of the mapping. + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * + * It is assumed that the rest of the page-table is freed before this operation. + */ +void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); + /** * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. 
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 08be89c95466..0aecd4ac5f45 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -180,7 +180,9 @@ struct pkvm_mapping {
 
 int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
 			     struct kvm_pgtable_mm_ops *mm_ops);
-void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+				       u64 addr, u64 size);
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
 int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
 			    u64 phys, enum kvm_pgtable_prot prot,
 			    void *mc, enum kvm_pgtable_walk_flags flags);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 6d6a23f7dedb..0882896dbf8f 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1577,21 +1577,38 @@ static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
 	}
 }
 
-void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+				      u64 addr, u64 size)
 {
-	size_t pgd_sz;
 	struct kvm_pgtable_walker walker = {
 		.cb	= stage2_free_walker,
 		.flags	= KVM_PGTABLE_WALK_LEAF |
 			  KVM_PGTABLE_WALK_TABLE_POST,
 	};
 
-	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+}
+
+void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+	size_t pgd_sz;
+
 	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
-	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
+
+	/*
+	 * Since the pgtable is unlinked at this point, and not shared with
+	 * other walkers, safely dereference pgd with kvm_dereference_pteref_raw().
+	 */
+	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz);
 	pgt->pgd = NULL;
 }
 
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+	kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
+	kvm_pgtable_stage2_destroy_pgd(pgt);
+}
+
 void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
 {
 	kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7cc964af8d30..c2bc1eba032c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -904,6 +904,14 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
 	return 0;
 }
 
+static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
+{
+	unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
+
+	KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits));
+	KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
+}
+
 /**
  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
  * @kvm:	The pointer to the KVM structure
@@ -980,7 +988,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	return 0;
 
 out_destroy_pgtable:
-	KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+	kvm_stage2_destroy(pgt);
 out_free_pgtable:
 	kfree(pgt);
 	return err;
@@ -1081,7 +1089,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	write_unlock(&kvm->mmu_lock);
 
 	if (pgt) {
-		KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+		kvm_stage2_destroy(pgt);
 		kfree(pgt);
 	}
 }
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 24f0f8a8c943..d7a0f69a9982 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -344,9 +344,16 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
 	return 0;
 }
 
-void
pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+				       u64 addr, u64 size)
 {
-	__pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
+	__pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+}
+
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+	/* Expected to be called after all pKVM mappings have been released. */
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
 }
 
 int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,

From 4ddfab5436b6918ce4e28f610e670e040a304152 Mon Sep 17 00:00:00 2001
From: Raghavendra Rao Ananta
Date: Thu, 13 Nov 2025 05:24:52 +0000
Subject: [PATCH 141/260] KVM: arm64: Reschedule as needed when destroying the
 stage-2 page-tables

When a large VM, specifically one that holds a significant number of PTEs,
gets abruptly destroyed, the following warning is seen during the
page-table walk:

 sched: CPU 0 need_resched set for > 100018840 ns (100 ticks) without schedule
 CPU: 0 UID: 0 PID: 9617 Comm: kvm_page_table_ Tainted: G O 6.16.0-smp-DEV #3 NONE
 Tainted: [O]=OOT_MODULE
 Call trace:
  show_stack+0x20/0x38 (C)
  dump_stack_lvl+0x3c/0xb8
  dump_stack+0x18/0x30
  resched_latency_warn+0x7c/0x88
  sched_tick+0x1c4/0x268
  update_process_times+0xa8/0xd8
  tick_nohz_handler+0xc8/0x168
  __hrtimer_run_queues+0x11c/0x338
  hrtimer_interrupt+0x104/0x308
  arch_timer_handler_phys+0x40/0x58
  handle_percpu_devid_irq+0x8c/0x1b0
  generic_handle_domain_irq+0x48/0x78
  gic_handle_irq+0x1b8/0x408
  call_on_irq_stack+0x24/0x30
  do_interrupt_handler+0x54/0x78
  el1_interrupt+0x44/0x88
  el1h_64_irq_handler+0x18/0x28
  el1h_64_irq+0x84/0x88
  stage2_free_walker+0x30/0xa0 (P)
  __kvm_pgtable_walk+0x11c/0x258
  __kvm_pgtable_walk+0x180/0x258
  __kvm_pgtable_walk+0x180/0x258
  __kvm_pgtable_walk+0x180/0x258
  kvm_pgtable_walk+0xc4/0x140
  kvm_pgtable_stage2_destroy+0x5c/0xf0
  kvm_free_stage2_pgd+0x6c/0xe8
  kvm_uninit_stage2_mmu+0x24/0x48
  kvm_arch_flush_shadow_all+0x80/0xa0
  kvm_mmu_notifier_release+0x38/0x78
  __mmu_notifier_release+0x15c/0x250
  exit_mmap+0x68/0x400
  __mmput+0x38/0x1c8
  mmput+0x30/0x68
  exit_mm+0xd4/0x198
  do_exit+0x1a4/0xb00
  do_group_exit+0x8c/0x120
  get_signal+0x6d4/0x778
  do_signal+0x90/0x718
  do_notify_resume+0x70/0x170
  el0_svc+0x74/0xd8
  el0t_64_sync_handler+0x60/0xc8
  el0t_64_sync+0x1b0/0x1b8

The warning is seen mostly on host kernels that are configured not to
force-preempt, such as CONFIG_PREEMPT_NONE=y. To avoid this, instead of
walking the entire page-table in one go, split it into smaller ranges
and call cond_resched() between each range. Since the path is executed
during VM destruction, after the page-table structure is unlinked from
the KVM MMU, relying on cond_resched_rwlock_write() isn't necessary.

Signed-off-by: Raghavendra Rao Ananta
Link: https://msgid.link/20251113052452.975081-4-rananta@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/mmu.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c2bc1eba032c..f86d17ad50a7 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -904,11 +904,35 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
 	return 0;
 }
 
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels.
Hence, invoke + * cond_resched() periodically to prevent hogging the CPU for a long time + * and schedule something else, if required. + */ +static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, + phys_addr_t end) +{ + u64 next; + + do { + next = stage2_range_addr_end(addr, end); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, + next - addr); + if (next != end) + cond_resched(); + } while (addr = next, addr != end); +} + static void kvm_stage2_destroy(struct kvm_pgtable *pgt) { unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); - KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits)); + stage2_destroy_range(pgt, 0, BIT(ia_bits)); KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); } From c57d9bafbd0b89709c1bbbda346cf81dea5a3224 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:24 -0500 Subject: [PATCH 142/260] KVM: x86: Add support for emulating MOVNTDQA MOVNTDQA is a simple MOV instruction, in fact it has the same characteristics as 0F E7 (MOVNTDQ) other than the aligned-address requirement. Signed-off-by: Paolo Bonzini Link: https://patch.msgid.link/20251114003633.60689-2-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4e3da5b497b8..43ae4fcb2137 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4133,7 +4133,7 @@ static const struct gprefix pfx_0f_28_0f_29 = { I(Aligned, em_mov), I(Aligned, em_mov), N, N, }; -static const struct gprefix pfx_0f_e7 = { +static const struct gprefix pfx_0f_e7_0f_38_2a = { N, I(Sse, em_mov), N, N, }; @@ -4431,7 +4431,7 @@ static const struct opcode twobyte_table[256] = { /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ - N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7), + N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, N, N, N, /* 0xF0 - 0xFF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N @@ -4458,8 +4458,13 @@ static const struct gprefix three_byte_0f_38_f1 = { * byte. */ static const struct opcode opcode_map_0f_38[256] = { - /* 0x00 - 0x7f */ - X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0x00 - 0x1f */ + X16(N), X16(N), + /* 0x20 - 0x2f */ + X8(N), + X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, + /* 0x30 - 0x7f */ + X16(N), X16(N), X16(N), X16(N), X16(N), /* 0x80 - 0xef */ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), /* 0xf0 - 0xf1 */ From 3f3fc58df502567fb26703276691299c2b712996 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:25 -0500 Subject: [PATCH 143/260] KVM: x86: Move Src2Shift up one bit (use bits 36:32 for Src2 in the emulator) An irresistible microoptimization (changing accesses to Src2 to just an AND :)) that also frees a bit for AVX in the low flags word. This makes it closer to SSE since both of them can access XMM registers, pointlessly shaving another clock cycle or two (maybe). No functional change intended. Signed-off-by: Paolo Bonzini Reviewed-by: Chang S. 
Bae --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 43ae4fcb2137..57799b5d9da2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -147,7 +147,7 @@ #define PageTable (1 << 29) /* instruction used to write page table */ #define NotImpl (1 << 30) /* instruction is not implemented */ /* Source 2 operand type */ -#define Src2Shift (31) +#define Src2Shift (32) /* bits 32-36 */ #define Src2None (OpNone << Src2Shift) #define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) @@ -161,6 +161,7 @@ #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) +/* free: 37-39 */ #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ #define AlignMask ((u64)7 << 41) #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ From 3d8834a0d1c984c7bda3d8a4d618026d8b3bcbd3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:26 -0500 Subject: [PATCH 144/260] KVM: x86: Improve formatting of the emulator's flags table Align a little better the comments on the right side and list explicitly the bits used by multi-bit fields. No functional change intended. Signed-off-by: Paolo Bonzini Reviewed-by: Chang S. Bae Link: https://patch.msgid.link/20251114003633.60689-4-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 57799b5d9da2..70d0f8561097 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -81,9 +81,8 @@ */ /* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ -/* Destination operand type. */ -#define DstShift 1 +#define ByteOp (1<<0) /* 8-bit operands. */ +#define DstShift 1 /* Destination operand type at bits 1-5 */ #define ImplicitOps (OpImplicit << DstShift) #define DstReg (OpReg << DstShift) #define DstMem (OpMem << DstShift) @@ -95,8 +94,7 @@ #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) #define DstMask (OpMask << DstShift) -/* Source operand type. */ -#define SrcShift 6 +#define SrcShift 6 /* Source operand type at bits 6-10 */ #define SrcNone (OpNone << SrcShift) #define SrcReg (OpReg << SrcShift) #define SrcMem (OpMem << SrcShift) @@ -119,10 +117,10 @@ #define SrcAccHi (OpAccHi << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) -#define MemAbs (1<<12) /* Memory operand is absolute displacement */ +#define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ #define Stack (1<<14) /* Stack instruction (push/pop) */ -#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */ +#define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */ #define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ @@ -131,11 +129,8 @@ #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ #define ModeDual (7<<15) /* Different instruction for 32/64 bit */ #define Sse (1<<18) /* SSE Vector instruction */ -/* Generic ModRM decode. */ -#define ModRM (1<<19) -/* Destination is only written; never read. 
*/ -#define Mov (1<<20) -/* Misc flags */ +#define ModRM (1<<19) /* Generic ModRM decode. */ +#define Mov (1<<20) /* Destination is only written; never read. */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ @@ -143,11 +138,10 @@ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ -#define No64 (1<<28) +#define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */ #define PageTable (1 << 29) /* instruction used to write page table */ #define NotImpl (1 << 30) /* instruction is not implemented */ -/* Source 2 operand type */ -#define Src2Shift (32) /* bits 32-36 */ +#define Src2Shift (32) /* Source 2 operand type at bits 32-36 */ #define Src2None (OpNone << Src2Shift) #define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) @@ -163,11 +157,12 @@ #define Src2Mask (OpMask << Src2Shift) /* free: 37-39 */ #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ -#define AlignMask ((u64)7 << 41) +#define AlignMask ((u64)7 << 41) /* Memory alignment requirement at bits 41-43 */ #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ #define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ #define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ #define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ +/* free: 44 */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ From 1a84b07acaa45bb2e4a1d09be26b0ba6270929c3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:27 -0500 Subject: [PATCH 145/260] KVM: x86: Move op_prefix to struct x86_emulate_ctxt (from x86_decode_insn()) VEX decode will need to set it based on the "pp" bits, so make it a field in the struct rather than a local variable. No functional change intended. Signed-off-by: Paolo Bonzini Reviewed-by: Chang S. Bae Link: https://patch.msgid.link/20251114003633.60689-5-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 8 ++++---- arch/x86/kvm/kvm_emulate.h | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 70d0f8561097..23019928734a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4761,7 +4761,6 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; - bool op_prefix = false; bool has_seg_override = false; struct opcode opcode; u16 dummy; @@ -4813,7 +4812,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int for (;;) { switch (ctxt->b = insn_fetch(u8, ctxt)) { case 0x66: /* operand-size override */ - op_prefix = true; + ctxt->op_prefix = true; /* switch between 2/4 bytes */ ctxt->op_bytes = def_op_bytes ^ 6; break; @@ -4920,9 +4919,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int opcode = opcode.u.group[goffset]; break; case Prefix: - if (ctxt->rep_prefix && op_prefix) + if (ctxt->rep_prefix && ctxt->op_prefix) return EMULATION_FAILED; - simd_prefix = op_prefix ? 
0x66 : ctxt->rep_prefix; + simd_prefix = ctxt->op_prefix ? 0x66 : ctxt->rep_prefix; switch (simd_prefix) { case 0x00: opcode = opcode.u.gprefix->pfx_no; break; case 0x66: opcode = opcode.u.gprefix->pfx_66; break; @@ -5140,6 +5139,7 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt) ctxt->rip_relative = false; ctxt->rex_prefix = 0; ctxt->lock_prefix = 0; + ctxt->op_prefix = false; ctxt->rep_prefix = 0; ctxt->regs_valid = 0; ctxt->regs_dirty = 0; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 7b5ddb787a25..83af019620e3 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -348,6 +348,7 @@ struct x86_emulate_ctxt { u8 opcode_len; u8 b; u8 intercept; + bool op_prefix; u8 op_bytes; u8 ad_bytes; union { From 7e11eec989c840ff8cd11cee5005c7ce68ef1ab3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:28 -0500 Subject: [PATCH 146/260] KVM: x86: Share emulator's common register decoding code Remove all duplicate handling of register operands, including picking the right register class and fetching it, by extracting a new function that can be used for both REG and MODRM operands. Centralize setting op->orig_val = op->val in fetch_register_operand() as well. No functional change intended. Signed-off-by: Paolo Bonzini Reviewed-by: Chang S. Bae Link: https://patch.msgid.link/20251114003633.60689-6-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 49 +++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 23019928734a..8a755ef30c18 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1026,6 +1026,7 @@ static void fetch_register_operand(struct operand *op) op->val = *(u64 *)op->addr.reg; break; } + op->orig_val = op->val; } static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1071,16 +1072,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static void decode_register_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op) +static void __decode_register_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op, int reg) { - unsigned int reg; - - if (ctxt->d & ModRM) - reg = ctxt->modrm_reg; - else - reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); - if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; @@ -1099,9 +1093,20 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); - fetch_register_operand(op); - op->orig_val = op->val; +} + +static void decode_register_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op) +{ + unsigned int reg; + + if (ctxt->d & ModRM) + reg = ctxt->modrm_reg; + else + reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); + + __decode_register_operand(ctxt, op, reg); } static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) @@ -1128,24 +1133,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { - op->type = OP_REG; - op->bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; - op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, - ctxt->d & ByteOp); - if (ctxt->d & Sse) { - op->type = OP_XMM; - op->bytes = 16; - op->addr.xmm = ctxt->modrm_rm; - kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val); - return rc; - } - if (ctxt->d & Mmx) { - op->type = OP_MM; - op->bytes = 8; - op->addr.mm = ctxt->modrm_rm & 7; - return rc; - } - fetch_register_operand(op); + __decode_register_operand(ctxt, op, ctxt->modrm_rm); return rc; } @@ -4619,14 +4607,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); fetch_register_operand(op); - op->orig_val = op->val; break; case OpAccLo: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); fetch_register_operand(op); - op->orig_val = op->val; break; case OpAccHi: if (ctxt->d & ByteOp) { @@ -4637,7 +4623,6 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->bytes = ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); fetch_register_operand(op); - op->orig_val = op->val; break; case OpDI: op->type = OP_MEM; From f106797f81d633010e6795e34871a31883c9791f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:29 -0500 Subject: [PATCH 147/260] KVM: x86: Add x86_emulate_ops.get_xcr() callback This will be necessary in order to check whether AVX is enabled. Signed-off-by: Paolo Bonzini Reviewed-by: Chang S. Bae Link: https://patch.msgid.link/20251114003633.60689-7-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 83af019620e3..5f9d69c64cd5 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -237,6 +237,7 @@ struct x86_emulate_ops { bool (*is_smm)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); + int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aff32603a043..35e7ca67afc9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8804,6 +8804,14 @@ static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); } +static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr) +{ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + *xcr = emul_to_vcpu(ctxt)->arch.xcr0; + return 0; +} + static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) { return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); @@ -8876,6 +8884,7 @@ static const struct x86_emulate_ops emulate_ops = { .is_smm = emulator_is_smm, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, + .get_xcr = emulator_get_xcr, .set_xcr = emulator_set_xcr, .get_untagged_addr = emulator_get_untagged_addr, .is_canonical_addr = emulator_is_canonical_addr, From 4cb21be4c3b0cb6248da1dcc4f19f05d3443a235 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:30 -0500 Subject: [PATCH 148/260] KVM: x86: Add AVX support to the emulator's register fetch and writeback Prepare struct operand for hosting AVX 
registers. Remove the existing, incomplete code that placed the Avx flag
in the operand alignment field, and repurpose the name for a separate
bit that indicates:

- after decode, whether an instruction supports the VEX prefix;

- before writeback, that the instruction did have the VEX prefix and
  therefore 1) it can have op_bytes == 32; 2) it should clear the high
  bytes of XMM registers.

Right now the bit will never be set and the patch has no intended
functional change. However, this is actually more vexing than the
decoder changes themselves, and therefore worth separating.

Co-developed-by: Keith Busch
Signed-off-by: Keith Busch
Signed-off-by: Paolo Bonzini
Link: https://patch.msgid.link/20251114003633.60689-8-pbonzini@redhat.com
[sean: guard ymm[8-15] accesses with #ifdef CONFIG_X86_64]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/emulate.c     | 58 +++++++++++++++++++++++++--------
 arch/x86/kvm/fpu.h         | 66 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/kvm_emulate.h |  7 ++--
 3 files changed, 114 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8a755ef30c18..531dac5cf3dc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -141,6 +141,7 @@
 #define No64	    (1<<28) /* Instruction generates #UD in 64-bit mode */
 #define PageTable   (1 << 29)   /* instruction used to write page table */
 #define NotImpl     (1 << 30)   /* instruction is not implemented */
+#define Avx         ((u64)1 << 31)  /* Instruction uses VEX prefix */
 #define Src2Shift   (32)        /* Source 2 operand type at bits 32-36 */
 #define Src2None    (OpNone << Src2Shift)
 #define Src2Mem     (OpMem << Src2Shift)
@@ -157,12 +158,11 @@
 #define Src2Mask    (OpMask << Src2Shift)
 /* free: 37-39 */
 #define Mmx         ((u64)1 << 40)  /* MMX Vector instruction */
-#define AlignMask   ((u64)7 << 41)  /* Memory alignment requirement at bits 41-43 */
+#define AlignMask   ((u64)3 << 41)  /* Memory alignment requirement at bits 41-42 */
 #define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
 #define Unaligned   ((u64)2 << 41)  /* Explicitly unaligned (e.g. MOVDQU) */
-#define Avx         ((u64)3 << 41)  /* Advanced Vector Extensions */
-#define Aligned16   ((u64)4 << 41)  /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+#define Aligned16   ((u64)3 << 41)  /* Aligned to 16 byte boundary (e.g.
FXSAVE) */ +/* free: 43-44 */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ @@ -618,7 +618,6 @@ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) switch (alignment) { case Unaligned: - case Avx: return 1; case Aligned16: return 16; @@ -1075,7 +1074,14 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) static void __decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, int reg) { - if (ctxt->d & Sse) { + if ((ctxt->d & Avx) && ctxt->op_bytes == 32) { + op->type = OP_YMM; + op->bytes = 32; + op->addr.xmm = reg; + kvm_read_avx_reg(reg, &op->vec_val2); + return; + } + if (ctxt->d & (Avx|Sse)) { op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; @@ -1767,7 +1773,15 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->data, op->bytes * op->count); case OP_XMM: - kvm_write_sse_reg(op->addr.xmm, &op->vec_val); + if (!(ctxt->d & Avx)) { + kvm_write_sse_reg(op->addr.xmm, &op->vec_val); + break; + } + /* full YMM write but with high bytes cleared */ + memset(op->valptr + 16, 0, 16); + fallthrough; + case OP_YMM: + kvm_write_avx_reg(op->addr.xmm, &op->vec_val2); break; case OP_MM: kvm_write_mmx_reg(op->addr.mm, &op->mm_val); @@ -4861,9 +4875,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int ctxt->op_bytes = 8; /* REX.W */ /* Opcode byte(s). */ - opcode = opcode_table[ctxt->b]; - /* Two-byte opcode? */ if (ctxt->b == 0x0f) { + /* Two- or three-byte opcode */ ctxt->opcode_len = 2; ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; @@ -4874,6 +4887,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int ctxt->b = insn_fetch(u8, ctxt); opcode = opcode_map_0f_38[ctxt->b]; } + } else { + /* Opcode byte(s). 
*/ + opcode = opcode_table[ctxt->b]; } ctxt->d = opcode.flags; @@ -5022,7 +5038,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int ctxt->op_bytes = 4; if (ctxt->d & Sse) - ctxt->op_bytes = 16; + ctxt->op_bytes = 16, ctxt->d &= ~Avx; else if (ctxt->d & Mmx) ctxt->op_bytes = 8; } @@ -5154,20 +5170,34 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts) } if (unlikely(ctxt->d & - (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { + (No64|Undefined|Avx|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || (ctxt->d & Undefined)) { rc = emulate_ud(ctxt); goto done; } - if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) - || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + if ((ctxt->d & (Avx|Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) { rc = emulate_ud(ctxt); goto done; } - if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + if (ctxt->d & Avx) { + u64 xcr = 0; + if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE) + || ops->get_xcr(ctxt, 0, &xcr) + || !(xcr & XFEATURE_MASK_YMM)) { + rc = emulate_ud(ctxt); + goto done; + } + } else if (ctxt->d & Sse) { + if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) { + rc = emulate_ud(ctxt); + goto done; + } + } + + if ((ctxt->d & (Avx|Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h index 3ba12888bf66..f898781b6a06 100644 --- a/arch/x86/kvm/fpu.h +++ b/arch/x86/kvm/fpu.h @@ -15,6 +15,58 @@ typedef u32 __attribute__((vector_size(16))) sse128_t; #define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; }) #define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; }) +typedef u32 __attribute__((vector_size(32))) avx256_t; + +static inline void _kvm_read_avx_reg(int reg, avx256_t *data) +{ + switch (reg) { + case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break; + case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break; + case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break; + case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break; + case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break; + case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break; + case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break; + case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break; + case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break; + case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break; + case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break; + case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break; + case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break; + case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break; + case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break; +#endif + default: BUG(); + } +} + +static inline void _kvm_write_avx_reg(int reg, const avx256_t *data) +{ + switch (reg) { + case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break; + case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break; + case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break; + case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break; + case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break; + case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break; + case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break; + case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break; +#ifdef CONFIG_X86_64 + 
case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break; + case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break; + case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break; + case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break; + case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break; + case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break; + case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break; + case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break; +#endif + default: BUG(); + } +} + static inline void _kvm_read_sse_reg(int reg, sse128_t *data) { switch (reg) { @@ -109,6 +161,20 @@ static inline void kvm_fpu_put(void) fpregs_unlock(); } +static inline void kvm_read_avx_reg(int reg, avx256_t *data) +{ + kvm_fpu_get(); + _kvm_read_avx_reg(reg, data); + kvm_fpu_put(); +} + +static inline void kvm_write_avx_reg(int reg, const avx256_t *data) +{ + kvm_fpu_get(); + _kvm_write_avx_reg(reg, data); + kvm_fpu_put(); +} + static inline void kvm_read_sse_reg(int reg, sse128_t *data) { kvm_fpu_get(); diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 5f9d69c64cd5..c526f46f5595 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -249,7 +249,7 @@ struct x86_emulate_ops { /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type; unsigned int bytes; unsigned int count; union { @@ -268,11 +268,12 @@ struct operand { union { unsigned long val; u64 val64; - char valptr[sizeof(sse128_t)]; + char valptr[sizeof(avx256_t)]; sse128_t vec_val; + avx256_t vec_val2; u64 mm_val; void *data; - }; + } __aligned(32); }; #define X86_MAX_INSTRUCTION_LENGTH 15 From 825f0aece084ecde02805083d44d08ab189a9249 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 13 Nov 2025 19:36:31 -0500 Subject: [PATCH 149/260] KVM: x86: Refactor REX prefix handling in instruction emulation Restructure how to represent and interpret REX fields, preparing for handling of both REX2 and VEX. REX uses the upper four bits of a single byte as a fixed identifier, and the lower four bits containing the data. VEX and REX2 extends this so that the first byte identifies the prefix and the rest encode additional bits; and while VEX only has the same four data bits as REX, eight zero bits are a valid value for the data bits of REX2. So, stop storing the REX byte as-is. Instead, store only the low bits of the REX prefix and track separately whether a REX-like prefix was used. No functional changes intended. Signed-off-by: Chang S. Bae Message-ID: <20251110180131.28264-11-chang.seok.bae@intel.com> [Extracted from APX series; removed bitfields and REX2-specific default. 
- Paolo] Signed-off-by: Paolo Bonzini Link: https://patch.msgid.link/20251114003633.60689-9-pbonzini@redhat.com [sean: name REX_{BXRW} enum "rex_bits"] Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 33 +++++++++++++++++++++------------ arch/x86/kvm/kvm_emulate.h | 11 ++++++++++- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 531dac5cf3dc..b636bca92ca4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -239,6 +239,13 @@ enum x86_transfer_type { X86_TRANSFER_TASK_SWITCH, }; +enum rex_bits { + REX_B = 1, + REX_X = 2, + REX_R = 4, + REX_W = 8, +}; + static void writeback_registers(struct x86_emulate_ctxt *ctxt) { unsigned long dirty = ctxt->regs_dirty; @@ -919,7 +926,7 @@ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, int byteop) { void *p; - int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; + int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; @@ -1110,7 +1117,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, if (ctxt->d & ModRM) reg = ctxt->modrm_reg; else - reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); + reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0); __decode_register_operand(ctxt, op, reg); } @@ -1129,9 +1136,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ - index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ - base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ + ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0); + index_reg = (ctxt->rex_bits & REX_X ? 8 : 0); + base_reg = (ctxt->rex_bits & REX_B ? 8 : 0); ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; @@ -2464,7 +2471,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(&cs, &ss); - if ((ctxt->rex_prefix & 0x8) != 0x0) + if (ctxt->rex_bits & REX_W) usermode = X86EMUL_MODE_PROT64; else usermode = X86EMUL_MODE_PROT32; @@ -4850,7 +4857,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) goto done_prefixes; - ctxt->rex_prefix = ctxt->b; + ctxt->rex_prefix = REX_PREFIX; + ctxt->rex_bits = ctxt->b & 0xf; continue; case 0xf0: /* LOCK */ ctxt->lock_prefix = 1; @@ -4864,15 +4872,15 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int } /* Any legacy prefix after a REX prefix nullifies its effect. */ - - ctxt->rex_prefix = 0; + ctxt->rex_prefix = REX_NONE; + ctxt->rex_bits = 0; } done_prefixes: /* REX prefix. */ - if (ctxt->rex_prefix & 8) - ctxt->op_bytes = 8; /* REX.W */ + if (ctxt->rex_bits & REX_W) + ctxt->op_bytes = 8; /* Opcode byte(s). */ if (ctxt->b == 0x0f) { @@ -5138,7 +5146,8 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt) { /* Clear fields that are set conditionally but read without a guard. 
*/ ctxt->rip_relative = false; - ctxt->rex_prefix = 0; + ctxt->rex_prefix = REX_NONE; + ctxt->rex_bits = 0; ctxt->lock_prefix = 0; ctxt->op_prefix = false; ctxt->rep_prefix = 0; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index c526f46f5595..fb3dab4b5a53 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -319,6 +319,14 @@ typedef void (*fastop_t)(struct fastop *); #define NR_EMULATOR_GPRS 8 #endif +/* + * Distinguish between no prefix, REX, or in the future REX2. + */ +enum rex_type { + REX_NONE, + REX_PREFIX, +}; + struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; @@ -360,7 +368,8 @@ struct x86_emulate_ctxt { int (*check_perm)(struct x86_emulate_ctxt *ctxt); bool rip_relative; - u8 rex_prefix; + enum rex_type rex_prefix; + u8 rex_bits; u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ From f0585a714a7531afaa23b3351a316f61ccaa7b00 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:32 -0500 Subject: [PATCH 150/260] KVM: x86: Add emulator support for decoding VEX prefixes After all the changes done in the previous patches, the only thing left to support AVX MOV instructions is to expand the VEX prefix into the appropriate REX, 66/F3/F2 and map prefixes. Three-operand instructions are not supported. The Avx bit in this case is not cleared, in fact it is used as the sign that the instruction does support VEX encoding. Until it is added to any instruction, however, the only functional change is to change some not-implemented instructions to #UD if they correspond to a VEX prefix with an invalid map. Co-developed-by: Keith Busch Signed-off-by: Keith Busch Signed-off-by: Paolo Bonzini Link: https://patch.msgid.link/20251114003633.60689-10-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 122 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 112 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b636bca92ca4..63b83a2e0e87 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3963,6 +3963,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +static const struct opcode ud = I(SrcNone, emulate_ud); + static const struct opcode group7_rm0[] = { N, I(SrcNone | Priv | EmulateOnUD, em_hypercall), @@ -4762,11 +4764,87 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, return rc; } +static int x86_decode_avx(struct x86_emulate_ctxt *ctxt, + u8 vex_1st, u8 vex_2nd, struct opcode *opcode) +{ + u8 vex_3rd, map, pp, l, v; + int rc = X86EMUL_CONTINUE; + + if (ctxt->rep_prefix || ctxt->op_prefix || ctxt->rex_prefix) + goto ud; + + if (vex_1st == 0xc5) { + /* Expand RVVVVlpp to VEX3 format */ + vex_3rd = vex_2nd & ~0x80; /* VVVVlpp from VEX2, w=0 */ + vex_2nd = (vex_2nd & 0x80) | 0x61; /* R from VEX2, X=1 B=1 mmmmm=00001 */ + } else { + vex_3rd = insn_fetch(u8, ctxt); + } + + /* vex_2nd = RXBmmmmm, vex_3rd = wVVVVlpp. 
Fix polarity */ + vex_2nd ^= 0xE0; /* binary 11100000 */ + vex_3rd ^= 0x78; /* binary 01111000 */ + + ctxt->rex_prefix = REX_PREFIX; + ctxt->rex_bits = (vex_2nd & 0xE0) >> 5; /* RXB */ + ctxt->rex_bits |= (vex_3rd & 0x80) >> 4; /* w */ + if (ctxt->rex_bits && ctxt->mode != X86EMUL_MODE_PROT64) + goto ud; + + map = vex_2nd & 0x1f; + v = (vex_3rd >> 3) & 0xf; + l = vex_3rd & 0x4; + pp = vex_3rd & 0x3; + + ctxt->b = insn_fetch(u8, ctxt); + switch (map) { + case 1: + ctxt->opcode_len = 2; + *opcode = twobyte_table[ctxt->b]; + break; + case 2: + ctxt->opcode_len = 3; + *opcode = opcode_map_0f_38[ctxt->b]; + break; + case 3: + /* no 0f 3a instructions are supported yet */ + return X86EMUL_UNHANDLEABLE; + default: + goto ud; + } + + /* + * No three operand instructions are supported yet; those that + * *are* marked with the Avx flag reserve the VVVV flag. + */ + if (v) + goto ud; + + if (l) + ctxt->op_bytes = 32; + else + ctxt->op_bytes = 16; + + switch (pp) { + case 0: break; + case 1: ctxt->op_prefix = true; break; + case 2: ctxt->rep_prefix = 0xf3; break; + case 3: ctxt->rep_prefix = 0xf2; break; + } + +done: + return rc; +ud: + *opcode = ud; + return rc; +} + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) { int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; + bool vex_prefix = false; bool has_seg_override = false; struct opcode opcode; u16 dummy; @@ -4883,7 +4961,21 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int ctxt->op_bytes = 8; /* Opcode byte(s). */ - if (ctxt->b == 0x0f) { + if (ctxt->b == 0xc4 || ctxt->b == 0xc5) { + /* VEX or LDS/LES */ + u8 vex_2nd = insn_fetch(u8, ctxt); + if (mode != X86EMUL_MODE_PROT64 && (vex_2nd & 0xc0) != 0xc0) { + opcode = opcode_table[ctxt->b]; + ctxt->modrm = vex_2nd; + /* the Mod/RM byte has been fetched already! */ + goto done_modrm; + } + + vex_prefix = true; + rc = x86_decode_avx(ctxt, ctxt->b, vex_2nd, &opcode); + if (rc != X86EMUL_CONTINUE) + goto done; + } else if (ctxt->b == 0x0f) { /* Two- or three-byte opcode */ ctxt->opcode_len = 2; ctxt->b = insn_fetch(u8, ctxt); @@ -4899,17 +4991,12 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int /* Opcode byte(s). */ opcode = opcode_table[ctxt->b]; } - ctxt->d = opcode.flags; - if (ctxt->d & ModRM) + if (opcode.flags & ModRM) ctxt->modrm = insn_fetch(u8, ctxt); - /* vex-prefix instructions are not implemented */ - if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && - (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) { - ctxt->d = NotImpl; - } - +done_modrm: + ctxt->d = opcode.flags; while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: @@ -4975,6 +5062,19 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int if (ctxt->d == 0) return EMULATION_FAILED; + if (unlikely(vex_prefix)) { + /* + * Only specifically marked instructions support VEX. Since many + * instructions support it but are not annotated, return not implemented + * rather than #UD. 
+ */ + if (!(ctxt->d & Avx)) + return EMULATION_FAILED; + + if (!(ctxt->d & AlignMask)) + ctxt->d |= Unaligned; + } + ctxt->execute = opcode.u.execute; /* @@ -5045,7 +5145,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int if ((ctxt->d & No16) && ctxt->op_bytes == 2) ctxt->op_bytes = 4; - if (ctxt->d & Sse) + if (vex_prefix) + ; + else if (ctxt->d & Sse) ctxt->op_bytes = 16, ctxt->d &= ~Avx; else if (ctxt->d & Mmx) ctxt->op_bytes = 8; From ebec25438f3dbc79ea1ff274b1fe97d277b1545f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 13 Nov 2025 19:36:33 -0500 Subject: [PATCH 151/260] KVM: x86: Enable support for emulating AVX MOV instructions Some users of KVM have emulated devices (typically added to private forks of QEMU) that execute AVX instructions on PCI BARs. Whenever the guest OS tries to do that, an illegal instruction exception or emulation failure is triggered. Add the Avx flag to move instructions: - (66) 0f 10 - MOVUPS/MOVUPD from memory - (66) 0f 11 - MOVUPS/MOVUPD to memory - 66 0f 6f - MOVDQA from memory - 66 0f 7f - MOVDQA to memory - f3 0f 6f - MOVDQU from memory - f3 0f 7f - MOVDQU to memory - (66) 0f 28 - MOVAPS/MOVAPD from memory - (66) 0f 29 - MOVAPS/MOVAPD to memory - (66) 0f 2b - MOVNTPS/MOVNTPD to memory - 66 0f e7 - MOVNTDQ to memory - 66 0f 38 2a - MOVNTDQA to memory Co-developed-by: Keith Busch Signed-off-by: Keith Busch Link: https://lore.kernel.org/kvm/BD108C42-0382-4B17-B601-434A4BD038E7@fb.com/T/ Signed-off-by: Paolo Bonzini Link: https://patch.msgid.link/20251114003633.60689-11-pbonzini@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 63b83a2e0e87..c8e292e9a24d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4121,7 +4121,7 @@ static const struct group_dual group15 = { { } }; static const struct gprefix pfx_0f_6f_0f_7f = { - I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), + I(Mmx, em_mov), I(Sse | Avx | Aligned, em_mov), N, I(Sse | Avx | Unaligned, em_mov), }; static const struct instr_dual instr_dual_0f_2b = { @@ -4141,7 +4141,7 @@ static const struct gprefix pfx_0f_28_0f_29 = { }; static const struct gprefix pfx_0f_e7_0f_38_2a = { - N, I(Sse, em_mov), N, N, + N, I(Sse | Avx, em_mov), N, N, }; static const struct escape escape_d9 = { { @@ -4354,8 +4354,8 @@ static const struct opcode twobyte_table[256] = { DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, /* 0x10 - 0x1F */ - GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11), - GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11), + GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_10_0f_11), + GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_10_0f_11), N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */ D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, @@ -4371,9 +4371,9 @@ static const struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, check_dr_write), N, N, N, N, - GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), - GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), - N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b), + GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_28_0f_29), + GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_28_0f_29), + N, 
GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_2b), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), From c09816f2afce0f89f176c4bc58dc57ec9f204998 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 20 Nov 2025 20:09:30 +0800 Subject: [PATCH 152/260] KVM: x86: Remove unused declaration kvm_mmu_may_ignore_guest_pat() Commit 3fee4837ef40 ("KVM: x86: remove shadow_memtype_mask") removed the functions but leave this declaration. Signed-off-by: Yue Haibing Link: https://patch.msgid.link/20251120120930.1448593-1-yuehaibing@huawei.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f63074048ec6..c272ef269b96 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -235,8 +235,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); - int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); From 0a9eb2afa185e84cd6c6b35aefb190e8f92c4c5c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 21 Oct 2025 07:47:16 +0000 Subject: [PATCH 153/260] KVM: selftests: Extend vmx_close_while_nested_test to cover SVM Add SVM L1 code to run the nested guest, and allow the test to run with SVM as well as VMX. Reviewed-by: Jim Mattson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251021074736.1324328-4-yosry.ahmed@linux.dev [sean: rename to "nested_close_kvm_test" to provide nested_* sorting] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- ..._nested_test.c => nested_close_kvm_test.c} | 42 +++++++++++++++---- 2 files changed, 34 insertions(+), 10 deletions(-) rename tools/testing/selftests/kvm/x86/{vmx_close_while_nested_test.c => nested_close_kvm_test.c} (64%) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..89ad8c82a7b2 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -88,6 +88,7 @@ TEST_GEN_PROGS_x86 += x86/kvm_pv_test TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/msrs_test +TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/platform_info_test @@ -111,7 +112,6 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test -TEST_GEN_PROGS_x86 += x86/vmx_close_while_nested_test TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test diff --git a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c similarity index 64% rename from tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c rename to tools/testing/selftests/kvm/x86/nested_close_kvm_test.c index dad988351493..f001cb836bfa 100644 --- a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_close_while_nested - * * Copyright (C) 2019, Red Hat, Inc. 
* * Verify that nothing bad happens if a KVM user exits with open @@ -12,6 +10,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include #include @@ -22,6 +21,8 @@ enum { PORT_L0_EXIT = 0x2000, }; +#define L2_GUEST_STACK_SIZE 64 + static void l2_guest_code(void) { /* Exit to L0 */ @@ -29,9 +30,8 @@ static void l2_guest_code(void) : : [port] "d" (PORT_L0_EXIT) : "rax"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_vmx_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); @@ -45,19 +45,43 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(0); } +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* Prepare the VMCB for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(0); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t guest_gva; struct kvm_vcpu *vcpu; struct kvm_vm *vm; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); for (;;) { volatile struct kvm_run *run = vcpu->run; From e6bcdd21223835a6a5691af224c7c5ff4934436a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 21 Oct 2025 07:47:17 +0000 Subject: [PATCH 154/260] KVM: selftests: Extend vmx_nested_tsc_scaling_test to cover SVM Add SVM L1 code to run the nested guest, and allow the test to run with SVM as well as VMX. 
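
For context on the magic shift in the SVM hunk below: MSR_AMD64_TSC_RATIO
holds the guest TSC multiplier as an 8.32 fixed-point value, integer part
in bits 39:32 and fraction in bits 31:0, so an integer scale factor N is
programmed as N << 32. A minimal sketch of the resulting arithmetic, with
an illustrative helper name (not the selftest's code):

	/*
	 * 8.32 fixed point: guest_tsc = (host_tsc * ratio) >> 32,
	 * computed over the full 128-bit product to avoid overflow.
	 */
	static inline uint64_t scale_tsc(uint64_t tsc, uint64_t ratio)
	{
		return (uint64_t)(((unsigned __int128)tsc * ratio) >> 32);
	}

With ratio = 2ull << 32, the guest observes its TSC ticking at twice the
host rate.
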
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251021074736.1324328-5-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- ...aling_test.c => nested_tsc_scaling_test.c} | 48 +++++++++++++++++-- 2 files changed, 44 insertions(+), 6 deletions(-) rename tools/testing/selftests/kvm/x86/{vmx_nested_tsc_scaling_test.c => nested_tsc_scaling_test.c} (83%) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 89ad8c82a7b2..9ffa0d5c7654 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -91,6 +91,7 @@ TEST_GEN_PROGS_x86 += x86/msrs_test TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test +TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test @@ -118,7 +119,6 @@ TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/vmx_tsc_adjust_test -TEST_GEN_PROGS_x86 += x86/vmx_nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c similarity index 83% rename from tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c rename to tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c index 1759fa5cb3f2..4260c9e4f489 100644 --- a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c @@ -13,6 +13,7 @@ #include "kvm_util.h" #include "vmx.h" +#include "svm_util.h" #include "kselftest.h" /* L2 is scaled up (from L1's perspective) by this factor */ @@ -79,7 +80,30 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC scaling for L2 */ + wrmsr(MSR_AMD64_TSC_RATIO, L2_SCALE_FACTOR << 32); + + /* launch L2 */ + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +static void l1_vmx_code(struct vmx_pages *vmx_pages) { unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uint32_t control; @@ -116,11 +140,19 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t guest_gva = 0; uint64_t tsc_start, tsc_end; uint64_t tsc_khz; @@ -129,7 +161,8 @@ int main(int argc, char *argv[]) uint64_t l1_tsc_freq = 0; uint64_t l2_tsc_freq = 0; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + 
kvm_cpu_has(X86_FEATURE_SVM)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); @@ -152,8 +185,13 @@ int main(int argc, char *argv[]) printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL); TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); From 4d256d00e44e02fae84851729d70df2bc2ebe6e9 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 21 Oct 2025 07:47:18 +0000 Subject: [PATCH 155/260] KVM: selftests: Move nested invalid CR3 check to its own test vmx_tsc_adjust_test currently verifies that a nested VMLAUNCH fails with an invalid CR3. This is irrelevant to TSC scaling, move it to a standalone test. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251021074736.1324328-6-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../kvm/x86/nested_invalid_cr3_test.c | 79 +++++++++++++++++++ .../selftests/kvm/x86/vmx_tsc_adjust_test.c | 10 --- 3 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 9ffa0d5c7654..5bfd242da1ca 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -91,6 +91,7 @@ TEST_GEN_PROGS_x86 += x86/msrs_test TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test +TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test diff --git a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c new file mode 100644 index 000000000000..e839bcf5f8ad --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google LLC. + * + * This test verifies that L1 fails to enter L2 with an invalid CR3, and + * succeeds otherwise. 
+ */ +#include "kvm_util.h" +#include "vmx.h" +#include "kselftest.h" + + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(void) +{ + vmcall(); +} + +static void l1_vmx_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uintptr_t save_cr3; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Try to run L2 with invalid CR3 and make sure it fails */ + save_cr3 = vmreadz(GUEST_CR3); + vmwrite(GUEST_CR3, -1ull); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == + (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); + + /* Now restore CR3 and make sure L2 runs successfully */ + vmwrite(GUEST_CR3, save_cr3); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t guest_gva = 0; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_vmx_code); + vcpu_alloc_vmx(vm, &guest_gva); + vcpu_args_set(vcpu, 1, guest_gva); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c index 2ceb5c78c442..2dcc0306a0d9 100644 --- a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c @@ -77,7 +77,6 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uint32_t control; - uintptr_t save_cr3; GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); @@ -94,15 +93,6 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); - /* Jump into L2. First, test failure to load guest CR3. */ - save_cr3 = vmreadz(GUEST_CR3); - vmwrite(GUEST_CR3, -1ull); - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == - (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); - check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - vmwrite(GUEST_CR3, save_cr3); - GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); From 91423b041d3ca9f07ebc0c7859a20ac7eac8c755 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 21 Oct 2025 07:47:19 +0000 Subject: [PATCH 156/260] KVM: selftests: Extend nested_invalid_cr3_test to cover SVM Add SVM L1 code to run the nested guest, and allow the test to run with SVM as well as VMX. 
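
One asymmetry worth noting between the two flavors: VMX reports the bad
CR3 as a failed VM-entry, with EXIT_REASON_FAILED_VMENTRY set in the exit
reason alongside EXIT_REASON_INVALID_STATE, whereas VMRUN aborts its
consistency checks with VMEXIT_INVALID, which shows up as exit_code ==
SVM_EXIT_ERR (-1) in the VMCB. Condensed from the two guest paths below:

	if (this_cpu_has(X86_FEATURE_VMX))
		GUEST_ASSERT(vmreadz(VM_EXIT_REASON) ==
			     (EXIT_REASON_FAILED_VMENTRY |
			      EXIT_REASON_INVALID_STATE));
	else
		GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_ERR);
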
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251021074736.1324328-7-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- .../kvm/x86/nested_invalid_cr3_test.c | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c index e839bcf5f8ad..a6b6da9cf7fe 100644 --- a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c +++ b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c @@ -7,6 +7,7 @@ */ #include "kvm_util.h" #include "vmx.h" +#include "svm_util.h" #include "kselftest.h" @@ -17,6 +18,28 @@ static void l2_guest_code(void) vmcall(); } +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uintptr_t save_cr3; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Try to run L2 with invalid CR3 and make sure it fails */ + save_cr3 = svm->vmcb->save.cr3; + svm->vmcb->save.cr3 = -1ull; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_ERR); + + /* Now restore CR3 and make sure L2 runs successfully */ + svm->vmcb->save.cr3 = save_cr3; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + GUEST_DONE(); +} + static void l1_vmx_code(struct vmx_pages *vmx_pages) { unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; @@ -43,16 +66,30 @@ static void l1_vmx_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; vm_vaddr_t guest_gva = 0; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); - vm = vm_create_with_one_vcpu(&vcpu, l1_vmx_code); - vcpu_alloc_vmx(vm, &guest_gva); vcpu_args_set(vcpu, 1, guest_gva); for (;;) { From 3c40777f0ed81e8b8f7047319ad195e407614b69 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 21 Oct 2025 07:47:20 +0000 Subject: [PATCH 157/260] KVM: selftests: Extend vmx_tsc_adjust_test to cover SVM Add SVM L1 code to run the nested guest, and allow the test to run with SVM as well as VMX. 
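
The final check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE) leans on the
architectural coupling between the two MSRs: a WRMSR to IA32_TSC that
moves the counter by some delta moves IA32_TSC_ADJUST by the same delta.
A sketch of that rule (an illustrative model, not the selftest's code):

	/* Effect of WRMSR(IA32_TSC, new_tsc), conceptually: */
	static void wrmsr_tsc_model(uint64_t new_tsc, uint64_t *tsc,
				    int64_t *tsc_adjust)
	{
		*tsc_adjust += (int64_t)(new_tsc - *tsc);
		*tsc = new_tsc;
	}

L1 and L2 each pull the TSC back by TSC_ADJUST_VALUE, and because L1 does
not intercept L2's write, both deltas land in L1's IA32_TSC_ADJUST, hence
the expected -2 * TSC_ADJUST_VALUE.
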
Reviewed-by: Jim Mattson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251021074736.1324328-8-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- ...adjust_test.c => nested_tsc_adjust_test.c} | 65 ++++++++++++------- 2 files changed, 43 insertions(+), 24 deletions(-) rename tools/testing/selftests/kvm/x86/{vmx_tsc_adjust_test.c => nested_tsc_adjust_test.c} (61%) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 5bfd242da1ca..3127983c1285 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -92,6 +92,7 @@ TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test @@ -119,7 +120,6 @@ TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test -TEST_GEN_PROGS_x86 += x86/vmx_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test diff --git a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c similarity index 61% rename from tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c rename to tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c index 2dcc0306a0d9..2839f650e5c9 100644 --- a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_tsc_adjust_test - * * Copyright (C) 2018, Google LLC. * * IA32_TSC_ADJUST test @@ -22,6 +20,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include #include @@ -35,6 +34,8 @@ #define TSC_ADJUST_VALUE (1ll << 32) #define TSC_OFFSET_VALUE -(1ll << 48) +#define L2_GUEST_STACK_SIZE 64 + enum { PORT_ABORT = 0x1000, PORT_REPORT, @@ -72,32 +73,47 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_guest_code(void *data) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - uint32_t control; + /* Set TSC from L1 and make sure TSC_ADJUST is updated correctly */ GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); + /* + * Run L2 with TSC_OFFSET. L2 will write to TSC, and L1 is not + * intercepting the write so it should update L1's TSC_ADJUST. + */ + if (this_cpu_has(X86_FEATURE_VMX)) { + struct vmx_pages *vmx_pages = data; + uint32_t control; - /* Prepare the VMCS for L2 execution. 
*/ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; - vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); - vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + } else { + struct svm_test_data *svm = data; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + svm->vmcb->control.tsc_offset = TSC_OFFSET_VALUE; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + } check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); - GUEST_DONE(); } @@ -109,16 +125,19 @@ static void report(int64_t val) int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t nested_gva; struct kvm_vcpu *vcpu; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); - vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); - /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (;;) { struct ucall uc; From 28b2dced8ba4604b2cdd97c11d7fbd0fa99f9835 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 21 Oct 2025 07:47:21 +0000 Subject: [PATCH 158/260] KVM: selftests: Stop hardcoding PAGE_SIZE in x86 selftests Use PAGE_SIZE instead of 4096. 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251021074736.1324328-9-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86/hyperv_features.c | 2 +- tools/testing/selftests/kvm/x86/hyperv_ipi.c | 18 +++++++++--------- .../testing/selftests/kvm/x86/sev_smoke_test.c | 2 +- tools/testing/selftests/kvm/x86/state_test.c | 2 +- .../selftests/kvm/x86/userspace_io_test.c | 2 +- .../selftests/kvm/x86/vmx_dirty_log_test.c | 10 +++++----- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_features.c b/tools/testing/selftests/kvm/x86/hyperv_features.c index 99d327084172..130b9ce7e5dd 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86/hyperv_features.c @@ -94,7 +94,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { input = pgs_gpa; - output = pgs_gpa + 4096; + output = pgs_gpa + PAGE_SIZE; } else { input = output = 0; } diff --git a/tools/testing/selftests/kvm/x86/hyperv_ipi.c b/tools/testing/selftests/kvm/x86/hyperv_ipi.c index 2b5b4bc6ef7e..ca61836c4e32 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86/hyperv_ipi.c @@ -102,7 +102,7 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ ipi->vector = IPI_VECTOR; ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1; - hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096); + hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); @@ -116,13 +116,13 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 0; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); @@ -138,13 +138,13 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 1; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64); hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); @@ -160,14 +160,14 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 
<< 1 | 1; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64); hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); @@ -183,10 +183,10 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_ALL; - hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096); + hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index 77256c89bb8d..86ad1c7d068f 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -104,7 +104,7 @@ static void test_sync_vmsa(uint32_t type, uint64_t policy) vm_sev_launch(vm, policy, NULL); /* This page is shared, so make it decrypted. */ - memset(hva, 0, 4096); + memset(hva, 0, PAGE_SIZE); vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index 141b7fc0c965..f2c7a1c297e3 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -141,7 +141,7 @@ static void __attribute__((__flatten__)) guest_code(void *arg) if (this_cpu_has(X86_FEATURE_XSAVE)) { uint64_t supported_xcr0 = this_cpu_supported_xcr0(); - uint8_t buffer[4096]; + uint8_t buffer[PAGE_SIZE]; memset(buffer, 0xcc, sizeof(buffer)); diff --git a/tools/testing/selftests/kvm/x86/userspace_io_test.c b/tools/testing/selftests/kvm/x86/userspace_io_test.c index 9481cbcf284f..be7d72f3c029 100644 --- a/tools/testing/selftests/kvm/x86/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86/userspace_io_test.c @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) regs.rcx = 1; if (regs.rcx == 3) regs.rcx = 8192; - memset((void *)run + run->io.data_offset, 0xaa, 4096); + memset((void *)run + run->io.data_offset, 0xaa, PAGE_SIZE); vcpu_regs_set(vcpu, ®s); } diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index fa512d033205..34a57fe747f6 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -122,15 +122,15 @@ static void test_vmx_dirty_log(bool enable_ept) if (enable_ept) { prepare_eptp(vmx, vm, 0); nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -153,9 +153,9 @@ static void 
test_vmx_dirty_log(bool enable_ept) } TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); break; case UCALL_DONE: done = true; From ff736dba478c2bcf8c8c8328ff8936b1e6d65e81 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 21 Oct 2025 07:47:22 +0000 Subject: [PATCH 159/260] KVM: selftests: Remove the unused argument to prepare_eptp() eptp_memslot is unused, remove it. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251021074736.1324328-10-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 3 +-- tools/testing/selftests/kvm/lib/x86/memstress.c | 2 +- tools/testing/selftests/kvm/lib/x86/vmx.c | 3 +-- tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index edb3c391b982..96e2b4c630a9 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -568,8 +568,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot); +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 7f5d62a65c68..0b1f288ad556 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -63,7 +63,7 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) { uint64_t start, end; - prepare_eptp(vmx, vm, 0); + prepare_eptp(vmx, vm); /* * Identity map the first 4G and the test region with 1G pages so that diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index d4d1208dd023..9906456af11f 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -534,8 +534,7 @@ bool kvm_cpu_has_ept(void) return ctrl & SECONDARY_EXEC_ENABLE_EPT; } -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot) +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm) { TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 34a57fe747f6..98cb6bdab3e6 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -120,7 +120,7 @@ static void test_vmx_dirty_log(bool enable_ept) * GPAs as the EPT enabled case. 
*/ if (enable_ept) { - prepare_eptp(vmx, vm, 0); + prepare_eptp(vmx, vm); nested_map_memslot(vmx, vm, 0); nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); From ae5b498b8da96749cdad7b5a013053d17d815a6e Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 28 Oct 2025 15:30:39 -0700 Subject: [PATCH 160/260] KVM: selftests: Use a loop to create guest page tables Walk the guest page tables via a loop when creating new mappings, instead of using unique variables for each level of the page tables. This simplifies the code and makes it easier to support 5-level paging in the future. Signed-off-by: Jim Mattson Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251028225827.2269128-2-jmattson@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/lib/x86/processor.c | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index b418502c5ecc..738f2a44083f 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -218,8 +218,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pml4e, *pdpe, *pde; - uint64_t *pte; + uint64_t *pte = &vm->pgd; + int current_level; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); @@ -243,20 +243,17 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); - if (*pml4e & PTE_LARGE_MASK) - return; - - pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level); - if (*pdpe & PTE_LARGE_MASK) - return; - - pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level); - if (*pde & PTE_LARGE_MASK) - return; + for (current_level = vm->pgtable_levels; + current_level > PG_LEVEL_4K; + current_level--) { + pte = virt_create_upper_pte(vm, pte, vaddr, paddr, + current_level, level); + if (*pte & PTE_LARGE_MASK) + return; + } /* Fill in page table entry. */ - pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); From 2103a8baf5cb7e6169434e5f2cc6c311929f529a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 28 Oct 2025 15:30:40 -0700 Subject: [PATCH 161/260] KVM: selftests: Use a loop to walk guest page tables Walk the guest page tables via a loop when searching for a PTE, instead of using unique variables for each level of the page tables. This simplifies the code and makes it easier to support 5-level paging in the future. 
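
Both the mapping loop and the walking loop reduce every level to the same
index arithmetic, which the selftests already encode as PG_LEVEL_SHIFT():
with 4K pages, level N decodes bits [12 + 9*(N-1), 12 + 9*N - 1] of the
virtual address. A standalone sketch of the per-level index computation:

	/* 9 bits of index per level on top of the 12-bit page offset. */
	static inline int pte_index(uint64_t vaddr, int level)
	{
		return (vaddr >> (12 + 9 * (level - 1))) & 0x1ff;
	}

virt_get_pte() performs this computation on each iteration, so the loop
body is identical at every level.
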
Signed-off-by: Jim Mattson Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251028225827.2269128-3-jmattson@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/lib/x86/processor.c | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 738f2a44083f..720c678187b5 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -307,7 +307,8 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level) { - uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte = &vm->pgd; + int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); @@ -328,19 +329,15 @@ uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), "Canonical check failed. The virtual address is invalid."); - pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); - if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G)) - return pml4e; + for (current_level = vm->pgtable_levels; + current_level > PG_LEVEL_4K; + current_level--) { + pte = virt_get_pte(vm, pte, vaddr, current_level); + if (vm_is_target_pte(pte, level, current_level)) + return pte; + } - pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); - if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G)) - return pdpe; - - pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); - if (vm_is_target_pte(pde, level, PG_LEVEL_2M)) - return pde; - - return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); } uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) From ec5806639e39950527593e3be0efe7f0d7b65bf7 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 28 Oct 2025 15:30:41 -0700 Subject: [PATCH 162/260] KVM: selftests: Change VM_MODE_PXXV48_4K to VM_MODE_PXXVYY_4K Use 57-bit addresses with 5-level paging on hardware that supports LA57. Continue to use 48-bit addresses with 4-level paging on hardware that doesn't support LA57. 
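
The canonicality change is the subtle part: an address is canonical for a
given width iff sign-extending its low va_width bits reproduces the full
64-bit value, i.e. bits 63:47 must all agree for 48-bit VAs and bits
63:56 for 57-bit VAs. A sketch of the generalized predicate behind the
reworked TEST_ASSERT below:

	static inline bool is_canonical(uint64_t vaddr, int va_width)
	{
		int shift = 64 - va_width;

		return vaddr == (uint64_t)(((int64_t)vaddr << shift) >>
					   shift);
	}
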
Suggested-by: Sean Christopherson Signed-off-by: Jim Mattson Link: https://patch.msgid.link/20251028225827.2269128-4-jmattson@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/kvm_util.h | 4 +-- .../selftests/kvm/include/x86/processor.h | 2 +- .../selftests/kvm/lib/arm64/processor.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 30 ++++++++++--------- .../testing/selftests/kvm/lib/x86/processor.c | 30 +++++++++++-------- tools/testing/selftests/kvm/lib/x86/vmx.c | 6 ++-- 6 files changed, 40 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index d3f3e455c031..8a54a1279d44 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -177,7 +177,7 @@ enum vm_guest_mode { VM_MODE_P40V48_4K, VM_MODE_P40V48_16K, VM_MODE_P40V48_64K, - VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_PXXVYY_4K, /* For 48-bit or 57-bit VA, depending on host support */ VM_MODE_P47V64_4K, VM_MODE_P44V64_4K, VM_MODE_P36V48_4K, @@ -219,7 +219,7 @@ extern enum vm_guest_mode vm_mode_default; #elif defined(__x86_64__) -#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#define VM_MODE_DEFAULT VM_MODE_PXXVYY_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 8) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 51cd84b9ca66..57d62a425109 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1441,7 +1441,7 @@ enum pg_level { PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, - PG_LEVEL_NUM + PG_LEVEL_256T }; #define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12) diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 54f6d17c78f7..d46e4b13b92c 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -324,7 +324,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) /* Configure base granule size */ switch (vm->mode) { - case VM_MODE_PXXV48_4K: + case VM_MODE_PXXVYY_4K: TEST_FAIL("AArch64 does not support 4K sized pages " "with ANY-bit physical address ranges"); case VM_MODE_P52V48_64K: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1a93d6361671..364efd02ad4a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -201,7 +201,7 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", [VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages", [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", - [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_PXXVYY_4K] = "PA-bits:ANY, VA-bits:48 or 57, 4K pages", [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", [VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages", @@ -228,7 +228,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, [VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 }, [VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 }, - [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, + [VM_MODE_PXXVYY_4K] = { 0, 0, 0x1000, 12 }, [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, 
[VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, [VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 }, @@ -310,24 +310,26 @@ struct kvm_vm *____vm_create(struct vm_shape shape) case VM_MODE_P36V47_16K: vm->pgtable_levels = 3; break; - case VM_MODE_PXXV48_4K: + case VM_MODE_PXXVYY_4K: #ifdef __x86_64__ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); kvm_init_vm_address_properties(vm); - /* - * Ignore KVM support for 5-level paging (vm->va_bits == 57), - * it doesn't take effect unless a CR4.LA57 is set, which it - * isn't for this mode (48-bit virtual address space). - */ - TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, - "Linear address width (%d bits) not supported", - vm->va_bits); + pr_debug("Guest physical address width detected: %d\n", vm->pa_bits); - vm->pgtable_levels = 4; - vm->va_bits = 48; + pr_debug("Guest virtual address width detected: %d\n", + vm->va_bits); + + if (vm->va_bits == 57) { + vm->pgtable_levels = 5; + } else { + TEST_ASSERT(vm->va_bits == 48, + "Unexpected guest virtual address width: %d", + vm->va_bits); + vm->pgtable_levels = 4; + } #else - TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); + TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms"); #endif break; case VM_MODE_P47V64_4K: diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 720c678187b5..40bd69b265ef 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -158,10 +158,10 @@ bool kvm_is_tdp_enabled(void) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); - /* If needed, create page map l4 table. */ + /* If needed, create the top-level page table. */ if (!vm->pgd_created) { vm->pgd = vm_alloc_page_table(vm); vm->pgd_created = true; @@ -221,8 +221,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) uint64_t *pte = &vm->pgd; int current_level; - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, - "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT((vaddr % pg_size) == 0, "Virtual address not aligned,\n" @@ -307,27 +307,28 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level) { + int va_width = 12 + (vm->pgtable_levels) * 9; uint64_t *pte = &vm->pgd; int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), "Invalid virtual address, vaddr: 0x%lx", vaddr); /* - * Based on the mode check above there are 48 bits in the vaddr, so - * shift 16 to sign extend the last bit (bit-47), + * Check that the vaddr is a sign-extended va_width value. 
*/ - TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), - "Canonical check failed. The virtual address is invalid."); + TEST_ASSERT(vaddr == + (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), + "Canonical check failed. The virtual address is invalid."); for (current_level = vm->pgtable_levels; current_level > PG_LEVEL_4K; @@ -520,7 +521,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; - TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); /* Set mode specific system register values. */ vcpu_sregs_get(vcpu, &sregs); @@ -534,6 +536,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; if (kvm_cpu_has(X86_FEATURE_XSAVE)) sregs.cr4 |= X86_CR4_OSXSAVE; + if (vm->pgtable_levels == 5) + sregs.cr4 |= X86_CR4_LA57; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 9906456af11f..29b082a58daa 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -401,11 +401,11 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; uint16_t index; - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT((nested_paddr >> 48) == 0, - "Nested physical address 0x%lx requires 5-level paging", + "Nested physical address 0x%lx is > 48-bits and requires 5-level EPT", nested_paddr); TEST_ASSERT((nested_paddr % page_size) == 0, "Nested physical address not on page boundary,\n" From 6a8818de21d294c1de6be9a71afe184d08350875 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 28 Oct 2025 15:30:42 -0700 Subject: [PATCH 163/260] KVM: selftests: Add a VMX test for LA57 nested state Add a selftest that verifies KVM's ability to save and restore nested state when the L1 guest is using 5-level paging and the L2 guest is using 4-level paging. Specifically, canonicality tests of the VMCS12 host-state fields should accept 57-bit virtual addresses. 
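As an illustration (not part of this patch), a canonicality check is
just a sign-extension test from the VA width. The value below is the
LA57_GS_BASE constant the test uses; it passes the 57-bit check but
fails the 48-bit one, which is exactly the property the test relies on:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static bool is_canonical(uint64_t va, int va_bits)
  {
  	/* Sign-extend from bit (va_bits - 1) and compare. */
  	int shift = 64 - va_bits;

  	return va == (uint64_t)((int64_t)(va << shift) >> shift);
  }

  int main(void)
  {
  	uint64_t gs_base = 0xff2bc0311fb00000ull;	/* LA57_GS_BASE */

  	printf("48-bit canonical: %d\n", is_canonical(gs_base, 48)); /* 0 */
  	printf("57-bit canonical: %d\n", is_canonical(gs_base, 57)); /* 1 */
  	return 0;
  }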
Signed-off-by: Jim Mattson Link: https://patch.msgid.link/20251028225827.2269128-5-jmattson@google.com [sean: rename to vmx_nested_la57_state_test to prep nested_ namespace] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../kvm/x86/vmx_nested_la57_state_test.c | 132 ++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 3127983c1285..7ebf30a87a2b 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -119,6 +119,7 @@ TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state +TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c new file mode 100644 index 000000000000..cf1d2d1f2a8f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google LLC. + * + * Test KVM's ability to save and restore nested state when the L1 guest + * is using 5-level paging and the L2 guest is using 4-level paging. + * + * This test would have failed prior to commit 9245fd6b8531 ("KVM: x86: + * model canonical checks more precisely"). + */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define LA57_GS_BASE 0xff2bc0311fb00000ull + +static void l2_guest_code(void) +{ + /* + * Sync with L0 to trigger save/restore. After + * resuming, execute VMCALL to exit back to L1. + */ + GUEST_SYNC(1); + vmcall(); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 guest_cr4; + vm_paddr_t pml5_pa, pml4_pa; + u64 *pml5; + u64 exit_reason; + + /* Set GS_BASE to a value that is only canonical with LA57. */ + wrmsr(MSR_GS_BASE, LA57_GS_BASE); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) == LA57_GS_BASE); + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * Set up L2 with a 4-level page table by pointing its CR3 to + * L1's first PML4 table and clearing CR4.LA57. This creates + * the CR4.LA57 mismatch that exercises the bug. 
+ */ + pml5_pa = get_cr3() & PHYSICAL_PAGE_MASK; + pml5 = (u64 *)pml5_pa; + pml4_pa = pml5[0] & PHYSICAL_PAGE_MASK; + vmwrite(GUEST_CR3, pml4_pa); + + guest_cr4 = vmreadz(GUEST_CR4); + guest_cr4 &= ~X86_CR4_LA57; + vmwrite(GUEST_CR4, guest_cr4); + + GUEST_ASSERT(!vmlaunch()); + + exit_reason = vmreadz(VM_EXIT_REASON); + GUEST_ASSERT(exit_reason == EXIT_REASON_VMCALL); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + l1_guest_code(vmx_pages); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_LA57)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* + * L1 needs to read its own PML5 table to set up L2. Identity map + * the PML5 table to facilitate this. + */ + virt_map(vm, vm->pgd, vm->pgd, 1); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(uc.args[1] == stage, + "Expected stage %d, got stage %lu", stage, (ulong)uc.args[1]); + if (stage == 1) { + pr_info("L2 is active; performing save/restore.\n"); + state = vcpu_save_state(vcpu); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + } + } + +done: + kvm_vm_free(vm); + return 0; +} From 8e8678e740ecde2ae4a0404fd9b4ed2b726e236d Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 8 Jul 2025 12:57:57 +0000 Subject: [PATCH 164/260] KVM: s390: Add capability that forwards operation exceptions Setting KVM_CAP_S390_USER_OPEREXEC will forward all operation exceptions to user space. This also includes the 0x0000 instructions managed by KVM_CAP_S390_USER_INSTR0. It's helpful if user space wants to emulate instructions which do not (yet) have an opcode. While we're at it refine the documentation for KVM_CAP_S390_USER_INSTR0. Signed-off-by: Janosch Frank Reviewed-by: Claudio Imbrenda Acked-by: Christian Borntraeger Signed-off-by: Janosch Frank --- Documentation/virt/kvm/api.rst | 17 ++- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/intercept.c | 3 + arch/s390/kvm/kvm-s390.c | 7 + include/uapi/linux/kvm.h | 1 + tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/s390/user_operexec.c | 140 ++++++++++++++++++ 7 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/kvm/s390/user_operexec.c diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 72b2fae99a83..1bc2a84c59ee 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7820,7 +7820,7 @@ where 0xff represents CPUs 0-7 in cluster 0. :Architectures: s390 :Parameters: none -With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +With this capability enabled, the illegal instruction 0x0000 (2 bytes) will be intercepted and forwarded to user space. User space can use this mechanism e.g. to realize 2-byte software breakpoints. 
The kernel will not inject an operating exception for these instructions, user space has @@ -8703,6 +8703,21 @@ This capability indicate to the userspace whether a PFNMAP memory region can be safely mapped as cacheable. This relies on the presence of force write back (FWB) feature support on the hardware. +7.45 KVM_CAP_S390_USER_OPEREXEC +------------------------------- + +:Architectures: s390 +:Parameters: none + +When this capability is enabled KVM forwards all operation exceptions +that it doesn't handle itself to user space. This also includes the +0x0000 instructions managed by KVM_CAP_S390_USER_INSTR0. This is +helpful if user space wants to emulate instructions which are not +(yet) implemented in hardware. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. + 8. Other capabilities. ====================== diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 22cedcaea475..1e4829c70216 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -648,6 +648,7 @@ struct kvm_arch { int user_sigp; int user_stsi; int user_instr0; + int user_operexec; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index c7908950c1f4..420ae62977e2 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -471,6 +471,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->ipa == 0xb256) return handle_sthyi(vcpu); + if (vcpu->kvm->arch.user_operexec) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) return -EOPNOTSUPP; rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t)); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 70ebc54b1bb1..56d4730b7c41 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -606,6 +606,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_S390_USER_OPEREXEC: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -921,6 +922,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", r ? 
"(not available)" : "(success)"); break; + case KVM_CAP_S390_USER_OPEREXEC: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_OPEREXEC"); + kvm->arch.user_operexec = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52f6000ab020..8ab07396ce3b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -963,6 +963,7 @@ struct kvm_enable_cap { #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 #define KVM_CAP_GUEST_MEMFD_FLAGS 244 +#define KVM_CAP_S390_USER_OPEREXEC 245 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..87e429206bb8 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -194,6 +194,7 @@ TEST_GEN_PROGS_s390 += s390/debug_test TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test TEST_GEN_PROGS_s390 += s390/shared_zeropage_test TEST_GEN_PROGS_s390 += s390/ucontrol_test +TEST_GEN_PROGS_s390 += s390/user_operexec TEST_GEN_PROGS_s390 += rseq_test TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON) diff --git a/tools/testing/selftests/kvm/s390/user_operexec.c b/tools/testing/selftests/kvm/s390/user_operexec.c new file mode 100644 index 000000000000..714906c1d12a --- /dev/null +++ b/tools/testing/selftests/kvm/s390/user_operexec.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test operation exception forwarding. + * + * Copyright IBM Corp. 2025 + * + * Authors: + * Janosch Frank + */ +#include "kselftest.h" +#include "kvm_util.h" +#include "test_util.h" +#include "sie.h" + +#include + +static void guest_code_instr0(void) +{ + asm(".word 0x0000"); +} + +static void test_user_instr0(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0); + + kvm_vm_free(vm); +} + +static void guest_code_user_operexec(void) +{ + asm(".word 0x0807"); +} + +static void test_user_operexec(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); + + /* + * Since user_operexec is the superset it can be used for the + * 0 instruction. 
+ */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0); + + kvm_vm_free(vm); +} + +/* combine user_instr0 and user_operexec */ +static void test_user_operexec_combined(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); + + /* Reverse enablement order */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); +} + +/* + * Run all tests above. + * + * Enablement after VCPU has been added is automatically tested since + * we enable the capability after VCPU creation. + */ +static struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "instr0", test_user_instr0 }, + { "operexec", test_user_operexec }, + { "operexec_combined", test_user_operexec_combined}, +}; + +int main(int argc, char *argv[]) +{ + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_USER_INSTR0)); + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + ksft_finished(); +} From 44acac00be5dbda58f337acda41148d39743075c Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 7 Nov 2025 03:49:27 +0100 Subject: [PATCH 165/260] KVM: s390: vsie: Check alignment of BSCA header The VSIE code currently checks that the BSCA struct fits within a page, and returns a validity exception 0x003b if it doesn't. The BSCA is pinned in memory rather than shadowed (see block comment at end of kvm_s390_cpu_feat_init()), so enforcing the CPU entries to be on the same pinned page makes sense. Except those entries aren't going to be used below the guest, and according to the definition of that validity exception only the header of the BSCA (everything but the CPU entries) needs to be within a page. Adjust the alignment check to account for that. 
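As an illustration (not part of this patch), the validity check only
fires when the first and last byte of the checked region land on
different 4KiB pages. A minimal user-space sketch of the crossing test,
using a made-up 64-byte header size rather than the real bsca_block
layout:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define PAGE_MASK	(~0xfffull)

  static bool crosses_page(uint64_t gpa, uint64_t len)
  {
  	return (gpa & PAGE_MASK) != ((gpa + len - 1) & PAGE_MASK);
  }

  int main(void)
  {
  	printf("%d\n", crosses_page(0x1fc0, 64));	/* 0: fits in one page */
  	printf("%d\n", crosses_page(0x1fc1, 64));	/* 1: crosses a page */
  	return 0;
  }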
Signed-off-by: Eric Farman
Reviewed-by: Christian Borntraeger
Reviewed-by: Christoph Schlameuss
Signed-off-by: Janosch Frank
---
 arch/s390/kvm/vsie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 347268f89f2f..d23ab5120888 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -782,7 +782,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
 		rc = set_validity_icpt(scb_s, 0x0011U);
 	else if ((gpa & PAGE_MASK) !=
-		 ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+		 ((gpa + offsetof(struct bsca_block, cpu[0]) - 1) & PAGE_MASK))
 		rc = set_validity_icpt(scb_s, 0x003bU);
 	if (!rc) {
 		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);

From 1de4dc15baa1917adb3a0b369d00d818be9d9bab Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Tue, 21 Oct 2025 07:47:23 +0000
Subject: [PATCH 166/260] KVM: selftests: Stop using __virt_pg_map() directly in tests

Replace __virt_pg_map() calls in tests with high-level equivalent
functions, removing some loops in the process.

No functional change intended.

Signed-off-by: Yosry Ahmed
Link: https://patch.msgid.link/20251021074736.1324328-11-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/mmu_stress_test.c      | 6 ++----
 tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c
index c799e0d0694f..51c070556f3e 100644
--- a/tools/testing/selftests/kvm/mmu_stress_test.c
+++ b/tools/testing/selftests/kvm/mmu_stress_test.c
@@ -362,11 +362,9 @@ int main(int argc, char *argv[])
 
 #ifdef __x86_64__
 		/* Identity map memory in the guest using 1gb pages. */
-		for (i = 0; i < slot_size; i += SZ_1G)
-			__virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G);
+		virt_map_level(vm, gpa, gpa, slot_size, PG_LEVEL_1G);
 #else
-		for (i = 0; i < slot_size; i += vm->page_size)
-			virt_pg_map(vm, gpa + i, gpa + i);
+		virt_map(vm, gpa, gpa, slot_size >> vm->page_shift);
 #endif
 	}
 
diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c
index 077cd0ec3040..a3b7ce155981 100644
--- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c
+++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c
@@ -621,7 +621,7 @@ int main(int argc, char *argv[])
 	for (i = 0; i < NTEST_PAGES; i++) {
 		pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE);
 		gpa = addr_hva2gpa(vm, pte);
-		__virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K);
+		virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK);
 		data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK);
 	}
 

From d2e50389ab44acfa05e72604d701a70b234f9938 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Tue, 21 Oct 2025 07:47:24 +0000
Subject: [PATCH 167/260] KVM: selftests: Make sure vm->vpages_mapped is always up-to-date

Call paths leading to __virt_pg_map() are currently:

(a) virt_pg_map() -> virt_arch_pg_map() -> __virt_pg_map()
(b) virt_map_level() -> __virt_pg_map()

For (a), calls to virt_pg_map() from kvm_util.c make sure they update
vm->vpages_mapped, but other callers do not. Move the sparsebit_set()
call into virt_pg_map() to make sure all callers are captured.

For (b), call sparsebit_set_num() from virt_map_level().

It's tempting to have a single call inside __virt_pg_map(), however:

- The call path in (a) is not x86-specific, while (b) is.
  Moving the call into __virt_pg_map() would require doing something
  similar for other archs implementing virt_pg_map().

- Future changes will reuse __virt_pg_map() for nested PTEs, which
  should not update vm->vpages_mapped, i.e. a triple underscore version
  that does not update vm->vpages_mapped would need to be provided.

Signed-off-by: Yosry Ahmed
Link: https://patch.msgid.link/20251021074736.1324328-12-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/include/kvm_util.h  | 1 +
 tools/testing/selftests/kvm/lib/kvm_util.c      | 3 ---
 tools/testing/selftests/kvm/lib/x86/processor.c | 2 ++
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 8a54a1279d44..d701ee557f5b 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -1230,6 +1230,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr);
 static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 {
 	virt_arch_pg_map(vm, vaddr, paddr);
+	sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
 }
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 364efd02ad4a..33aebedfc050 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1458,8 +1458,6 @@ static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz,
 	     pages--, vaddr += vm->page_size, paddr += vm->page_size) {
 
 		virt_pg_map(vm, vaddr, paddr);
-
-		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
 	}
 
 	return vaddr_start;
@@ -1573,7 +1571,6 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 
 	while (npages--) {
 		virt_pg_map(vm, vaddr, paddr);
-		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
 
 		vaddr += page_size;
 		paddr += page_size;
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 40bd69b265ef..36104d27f3d9 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -286,6 +286,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 
 	for (i = 0; i < nr_pages; i++) {
 		__virt_pg_map(vm, vaddr, paddr, level);
+		sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift,
+				  pg_size / vm->page_size);
 
 		vaddr += pg_size;
 		paddr += pg_size;

From e2f3e2d37b065ada3bfdb3b22b1a960eb295c686 Mon Sep 17 00:00:00 2001
From: Anup Patel
Date: Fri, 17 Oct 2025 21:29:22 +0530
Subject: [PATCH 168/260] RISC-V: KVM: Convert kvm_riscv_vcpu_sbi_forward() into extension handler

All uses of kvm_riscv_vcpu_sbi_forward() also update retdata->uexit,
so to further reduce code duplication, move the retdata->uexit
assignment into kvm_riscv_vcpu_sbi_forward() and convert it into an
SBI extension handler.
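As an illustration (not part of this patch), the conversion follows a
common refactoring pattern: instead of a void helper that every caller
must follow with "retdata->uexit = true;", the helper adopts the
handler signature, sets uexit itself, and can then be installed
directly as an extension's .handler callback. A standalone sketch of
the shape, with invented struct names and the run-struct fill elided:

  #include <stdbool.h>
  #include <stdio.h>

  struct sbi_return { bool uexit; };

  /* Forwarding handler: flags the user-space exit and returns 0. */
  static int forward_handler(struct sbi_return *retdata)
  {
  	retdata->uexit = true;
  	return 0;
  }

  struct sbi_ext { int (*handler)(struct sbi_return *); };

  int main(void)
  {
  	struct sbi_ext vendor = { .handler = forward_handler };
  	struct sbi_return ret = { false };

  	vendor.handler(&ret);
  	printf("uexit=%d\n", ret.uexit);	/* 1 */
  	return 0;
  }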
Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20251017155925.361560-2-apatel@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 4 +++- arch/riscv/kvm/vcpu_sbi.c | 6 +++++- arch/riscv/kvm/vcpu_sbi_base.c | 20 +++----------------- arch/riscv/kvm/vcpu_sbi_replace.c | 27 +-------------------------- arch/riscv/kvm/vcpu_sbi_system.c | 4 +--- arch/riscv/kvm/vcpu_sbi_v01.c | 3 +-- 6 files changed, 14 insertions(+), 50 deletions(-) diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index 3497489e04db..446f4a8eb3cd 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -69,7 +69,9 @@ struct kvm_vcpu_sbi_extension { unsigned long reg_size, const void *reg_val); }; -void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_riscv_vcpu_sbi_forward_handler(struct kvm_vcpu *vcpu, + struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata); void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, struct kvm_run *run, u32 type, u64 flags); diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 1b13623380e1..fd4106c276d8 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -120,7 +120,9 @@ static bool riscv_vcpu_supports_sbi_ext(struct kvm_vcpu *vcpu, int idx) return sext && scontext->ext_status[sext->ext_idx] != KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE; } -void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_riscv_vcpu_sbi_forward_handler(struct kvm_vcpu *vcpu, + struct kvm_run *run, + struct kvm_vcpu_sbi_return *retdata) { struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -137,6 +139,8 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) run->riscv_sbi.args[5] = cp->a5; run->riscv_sbi.ret[0] = SBI_ERR_NOT_SUPPORTED; run->riscv_sbi.ret[1] = 0; + retdata->uexit = true; + return 0; } void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c index 5bc570b984f4..ca489f2dfbdf 100644 --- a/arch/riscv/kvm/vcpu_sbi_base.c +++ b/arch/riscv/kvm/vcpu_sbi_base.c @@ -41,8 +41,7 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, * For experimental/vendor extensions * forward it to the userspace */ - kvm_riscv_vcpu_sbi_forward(vcpu, run); - retdata->uexit = true; + return kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata); } else { sbi_ext = kvm_vcpu_sbi_find_ext(vcpu, cp->a0); *out_val = sbi_ext && sbi_ext->probe ? @@ -72,27 +71,14 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = { .handler = kvm_sbi_ext_base_handler, }; -static int kvm_sbi_ext_forward_handler(struct kvm_vcpu *vcpu, - struct kvm_run *run, - struct kvm_vcpu_sbi_return *retdata) -{ - /* - * Both SBI experimental and vendor extensions are - * unconditionally forwarded to userspace. 
- */ - kvm_riscv_vcpu_sbi_forward(vcpu, run); - retdata->uexit = true; - return 0; -} - const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental = { .extid_start = SBI_EXT_EXPERIMENTAL_START, .extid_end = SBI_EXT_EXPERIMENTAL_END, - .handler = kvm_sbi_ext_forward_handler, + .handler = kvm_riscv_vcpu_sbi_forward_handler, }; const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor = { .extid_start = SBI_EXT_VENDOR_START, .extid_end = SBI_EXT_VENDOR_END, - .handler = kvm_sbi_ext_forward_handler, + .handler = kvm_riscv_vcpu_sbi_forward_handler, }; diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index b490ed1428a6..2c456e26f6ca 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -186,34 +186,9 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst = { .handler = kvm_sbi_ext_srst_handler, }; -static int kvm_sbi_ext_dbcn_handler(struct kvm_vcpu *vcpu, - struct kvm_run *run, - struct kvm_vcpu_sbi_return *retdata) -{ - struct kvm_cpu_context *cp = &vcpu->arch.guest_context; - unsigned long funcid = cp->a6; - - switch (funcid) { - case SBI_EXT_DBCN_CONSOLE_WRITE: - case SBI_EXT_DBCN_CONSOLE_READ: - case SBI_EXT_DBCN_CONSOLE_WRITE_BYTE: - /* - * The SBI debug console functions are unconditionally - * forwarded to the userspace. - */ - kvm_riscv_vcpu_sbi_forward(vcpu, run); - retdata->uexit = true; - break; - default: - retdata->err_val = SBI_ERR_NOT_SUPPORTED; - } - - return 0; -} - const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = { .extid_start = SBI_EXT_DBCN, .extid_end = SBI_EXT_DBCN, .default_disabled = true, - .handler = kvm_sbi_ext_dbcn_handler, + .handler = kvm_riscv_vcpu_sbi_forward_handler, }; diff --git a/arch/riscv/kvm/vcpu_sbi_system.c b/arch/riscv/kvm/vcpu_sbi_system.c index 359be90b0fc5..c6f7e609ac79 100644 --- a/arch/riscv/kvm/vcpu_sbi_system.c +++ b/arch/riscv/kvm/vcpu_sbi_system.c @@ -47,9 +47,7 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, kvm_riscv_vcpu_sbi_request_reset(vcpu, cp->a1, cp->a2); /* userspace provides the suspend implementation */ - kvm_riscv_vcpu_sbi_forward(vcpu, run); - retdata->uexit = true; - break; + return kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata); default: retdata->err_val = SBI_ERR_NOT_SUPPORTED; break; diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index 368dfddd23d9..188d5ea5b3b8 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -32,8 +32,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, * The CONSOLE_GETCHAR/CONSOLE_PUTCHAR SBI calls cannot be * handled in kernel so we forward these to user-space */ - kvm_riscv_vcpu_sbi_forward(vcpu, run); - retdata->uexit = true; + ret = kvm_riscv_vcpu_sbi_forward_handler(vcpu, run, retdata); break; case SBI_EXT_0_1_SET_TIMER: #if __riscv_xlen == 32 From 12fd6c62e9f63af9498596d81aa11eb8396e6ae2 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 17 Oct 2025 21:29:23 +0530 Subject: [PATCH 169/260] RISC-V: KVM: Add separate source for forwarded SBI extensions Add a separate source vcpu_sbi_forward.c for SBI extensions which are entirely forwarded to KVM user-space. 
Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20251017155925.361560-3-apatel@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/vcpu_sbi_base.c | 12 ------------ arch/riscv/kvm/vcpu_sbi_forward.c | 27 +++++++++++++++++++++++++++ arch/riscv/kvm/vcpu_sbi_replace.c | 7 ------- 4 files changed, 28 insertions(+), 19 deletions(-) create mode 100644 arch/riscv/kvm/vcpu_sbi_forward.c diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 07197395750e..3b8afb038b35 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -27,6 +27,7 @@ kvm-y += vcpu_onereg.o kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o kvm-y += vcpu_sbi.o kvm-y += vcpu_sbi_base.o +kvm-y += vcpu_sbi_forward.o kvm-y += vcpu_sbi_fwft.o kvm-y += vcpu_sbi_hsm.o kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_sbi_pmu.o diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c index ca489f2dfbdf..06fdd5f69364 100644 --- a/arch/riscv/kvm/vcpu_sbi_base.c +++ b/arch/riscv/kvm/vcpu_sbi_base.c @@ -70,15 +70,3 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = { .extid_end = SBI_EXT_BASE, .handler = kvm_sbi_ext_base_handler, }; - -const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental = { - .extid_start = SBI_EXT_EXPERIMENTAL_START, - .extid_end = SBI_EXT_EXPERIMENTAL_END, - .handler = kvm_riscv_vcpu_sbi_forward_handler, -}; - -const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor = { - .extid_start = SBI_EXT_VENDOR_START, - .extid_end = SBI_EXT_VENDOR_END, - .handler = kvm_riscv_vcpu_sbi_forward_handler, -}; diff --git a/arch/riscv/kvm/vcpu_sbi_forward.c b/arch/riscv/kvm/vcpu_sbi_forward.c new file mode 100644 index 000000000000..dbfa70c2c775 --- /dev/null +++ b/arch/riscv/kvm/vcpu_sbi_forward.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Ventana Micro Systems Inc. + */ + +#include +#include +#include + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental = { + .extid_start = SBI_EXT_EXPERIMENTAL_START, + .extid_end = SBI_EXT_EXPERIMENTAL_END, + .handler = kvm_riscv_vcpu_sbi_forward_handler, +}; + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor = { + .extid_start = SBI_EXT_VENDOR_START, + .extid_end = SBI_EXT_VENDOR_END, + .handler = kvm_riscv_vcpu_sbi_forward_handler, +}; + +const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = { + .extid_start = SBI_EXT_DBCN, + .extid_end = SBI_EXT_DBCN, + .default_disabled = true, + .handler = kvm_riscv_vcpu_sbi_forward_handler, +}; diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 2c456e26f6ca..506a510b6bff 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -185,10 +185,3 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst = { .extid_end = SBI_EXT_SRST, .handler = kvm_sbi_ext_srst_handler, }; - -const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = { - .extid_start = SBI_EXT_DBCN, - .extid_end = SBI_EXT_DBCN, - .default_disabled = true, - .handler = kvm_riscv_vcpu_sbi_forward_handler, -}; From 7050f1d79f1cfaf0de577995df412855de23f752 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 17 Oct 2025 21:29:24 +0530 Subject: [PATCH 170/260] RISC-V: KVM: Add SBI MPXY extension support for Guest The SBI MPXY extension is a platform-level functionality so KVM only needs to forward SBI MPXY calls to KVM user-space. 
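As an illustration (not part of this patch), and because the extension
is registered with default_disabled set, user space has to opt in per
VCPU through the SBI-extension ONE_REG interface before running the
guest. A sketch for an rv64 host, with fd setup and error handling
elided and the exact reg-id composition assumed from the uapi headers:

  #include <linux/kvm.h>
  #include <asm/kvm.h>
  #include <stdint.h>
  #include <sys/ioctl.h>

  /* Enable the MPXY SBI extension on an existing vcpu fd. */
  static int enable_sbi_mpxy(int vcpu_fd)
  {
  	uint64_t val = 1;
  	struct kvm_one_reg reg = {
  		/* rv64: ulong == u64, hence KVM_REG_SIZE_U64 */
  		.id = KVM_REG_RISCV | KVM_REG_SIZE_U64 |
  		      KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE |
  		      KVM_RISCV_SBI_EXT_MPXY,
  		.addr = (uint64_t)(unsigned long)&val,
  	};

  	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
  }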
Signed-off-by: Anup Patel
Reviewed-by: Andrew Jones
Link: https://lore.kernel.org/r/20251017155925.361560-4-apatel@ventanamicro.com
Signed-off-by: Anup Patel
---
 arch/riscv/include/asm/kvm_vcpu_sbi.h | 1 +
 arch/riscv/include/uapi/asm/kvm.h     | 1 +
 arch/riscv/kvm/vcpu_sbi.c             | 4 ++++
 arch/riscv/kvm/vcpu_sbi_forward.c     | 7 +++++++
 4 files changed, 13 insertions(+)

diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h
index 446f4a8eb3cd..c1a7e3b40d9c 100644
--- a/arch/riscv/include/asm/kvm_vcpu_sbi.h
+++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h
@@ -107,6 +107,7 @@ extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn;
 extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_susp;
 extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_sta;
 extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_fwft;
+extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_mpxy;
 extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental;
 extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor;
 
diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index 759a4852c09a..37213d86c0d1 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -211,6 +211,7 @@ enum KVM_RISCV_SBI_EXT_ID {
 	KVM_RISCV_SBI_EXT_STA,
 	KVM_RISCV_SBI_EXT_SUSP,
 	KVM_RISCV_SBI_EXT_FWFT,
+	KVM_RISCV_SBI_EXT_MPXY,
 	KVM_RISCV_SBI_EXT_MAX,
 };
 
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index fd4106c276d8..46ab7b989432 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -82,6 +82,10 @@ static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = {
 		.ext_idx = KVM_RISCV_SBI_EXT_FWFT,
 		.ext_ptr = &vcpu_sbi_ext_fwft,
 	},
+	{
+		.ext_idx = KVM_RISCV_SBI_EXT_MPXY,
+		.ext_ptr = &vcpu_sbi_ext_mpxy,
+	},
 	{
 		.ext_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL,
 		.ext_ptr = &vcpu_sbi_ext_experimental,
diff --git a/arch/riscv/kvm/vcpu_sbi_forward.c b/arch/riscv/kvm/vcpu_sbi_forward.c
index dbfa70c2c775..5a3c75eb23c5 100644
--- a/arch/riscv/kvm/vcpu_sbi_forward.c
+++ b/arch/riscv/kvm/vcpu_sbi_forward.c
@@ -25,3 +25,10 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_dbcn = {
 	.default_disabled = true,
 	.handler = kvm_riscv_vcpu_sbi_forward_handler,
 };
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_mpxy = {
+	.extid_start = SBI_EXT_MPXY,
+	.extid_end = SBI_EXT_MPXY,
+	.default_disabled = true,
+	.handler = kvm_riscv_vcpu_sbi_forward_handler,
+};

From d1c5620781d590b07543f8d31a5c87abf046c126 Mon Sep 17 00:00:00 2001
From: Anup Patel
Date: Fri, 17 Oct 2025 21:29:25 +0530
Subject: [PATCH 171/260] KVM: riscv: selftests: Add SBI MPXY extension to get-reg-list

KVM RISC-V allows the SBI MPXY extension for a Guest/VM, so add it to
the get-reg-list test.
Signed-off-by: Anup Patel
Reviewed-by: Andrew Jones
Link: https://lore.kernel.org/r/20251017155925.361560-5-apatel@ventanamicro.com
Signed-off-by: Anup Patel
---
 tools/testing/selftests/kvm/riscv/get-reg-list.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index 705ab3d7778b..cb54a56990a0 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -133,6 +133,7 @@ bool filter_reg(__u64 reg)
 	case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_SUSP:
 	case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_STA:
 	case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_FWFT:
+	case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_MPXY:
 	case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_EXPERIMENTAL:
 	case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_VENDOR:
 		return true;
@@ -639,6 +640,7 @@ static const char *sbi_ext_single_id_to_str(__u64 reg_off)
 	KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_SUSP),
 	KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_STA),
 	KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_FWFT),
+	KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_MPXY),
 	KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_EXPERIMENTAL),
 	KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_VENDOR),
 	};
@@ -1142,6 +1144,7 @@ KVM_SBI_EXT_SUBLIST_CONFIG(sta, STA);
 KVM_SBI_EXT_SIMPLE_CONFIG(pmu, PMU);
 KVM_SBI_EXT_SIMPLE_CONFIG(dbcn, DBCN);
 KVM_SBI_EXT_SIMPLE_CONFIG(susp, SUSP);
+KVM_SBI_EXT_SIMPLE_CONFIG(mpxy, MPXY);
 KVM_SBI_EXT_SUBLIST_CONFIG(fwft, FWFT);
 
 KVM_ISA_EXT_SUBLIST_CONFIG(aia, AIA);
@@ -1222,6 +1225,7 @@ struct vcpu_reg_list *vcpu_configs[] = {
 	&config_sbi_pmu,
 	&config_sbi_dbcn,
 	&config_sbi_susp,
+	&config_sbi_mpxy,
 	&config_sbi_fwft,
 	&config_aia,
 	&config_fp_f,

From a2483d5d1ee9b399d8137691cb0d8dc99cfe7684 Mon Sep 17 00:00:00 2001
From: BillXiang
Date: Tue, 23 Sep 2025 13:38:51 +0800
Subject: [PATCH 172/260] RISC-V: KVM: Introduce KVM_EXIT_FAIL_ENTRY_NO_VSFILE

Currently, we return CSR_HSTATUS as hardware_entry_failure_reason when
kvm_riscv_aia_alloc_hgei() fails in KVM_DEV_RISCV_AIA_MODE_HWACCEL
mode. This is vague, so it is better to return a well-defined value,
KVM_EXIT_FAIL_ENTRY_NO_VSFILE, provided via uapi/asm/kvm.h.
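As an illustration (not part of this patch), a VMM can now key off the
named flag instead of a raw CSR number. A standalone sketch of the
user-space check; the constant mirrors the uapi addition in the diff
below, and the message wording is invented:

  #include <stdint.h>
  #include <stdio.h>

  #define KVM_EXIT_FAIL_ENTRY_NO_VSFILE	(1ULL << 0)

  static void report_fail_entry(uint64_t reason, uint32_t cpu)
  {
  	if (reason & KVM_EXIT_FAIL_ENTRY_NO_VSFILE)
  		fprintf(stderr, "vcpu %u: no IMSIC VS-file available\n", cpu);
  }

  int main(void)
  {
  	report_fail_entry(KVM_EXIT_FAIL_ENTRY_NO_VSFILE, 0);
  	return 0;
  }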
Signed-off-by: BillXiang
Reviewed-by: Anup Patel
Link: https://lore.kernel.org/r/20250923053851.32863-1-xiangwencheng@lanxincomputing.com
Signed-off-by: Anup Patel
---
 arch/riscv/include/uapi/asm/kvm.h | 2 ++
 arch/riscv/kvm/aia_imsic.c        | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index 37213d86c0d1..54f3ad7ed2e4 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -23,6 +23,8 @@
 #define KVM_INTERRUPT_SET	-1U
 #define KVM_INTERRUPT_UNSET	-2U
 
+#define KVM_EXIT_FAIL_ENTRY_NO_VSFILE	(1ULL << 0)
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
 };
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
index 11422cb95a64..e597e86491c3 100644
--- a/arch/riscv/kvm/aia_imsic.c
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -814,7 +814,7 @@ int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu)
 	/* For HW acceleration mode, we can't continue */
 	if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_HWACCEL) {
 		run->fail_entry.hardware_entry_failure_reason =
-			CSR_HSTATUS;
+			KVM_EXIT_FAIL_ENTRY_NO_VSFILE;
 		run->fail_entry.cpu = vcpu->cpu;
 		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		return 0;

From df60cb2e67029e07e23c4fdf9e027aaf1f63cc1a Mon Sep 17 00:00:00 2001
From: Dong Yang
Date: Mon, 3 Nov 2025 14:28:25 +0800
Subject: [PATCH 173/260] KVM: riscv: Support enabling dirty log gradually in
 small chunks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is already support for enabling dirty log gradually in small
chunks for x86 in commit 3c9bd4006bfc ("KVM: x86: enable dirty log
gradually in small chunks") and c862626 ("KVM: arm64: Support enabling
dirty log gradually in small chunks"). This adds support for riscv.

x86 and arm64 now write-protect both huge pages and normal pages, so
riscv also write-protects both huge pages and normal pages.

On a nested virtualization setup (RISC-V KVM running inside a QEMU VM
on an [Intel® Core™ i5-12500H] host), I did some tests with a 2G Linux
VM using different backing page sizes. The time taken for
memory_global_dirty_log_start in the L2 QEMU is listed below:

  Page Size    Before        After Optimization
  4K           4490.23ms     31.94ms
  2M           48.97ms       45.46ms
  1G           28.40ms       30.93ms

Signed-off-by: Quan Zhou
Signed-off-by: Dong Yang
Reviewed-by: Anup Patel
Link: https://lore.kernel.org/r/20251103062825.9084-1-dayss1224@gmail.com
Signed-off-by: Anup Patel
---
 Documentation/virt/kvm/api.rst    | 2 +-
 arch/riscv/include/asm/kvm_host.h | 3 +++
 arch/riscv/kvm/mmu.c              | 5 ++++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 57061fa29e6a..3b621c3ae67c 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8028,7 +8028,7 @@ will be initialized to 1 when created.  This also improves performance because
 dirty logging can be enabled gradually in small chunks on the first call
 to KVM_CLEAR_DIRTY_LOG.  KVM_DIRTY_LOG_INITIALLY_SET depends on
 KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on
-x86 and arm64 for now).
+x86, arm64 and riscv for now).
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 4d794573e3db..e30548a4ab60 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -59,6 +59,9 @@
 			     BIT(IRQ_VS_TIMER) | \
 			     BIT(IRQ_VS_EXT))
 
+#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+				   KVM_DIRTY_LOG_INITIALLY_SET)
+
 struct kvm_vm_stat {
 	struct kvm_vm_stat_generic generic;
 };
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 58f5f3536ffd..4ab06697bfc0 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -161,8 +161,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * allocated dirty_bitmap[], dirty pages will be tracked while
 	 * the memory slot is write protected.
 	 */
-	if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
+	if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+			return;
 		mmu_wp_memory_region(kvm, new->id);
+	}
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,

From 974555d6e417974e63444266e495a06d06c23af5 Mon Sep 17 00:00:00 2001
From: Fangyu Yu
Date: Fri, 21 Nov 2025 21:35:43 +0800
Subject: [PATCH 174/260] RISC-V: KVM: Fix guest page fault within HLV*
 instructions

When executing HLV* instructions at HS mode, a guest page fault may
occur if the g-stage page table migrates between triggering the
virtual instruction exception and executing the HLV* instruction.

This is a corner case, and the simplest way to handle it is to
re-execute the instruction where the virtual instruction exception
occurred; the guest page fault will then be handled automatically.

Fixes: b91f0e4cb8a3 ("RISC-V: KVM: Factor-out instruction emulation into separate sources")
Signed-off-by: Fangyu Yu
Reviewed-by: Anup Patel
Link: https://lore.kernel.org/r/20251121133543.46822-1-fangyu.yu@linux.alibaba.com
Signed-off-by: Anup Patel
---
 arch/riscv/kvm/vcpu_insn.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c
index de1f96ea6225..4d89b94128ae 100644
--- a/arch/riscv/kvm/vcpu_insn.c
+++ b/arch/riscv/kvm/vcpu_insn.c
@@ -298,6 +298,22 @@ static int system_opcode_insn(struct kvm_vcpu *vcpu, struct kvm_run *run,
 	return (rc <= 0) ? rc : 1;
 }
 
+static bool is_load_guest_page_fault(unsigned long scause)
+{
+	/*
+	 * If a g-stage page fault occurs, the direct approach
+	 * is to let the g-stage page fault handler handle it
+	 * naturally; however, calling the g-stage page fault
+	 * handler here would be rather strange.
+	 * Considering this is a corner case, we can directly
+	 * return to the guest and re-execute the same PC; this
+	 * will trigger a g-stage page fault again and then the
+	 * regular g-stage page fault handler will populate the
+	 * g-stage page table.
+ */ + return (scause == EXC_LOAD_GUEST_PAGE_FAULT); +} + /** * kvm_riscv_vcpu_virtual_insn -- Handle virtual instruction trap * @@ -323,6 +339,8 @@ int kvm_riscv_vcpu_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ct->sepc, &utrap); if (utrap.scause) { + if (is_load_guest_page_fault(utrap.scause)) + return 1; utrap.sepc = ct->sepc; kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); return 1; @@ -378,6 +396,8 @@ int kvm_riscv_vcpu_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run, insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, &utrap); if (utrap.scause) { + if (is_load_guest_page_fault(utrap.scause)) + return 1; /* Redirect trap if we failed to read instruction */ utrap.sepc = ct->sepc; kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); @@ -504,6 +524,8 @@ int kvm_riscv_vcpu_mmio_store(struct kvm_vcpu *vcpu, struct kvm_run *run, insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, &utrap); if (utrap.scause) { + if (is_load_guest_page_fault(utrap.scause)) + return 1; /* Redirect trap if we failed to read instruction */ utrap.sepc = ct->sepc; kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); From 3239c52fd21257c80579875e74c9956c2f9cd1f9 Mon Sep 17 00:00:00 2001 From: Hui Min Mina Chou Date: Mon, 17 Nov 2025 16:45:55 +0800 Subject: [PATCH 175/260] RISC-V: KVM: Flush VS-stage TLB after VCPU migration for Andes cores MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most implementations cache the combined result of two-stage translation, but some, like Andes cores, use split TLBs that store VS-stage and G-stage entries separately. On such systems, when a VCPU migrates to another CPU, an additional HFENCE.VVMA is required to avoid using stale VS-stage entries, which could otherwise cause guest faults. Introduce a static key to identify CPUs with split two-stage TLBs. When enabled, KVM issues an extra HFENCE.VVMA on VCPU migration to prevent stale VS-stage mappings. 
Signed-off-by: Hui Min Mina Chou Signed-off-by: Ben Zong-You Xie Reviewed-by: Radim Krčmář Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20251117084555.157642-1-minachou@andestech.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 3 +++ arch/riscv/include/asm/kvm_tlb.h | 1 + arch/riscv/include/asm/kvm_vmid.h | 1 - arch/riscv/kvm/main.c | 14 ++++++++++++++ arch/riscv/kvm/tlb.c | 30 ++++++++++++++++++++++++++++++ arch/riscv/kvm/vcpu.c | 2 +- arch/riscv/kvm/vmid.c | 23 ----------------------- 7 files changed, 49 insertions(+), 25 deletions(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index e30548a4ab60..24585304c02b 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -330,4 +330,7 @@ bool kvm_riscv_vcpu_stopped(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu); +/* Flags representing implementation specific details */ +DECLARE_STATIC_KEY_FALSE(kvm_riscv_vsstage_tlb_no_gpa); + #endif /* __RISCV_KVM_HOST_H__ */ diff --git a/arch/riscv/include/asm/kvm_tlb.h b/arch/riscv/include/asm/kvm_tlb.h index 38a2f933ad3a..a0e7099bcb85 100644 --- a/arch/riscv/include/asm/kvm_tlb.h +++ b/arch/riscv/include/asm/kvm_tlb.h @@ -49,6 +49,7 @@ void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, unsigned long gva, unsigned long gvsz, unsigned long order); void kvm_riscv_local_hfence_vvma_all(unsigned long vmid); +void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu); void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/include/asm/kvm_vmid.h b/arch/riscv/include/asm/kvm_vmid.h index ab98e1434fb7..db61b0525a8d 100644 --- a/arch/riscv/include/asm/kvm_vmid.h +++ b/arch/riscv/include/asm/kvm_vmid.h @@ -22,6 +22,5 @@ unsigned long kvm_riscv_gstage_vmid_bits(void); int kvm_riscv_gstage_vmid_init(struct kvm *kvm); bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu); -void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu); #endif diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 77dc1655b442..45536af521f0 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -15,6 +15,18 @@ #include #include +DEFINE_STATIC_KEY_FALSE(kvm_riscv_vsstage_tlb_no_gpa); + +static void kvm_riscv_setup_vendor_features(void) +{ + /* Andes AX66: split two-stage TLBs */ + if (riscv_cached_mvendorid(0) == ANDES_VENDOR_ID && + (riscv_cached_marchid(0) & 0xFFFF) == 0x8A66) { + static_branch_enable(&kvm_riscv_vsstage_tlb_no_gpa); + kvm_info("VS-stage TLB does not cache guest physical address and VMID\n"); + } +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -160,6 +172,8 @@ static int __init riscv_kvm_init(void) kvm_info("AIA available with %d guest external interrupts\n", kvm_riscv_aia_nr_hgei); + kvm_riscv_setup_vendor_features(); + kvm_register_perf_callbacks(NULL); rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 3c5a70a2b927..ff1aeac4eb8e 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -158,6 +158,36 @@ void kvm_riscv_local_hfence_vvma_all(unsigned long vmid) csr_write(CSR_HGATP, hgatp); } +void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu) +{ + unsigned long vmid; + + if (!kvm_riscv_gstage_vmid_bits() || + vcpu->arch.last_exit_cpu == vcpu->cpu) + return; + + /* + * On RISC-V platforms with hardware VMID support, we share same + * VMID for all 
VCPUs of a particular Guest/VM. This means we might
+	 * have stale G-stage TLB entries on the current Host CPU due to
+	 * some other VCPU of the same Guest which ran previously on the
+	 * current Host CPU.
+	 *
+	 * To cleanup stale TLB entries, we simply flush all G-stage TLB
+	 * entries by VMID whenever underlying Host CPU changes for a VCPU.
+	 */
+
+	vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
+	kvm_riscv_local_hfence_gvma_vmid_all(vmid);
+
+	/*
+	 * Flush VS-stage TLB entries for implementations where the VS-stage
+	 * TLB does not cache the guest physical address and VMID.
+	 */
+	if (static_branch_unlikely(&kvm_riscv_vsstage_tlb_no_gpa))
+		kvm_riscv_local_hfence_vvma_all(vmid);
+}
+
 void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu)
 {
 	kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD);
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 5ce35aba6069..9f07a3177a28 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -968,7 +968,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		 * Note: This should be done after G-stage VMID has been
 		 * updated using kvm_riscv_gstage_vmid_ver_changed()
 		 */
-		kvm_riscv_gstage_vmid_sanitize(vcpu);
+		kvm_riscv_local_tlb_sanitize(vcpu);
 
 		trace_kvm_entry(vcpu);
 
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index abb1c2bf2542..cf34d448289d 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -122,26 +122,3 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
 	kvm_for_each_vcpu(i, v, vcpu->kvm)
 		kvm_make_request(KVM_REQ_UPDATE_HGATP, v);
 }
-
-void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu)
-{
-	unsigned long vmid;
-
-	if (!kvm_riscv_gstage_vmid_bits() ||
-	    vcpu->arch.last_exit_cpu == vcpu->cpu)
-		return;
-
-	/*
-	 * On RISC-V platforms with hardware VMID support, we share same
-	 * VMID for all VCPUs of a particular Guest/VM. This means we might
-	 * have stale G-stage TLB entries on the current Host CPU due to
-	 * some other VCPU of the same Guest which ran previously on the
-	 * current Host CPU.
-	 *
-	 * To cleanup stale TLB entries, we simply flush all G-stage TLB
-	 * entries by VMID whenever underlying Host CPU changes for a VCPU.
-	 */
-
-	vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
-	kvm_riscv_local_hfence_gvma_vmid_all(vmid);
-}

From dc31124379b69a758af740bbd981e9e9f04a61d5 Mon Sep 17 00:00:00 2001
From: Oliver Upton
Date: Mon, 24 Nov 2025 11:01:43 -0800
Subject: [PATCH 176/260] arm64: Detect FEAT_XNX

Detect the feature in anticipation of using it in KVM.
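As an illustration (not part of this patch), FEAT_XNX is advertised via
ID_AA64MMFR1_EL1.XNX and turns the stage-2 XN field into a 2-bit
encoding (bits 54:53) with separate EL1/EL0 execute permission. The
decode used later in this series maps 0b00 to EL1+EL0, 0b01 to EL0
only, 0b10 to neither, and 0b11 to EL1 only. A standalone sketch of
that decode:

  #include <stdbool.h>
  #include <stdio.h>

  static void decode_xn(unsigned int xn, bool *el1_exec, bool *el0_exec)
  {
  	*el1_exec = xn == 0 || xn == 3;	/* 0b00 or 0b11 */
  	*el0_exec = xn == 0 || xn == 1;	/* 0b00 or 0b01 */
  }

  int main(void)
  {
  	bool px, ux;

  	for (unsigned int xn = 0; xn < 4; xn++) {
  		decode_xn(xn, &px, &ux);
  		printf("XN=0b%u%u: EL1 exec=%d EL0 exec=%d\n",
  		       xn >> 1, xn & 1, px, ux);
  	}
  	return 0;
  }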
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-2-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/cpufeature.c | 7 +++++++ arch/arm64/tools/cpucaps | 1 + 2 files changed, 8 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5ed401ff79e3..aa3ecae252d3 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3088,6 +3088,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_GICV5_LEGACY, .matches = test_has_gicv5_legacy, }, + { + .desc = "XNX", + .capability = ARM64_HAS_XNX, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, XNX, IMP) + }, {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 1b32c1232d28..ee74199107d3 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -64,6 +64,7 @@ HAS_TLB_RANGE HAS_VA52 HAS_VIRT_HOST_EXTN HAS_WFXT +HAS_XNX HAFT HW_DBM KVM_HVHE From 2608563b466b9192a9356b18463005da6e138bf9 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:44 -0800 Subject: [PATCH 177/260] KVM: arm64: Add support for FEAT_XNX stage-2 permissions FEAT_XNX adds support for encoding separate execute permissions for EL0 and EL1 at stage-2. Add support for this to the page table library, hiding the unintuitive encoding scheme behind generic pX and uX permission flags. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-3-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 15 ++++--- arch/arm64/kvm/hyp/pgtable.c | 58 ++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 2888b5d03757..c72149a607d6 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -89,7 +89,7 @@ typedef u64 kvm_pte_t; #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_HI_S2_XN GENMASK(54, 53) #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) @@ -251,12 +251,15 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_SW3: Software bit 3. 
*/ enum kvm_pgtable_prot { - KVM_PGTABLE_PROT_X = BIT(0), - KVM_PGTABLE_PROT_W = BIT(1), - KVM_PGTABLE_PROT_R = BIT(2), + KVM_PGTABLE_PROT_PX = BIT(0), + KVM_PGTABLE_PROT_UX = BIT(1), + KVM_PGTABLE_PROT_X = KVM_PGTABLE_PROT_PX | + KVM_PGTABLE_PROT_UX, + KVM_PGTABLE_PROT_W = BIT(2), + KVM_PGTABLE_PROT_R = BIT(3), - KVM_PGTABLE_PROT_DEVICE = BIT(3), - KVM_PGTABLE_PROT_NORMAL_NC = BIT(4), + KVM_PGTABLE_PROT_DEVICE = BIT(4), + KVM_PGTABLE_PROT_NORMAL_NC = BIT(5), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c351b4abd5db..e1d75f965027 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -661,11 +661,37 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) +static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr) +{ + bool px, ux; + u8 xn; + + px = prot & KVM_PGTABLE_PROT_PX; + ux = prot & KVM_PGTABLE_PROT_UX; + + if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux) + return -EINVAL; + + if (px && ux) + xn = 0b00; + else if (!px && ux) + xn = 0b01; + else if (!px && !ux) + xn = 0b10; + else + xn = 0b11; + + *attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN; + *attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn); + return 0; +} + static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + int r; switch (prot & (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC)) { @@ -685,8 +711,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p attr = KVM_S2_MEMATTR(pgt, NORMAL); } - if (!(prot & KVM_PGTABLE_PROT_X)) - attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + r = stage2_set_xn_attr(prot, &attr); + if (r) + return r; if (prot & KVM_PGTABLE_PROT_R) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; @@ -715,8 +742,19 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) prot |= KVM_PGTABLE_PROT_R; if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) prot |= KVM_PGTABLE_PROT_W; + + switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) { + case 0b00: + prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX; + break; + case 0b01: + prot |= KVM_PGTABLE_PROT_UX; + break; + case 0b11: + prot |= KVM_PGTABLE_PROT_PX; + break; + default: + break; + } return prot; } @@ -1290,9 +1328,9 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { - int ret; + kvm_pte_t xn = 0, set = 0, clr = 0; s8 level; - kvm_pte_t set = 0, clr = 0; + int ret; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) return -EINVAL; @@ -1303,8 +1341,12 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_W) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; - if (prot & KVM_PGTABLE_PROT_X) - clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + ret = stage2_set_xn_attr(prot, &xn); + if (ret) + return ret; + + set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; + clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) From d93febe2ed2e0491af9d47f0ee6d4b01918877f4 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:45 -0800 Subject: [PATCH 178/260] KVM: arm64: nv: Forward FEAT_XNX permissions to the shadow
stage-2 Add support for FEAT_XNX to shadow stage-2 MMUs, being careful to only evaluate XN[0] when the feature is actually exposed to the VM. Restructure the layering of permissions in the fault handler to assume pX and uX and then restrict based on the guest's stage-2 afterwards. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-4-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_nested.h | 37 +++++++++++++++++++++++++++-- arch/arm64/kvm/mmu.c | 23 ++++++++++++++---- arch/arm64/kvm/nested.c | 5 +++- 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index f7c06a840963..5d967b60414c 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -120,9 +120,42 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) return trans->writable; } -static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) +static inline bool kvm_has_xnx(struct kvm *kvm) { - return !(trans->desc & BIT(54)); + return cpus_have_final_cap(ARM64_HAS_XNX) && + kvm_has_feat(kvm, ID_AA64MMFR1_EL1, XNX, IMP); +} + +static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *trans) +{ + u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); + + if (!kvm_has_xnx(kvm)) + xn &= 0b10; + + switch (xn) { + case 0b00: + case 0b01: + return true; + default: + return false; + } +} + +static inline bool kvm_s2_trans_exec_el1(struct kvm *kvm, struct kvm_s2_trans *trans) +{ + u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); + + if (!kvm_has_xnx(kvm)) + xn &= 0b10; + + switch (xn) { + case 0b00: + case 0b11: + return true; + default: + return false; + } } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7cc964af8d30..96f1786c72fe 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1521,6 +1521,16 @@ static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, *prot |= kvm_encode_nested_level(nested); } +static void adjust_nested_exec_perms(struct kvm *kvm, + struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot) +{ + if (!kvm_s2_trans_exec_el0(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_UX; + if (!kvm_s2_trans_exec_el1(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_PX; +} + #define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED) static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1572,11 +1582,12 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (writable) prot |= KVM_PGTABLE_PROT_W; - if (exec_fault || - (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && - (!nested || kvm_s2_trans_executable(nested)))) + if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) prot |= KVM_PGTABLE_PROT_X; + if (nested) + adjust_nested_exec_perms(kvm, nested, &prot); + kvm_fault_lock(kvm); if (mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; @@ -1851,11 +1862,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= KVM_PGTABLE_PROT_NORMAL_NC; else prot |= KVM_PGTABLE_PROT_DEVICE; - } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && - (!nested || kvm_s2_trans_executable(nested))) { + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { prot |= KVM_PGTABLE_PROT_X; } + if (nested) +
adjust_nested_exec_perms(kvm, nested, &prot); + /* * Under the premise of getting a FSC_PERM fault, we just need to relax * permissions only if vma_pagesize equals fault_granule. Otherwise, diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index f04cda40545b..92b2a69f0b89 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -788,7 +788,10 @@ int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans) return 0; if (kvm_vcpu_trap_is_iabt(vcpu)) { - forward_fault = !kvm_s2_trans_executable(trans); + if (vcpu_mode_priv(vcpu)) + forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans); + else + forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans); } else { bool write_fault = kvm_is_write_fault(vcpu); From 8cb4ecec5e366b7dbbf200629a22624ad2340af5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:51 +0000 Subject: [PATCH 179/260] irqchip/gic: Add missing GICH_HCR control bits The GICH_HCR description is missing a bunch of control bits that control the maintenance interrupt. Add them. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-2-maz@kernel.org Signed-off-by: Oliver Upton --- include/linux/irqchip/arm-gic.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 2223f95079ce..d45fa19f9e47 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -86,7 +86,13 @@ #define GICH_HCR_EN (1 << 0) #define GICH_HCR_UIE (1 << 1) +#define GICH_HCR_LRENPIE (1 << 2) #define GICH_HCR_NPIE (1 << 3) +#define GICH_HCR_VGrp0EIE (1 << 4) +#define GICH_HCR_VGrp0DIE (1 << 5) +#define GICH_HCR_VGrp1EIE (1 << 6) +#define GICH_HCR_VGrp1DIE (1 << 7) +#define GICH_HCR_EOICOUNT GENMASK(31, 27) #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) From fa8f11e8e18383d234c77ba08d347aed7883d39a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:52 +0000 Subject: [PATCH 180/260] irqchip/gic: Expose CPU interface VA to KVM Future changes will require KVM to be able to perform deactivations by writing to the physical CPU interface. Add the corresponding VA to the kvm_info structure, and let KVM stash it. 
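As a rough sketch of the intended consumer (hypothetical helper name, not part of this patch; GIC_CPU_DEACTIVATE is the existing GICC_DIR offset from arm-gic.h):

	/* Sketch only: deactivate a physical interrupt through the GICC */
	static inline void vgic_v2_deactivate_phys(u32 hwintid)
	{
		writel_relaxed(hwintid,
			       kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE);
	}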
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-3-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 1 + drivers/irqchip/irq-gic.c | 3 +++ include/kvm/arm_vgic.h | 3 +++ include/linux/irqchip/arm-vgic-info.h | 2 ++ 4 files changed, 9 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 381673f03c39..441efef80d60 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -385,6 +385,7 @@ int vgic_v2_probe(const struct gic_kvm_info *info) kvm_vgic_global_state.can_emulate_gicv2 = true; kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.gicc_base = info->gicc_base; kvm_vgic_global_state.type = VGIC_V2; kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 1269ab8eb726..ec70c84e9f91 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1459,6 +1459,8 @@ static void __init gic_of_setup_kvm_info(struct device_node *node) if (ret) return; + gic_v2_kvm_info.gicc_base = gic_data[0].cpu_base.common_base; + if (static_branch_likely(&supports_deactivate_key)) vgic_set_kvm_info(&gic_v2_kvm_info); } @@ -1620,6 +1622,7 @@ static void __init gic_acpi_setup_kvm_info(void) return; gic_v2_kvm_info.maint_irq = irq; + gic_v2_kvm_info.gicc_base = gic_data[0].cpu_base.common_base; vgic_set_kvm_info(&gic_v2_kvm_info); } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 7a0b972eb1b1..577723f5599b 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -59,6 +59,9 @@ struct vgic_global { /* virtual control interface mapping, HYP VA */ void __iomem *vctrl_hyp; + /* Physical CPU interface, kernel VA */ + void __iomem *gicc_base; + /* Number of implemented list registers */ int nr_lr; diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h index a470a73a805a..67d9d960273b 100644 --- a/include/linux/irqchip/arm-vgic-info.h +++ b/include/linux/irqchip/arm-vgic-info.h @@ -24,6 +24,8 @@ struct gic_kvm_info { enum gic_type type; /* Virtual CPU interface */ struct resource vcpu; + /* GICv2 GICC VA */ + void __iomem *gicc_base; /* Interrupt number */ unsigned int maint_irq; /* No interrupt mask, no need to use the above field */ From 08f4f41c1e95ffb1ce525a07d25daa577110d748 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:53 +0000 Subject: [PATCH 181/260] irqchip/apple-aic: Spit out ICH_MISR_EL2 value on spurious vGIC MI It is all good and well to scream about spurious vGIC maintenance interrupts. It would be even better to output the reason why, which is already checked, but not printed out. The unsuspecting kernel tinkerer thanks you. 
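For decoding the value once it is printed, the ICH_MISR_EL2 bit assignments from the GICv3 architecture are, for reference:

	bit 0: EOI    bit 1: U       bit 2: LRENP   bit 3: NP
	bit 4: VGrp0E bit 5: VGrp0D  bit 6: VGrp1E  bit 7: VGrp1D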
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-4-maz@kernel.org Signed-off-by: Oliver Upton --- drivers/irqchip/irq-apple-aic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index 032d66dceb8e..4607f4943b19 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -411,12 +411,15 @@ static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs) if (is_kernel_in_hyp_mode() && (read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_En) && read_sysreg_s(SYS_ICH_MISR_EL2) != 0) { + u64 val; + generic_handle_domain_irq(aic_irqc->hw_domain, AIC_FIQ_HWIRQ(AIC_VGIC_MI)); if (unlikely((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_En) && - read_sysreg_s(SYS_ICH_MISR_EL2))) { - pr_err_ratelimited("vGIC IRQ fired and not handled by KVM, disabling.\n"); + (val = read_sysreg_s(SYS_ICH_MISR_EL2)))) { + pr_err_ratelimited("vGIC IRQ fired and not handled by KVM (MISR=%llx), disabling.\n", + val); sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); } } From 8d3dfab1d305d61359454d9c09b736f077a9fce4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:54 +0000 Subject: [PATCH 182/260] KVM: arm64: Turn vgic-v3 errata traps into a patched-in constant The trap bits are currently only set to manage CPU errata. However, we are about to make use of them for purposes beyond beating broken CPUs into submission. For this purpose, turn these errata-driven bits into a patched-in constant that is merged with the KVM-driven value at the point of programming the ICH_HCR_EL2 register, rather than being directly stored with the shadow value. This allows the KVM code to distinguish between a trap being handled for the purpose of an erratum workaround, or for KVM's own needs. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-5-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kvm/hyp/vgic-v3-sr.c | 21 ++++--- arch/arm64/kvm/vgic/vgic-v3-nested.c | 12 +--- arch/arm64/kvm/vgic/vgic-v3.c | 83 +++++++++++++++++----------- arch/arm64/kvm/vgic/vgic.h | 16 ++++++ 5 files changed, 84 insertions(+), 49 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5369763606e7..85bc629270bd 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -91,6 +91,7 @@ KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); KVM_NVHE_ALIAS(alt_cb_patch_nops); +KVM_NVHE_ALIAS(kvm_compute_ich_hcr_trap_bits); /* Global kernel state accessed by nVHE hyp code.
*/ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index acd909b7f225..e72d436dd6a3 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -14,6 +14,8 @@ #include #include +#include "../../vgic/vgic.h" + #define vtr_to_max_lr_idx(v) ((v) & 0xf) #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) @@ -196,6 +198,11 @@ static u32 __vgic_v3_read_ap1rn(int n) return val; } +static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if) +{ + return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits(); +} + void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; @@ -218,7 +225,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) elrsr = read_gicreg(ICH_ELRSR_EL2); - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2); + write_gicreg(0, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) @@ -237,7 +244,7 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) int i; if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) __gic_v3_set_lr(cpu_if->vgic_lr[i], i); @@ -307,14 +314,14 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } /* - * If we need to trap system registers, we must write - * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected. Note that this also applies if we don't expect - * any system register access (no vgic at all). + * If we need to trap system registers, we must write ICH_HCR_EL2 + * anyway, even if no interrupts are being injected. Note that this + * also applies if we don't expect any system register access (no + * vgic at all). In any case, no need to provide MI configuration. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2); } void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 7f1259b49c50..1fc9e0780abe 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -298,19 +298,9 @@ static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, struct vgic_v3_cpu_if *s_cpu_if) { struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val = 0; int i; - /* - * If we're on a system with a broken vgic that requires - * trapping, propagate the trapping requirements. - * - * Ah, the smell of rotten fruits... 
- */ - if (static_branch_unlikely(&vgic_v3_cpuif_trap)) - val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | - ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR); - s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val; + s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); s_cpu_if->vgic_sre = host_if->vgic_sre; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 6fbb4b099855..8c1494508682 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -301,20 +301,9 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) return; /* Hide GICv3 sysreg if necessary */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | ICH_HCR_EL2_TC); - return; - } - - if (group0_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0; - if (group1_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1; - if (common_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC; - if (dir_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -635,8 +624,50 @@ static const struct midr_range broken_seis[] = { static bool vgic_v3_broken_seis(void) { - return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) && - is_midr_in_range_list(broken_seis)); + return (is_kernel_in_hyp_mode() && + is_midr_in_range_list(broken_seis) && + (read_sysreg_s(SYS_ICH_VTR_EL2) & ICH_VTR_EL2_SEIS)); +} + +void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, + int nr_inst) +{ + u32 insn, oinsn, rd; + u64 hcr = 0; + + if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_30115)) { + group0_trap = true; + group1_trap = true; + } + + if (vgic_v3_broken_seis()) { + /* We know that these machines have ICH_HCR_EL2.TDIR */ + group0_trap = true; + group1_trap = true; + dir_trap = true; + } + + if (group0_trap) + hcr |= ICH_HCR_EL2_TALL0; + if (group1_trap) + hcr |= ICH_HCR_EL2_TALL1; + if (common_trap) + hcr |= ICH_HCR_EL2_TC; + if (dir_trap) + hcr |= ICH_HCR_EL2_TDIR; + + /* Compute target register */ + oinsn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn); + + /* movz rd, #(val & 0xffff) */ + insn = aarch64_insn_gen_movewide(rd, + (u16)hcr, + 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr = cpu_to_le32(insn); } /** @@ -650,6 +681,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info) { u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); bool has_v2; + u64 traps; int ret; has_v2 = ich_vtr_el2 >> 63; @@ -708,29 +740,18 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (has_v2) static_branch_enable(&vgic_v3_has_v2_compat); - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) { - group0_trap = true; - group1_trap = true; - } - if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); - kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; - group0_trap = true; - group1_trap = true; - if (ich_vtr_el2 & ICH_VTR_EL2_TDS) - dir_trap = true; - else - common_trap = true; } - if (group0_trap || group1_trap || common_trap | dir_trap) { + traps = vgic_ich_hcr_trap_bits(); + if (traps) { kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", - group0_trap ? "G0" : "", - group1_trap ? "G1" : "", - common_trap ? "C" : "", - dir_trap ? "D" : ""); + (traps & ICH_HCR_EL2_TALL0) ? 
"G0" : "", + (traps & ICH_HCR_EL2_TALL1) ? "G1" : "", + (traps & ICH_HCR_EL2_TC) ? "C" : "", + (traps & ICH_HCR_EL2_TDIR) ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index ac5f9c5d2b98..0ecadfa00397 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -164,6 +164,22 @@ static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } +void kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst); + +static inline u64 vgic_ich_hcr_trap_bits(void) +{ + u64 hcr; + + /* All the traps are in the bottom 16bits */ + asm volatile(ALTERNATIVE_CB("movz %0, #0\n", + ARM64_ALWAYS_SYSTEM, + kvm_compute_ich_hcr_trap_bits) + : "=r" (hcr)); + + return hcr; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC From 567ebfedb5bd204a8ce6a11695f02730f1bf57f4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:55 +0000 Subject: [PATCH 183/260] KVM: arm64: vgic-v3: Fix GICv3 trapping in protected mode As we are about to start trapping a bunch of extra things, augment the pKVM trap description with all the registers trapped by ICH_HCR_EL2.TC, making them legal instead of resulting in a UNDEF injection in the guest. While we're at it, ensure that pKVM captures the vgic model so that it can be checked by the emulation code. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-6-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 +++ arch/arm64/kvm/hyp/nvhe/sys_regs.c | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 43bde061b65d..8911338961c5 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -337,6 +337,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc /* CTR_EL0 is always under host control, even for protected VMs. */ hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + /* Preserve the vgic model so that GICv3 emulation works */ + hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model; + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 82da9b03692d..3108b5185c20 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -444,6 +444,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Scalable Vector Registers are restricted. */ + HOST_HANDLED(SYS_ICC_PMR_EL1), + RAZ_WI(SYS_ERRIDR_EL1), RAZ_WI(SYS_ERRSELR_EL1), RAZ_WI(SYS_ERXFR_EL1), @@ -457,9 +459,12 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Limited Ordering Regions Registers are restricted. 
*/ + HOST_HANDLED(SYS_ICC_DIR_EL1), + HOST_HANDLED(SYS_ICC_RPR_EL1), HOST_HANDLED(SYS_ICC_SGI1R_EL1), HOST_HANDLED(SYS_ICC_ASGI1R_EL1), HOST_HANDLED(SYS_ICC_SGI0R_EL1), + HOST_HANDLED(SYS_ICC_CTLR_EL1), { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, HOST_HANDLED(SYS_CCSIDR_EL1), From 2a28810cbb8b21a4016182617cc1fd72eddf4a36 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:56 +0000 Subject: [PATCH 184/260] KVM: arm64: GICv3: Detect and work around the lack of ICV_DIR_EL1 trapping A long time ago, an unsuspecting architect forgot to add a trap bit for ICV_DIR_EL1 in ICH_HCR_EL2. Which was unfortunate, but what's a bit of spec between friends? Thankfully, this was fixed in a later revision, and ARM "deprecates" the lack of trapping ability. Unfortunately, a few (billion) CPUs went out with that defect, anything ARMv8.0 from ARM, give or take. And on these CPUs, you can't trap DIR on its own, full stop. As the next best thing, we can trap everything in the common group, which is a tad expensive, but hey ho, that's what you get. You can otherwise recycle the HW in the nearby bin. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-7-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/virt.h | 7 ++++- arch/arm64/kernel/cpufeature.c | 52 ++++++++++++++++++++++++++++++++++ arch/arm64/kernel/hyp-stub.S | 5 ++++ arch/arm64/kvm/vgic/vgic-v3.c | 3 ++ arch/arm64/tools/cpucaps | 1 + 5 files changed, 67 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index aa280f356b96..8eb63d329497 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -40,8 +40,13 @@ */ #define HVC_FINALISE_EL2 3 +/* + * HVC_GET_ICH_VTR_EL2 - Retrieve the ICH_VTR_EL2 value + */ +#define HVC_GET_ICH_VTR_EL2 4 + /* Max number of HYP stub hypercalls */ -#define HVC_STUB_HCALL_NR 4 +#define HVC_STUB_HCALL_NR 5 /* Error returned when an invalid stub number is passed into x0 */ #define HVC_STUB_ERR 0xbadca11 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5ed401ff79e3..5de51cb1b8fe 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2303,6 +2303,49 @@ static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry } #endif +static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry, + int scope) +{ + static const struct midr_range has_vgic_v3[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, + }; + struct arm_smccc_res res = {}; + + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF); + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY); + if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF) && + !is_midr_in_range_list(has_vgic_v3)) + return false; + + if (!is_hyp_mode_available()) + return false; + + if (cpus_have_cap(ARM64_HAS_GICV5_LEGACY)) + return true; + + if (is_kernel_in_hyp_mode()) +
res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2); + else + arm_smccc_1_1_hvc(HVC_GET_ICH_VTR_EL2, &res); + + if (res.a0 == HVC_STUB_ERR) + return false; + + return res.a1 & ICH_VTR_EL2_TDS; +} + #ifdef CONFIG_ARM64_BTI static void bti_enable(const struct arm64_cpu_capabilities *__unused) { @@ -2814,6 +2857,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_gic_prio_relaxed_sync, }, #endif + { + /* + * Depends on having GICv3 + */ + .desc = "ICV_DIR_EL1 trapping", + .capability = ARM64_HAS_ICH_HCR_EL2_TDIR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = can_trap_icv_dir_el1, + }, #ifdef CONFIG_ARM64_E0PD { .desc = "E0PD", diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 36e2d26b54f5..085bc9972f6b 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -54,6 +54,11 @@ SYM_CODE_START_LOCAL(elx_sync) 1: cmp x0, #HVC_FINALISE_EL2 b.eq __finalise_el2 + cmp x0, #HVC_GET_ICH_VTR_EL2 + b.ne 2f + mrs_s x1, SYS_ICH_VTR_EL2 + b 9f + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 8c1494508682..1b6c3071ec80 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -648,6 +648,9 @@ void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, dir_trap = true; } + if (!cpus_have_cap(ARM64_HAS_ICH_HCR_EL2_TDIR)) + common_trap = true; + if (group0_trap) hcr |= ICH_HCR_EL2_TALL0; if (group1_trap) diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 1b32c1232d28..116d1a7b688c 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -40,6 +40,7 @@ HAS_GICV5_CPUIF HAS_GICV5_LEGACY HAS_GIC_PRIO_MASKING HAS_GIC_PRIO_RELAXED_SYNC +HAS_ICH_HCR_EL2_TDIR HAS_HCR_NV1 HAS_HCX HAS_LDAPR From a4413a7c31cfca49d3f4830cf8a45edf4a713f63 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:57 +0000 Subject: [PATCH 185/260] KVM: arm64: Repack struct vgic_irq fields struct vgic_irq has grown over the years, in a rather bad way. Repack it using bitfields for the individual flags, and move things around a bit so that it is a bit smaller.
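One knock-on effect, visible in the vgic-v4 hunk below: the address of a bitfield cannot be taken, so callers that used to pass &irq->pending_latch now have to bounce through a local bool along these lines:

	bool pending;

	ret = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING,
				    &pending);
	WARN_ON(ret);
	irq->pending_latch = pending;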
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-8-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v4.c | 5 ++++- include/kvm/arm_vgic.h | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 548aec9d5a72..09c3e9eb23f8 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -163,6 +163,7 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); struct irq_desc *desc; unsigned long flags; + bool pending; int ret; raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -173,9 +174,11 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) irq->hw = false; ret = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, - &irq->pending_latch); + &pending); WARN_ON(ret); + irq->pending_latch = pending; + desc = irq_to_desc(irq->host_irq); irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); unlock: diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 577723f5599b..e84a1bc5cf17 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -123,6 +123,7 @@ struct irq_ops { struct vgic_irq { raw_spinlock_t irq_lock; /* Protects the content of the struct */ + u32 intid; /* Guest visible INTID */ struct rcu_head rcu; struct list_head ap_list; @@ -137,17 +138,17 @@ struct vgic_irq { * affinity reg (v3). */ - u32 intid; /* Guest visible INTID */ - bool line_level; /* Level only */ - bool pending_latch; /* The pending latch state used to calculate - * the pending state for both level - * and edge triggered IRQs. */ - bool active; - bool pending_release; /* Used for LPIs only, unreferenced IRQ + bool pending_release:1; /* Used for LPIs only, unreferenced IRQ * pending a release */ - bool enabled; - bool hw; /* Tied to HW IRQ */ + bool pending_latch:1; /* The pending latch state used to calculate + * the pending state for both level + * and edge triggered IRQs. */ + enum vgic_irq_config config:1; /* Level or edge */ + bool line_level:1; /* Level only */ + bool enabled:1; + bool active:1; + bool hw:1; /* Tied to HW IRQ */ refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ @@ -159,7 +160,6 @@ struct vgic_irq { u8 active_source; /* GICv2 SGIs only */ u8 priority; u8 group; /* 0 == group 0, 1 == group 1 */ - enum vgic_irq_config config; /* Level or edge */ struct irq_ops *ops; From 879a7fd4fd64656d953f887e6a18e13e0b9a9f8f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:58 +0000 Subject: [PATCH 186/260] KVM: arm64: Add tracking of vgic_irq being present in a LR We currently cannot identify whether an interrupt is queued into a LR. It wasn't needed until now, but that's about to change. Add yet another flag to track that state. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-9-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 6 ++++++ arch/arm64/kvm/vgic/vgic-v3.c | 6 ++++++ include/kvm/arm_vgic.h | 1 + 3 files changed, 13 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 441efef80d60..74efacba38d4 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -101,6 +101,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); + irq->on_lr = false; + raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } @@ -124,6 +126,8 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) u32 val = irq->intid; bool allow_pending = true; + WARN_ON(irq->on_lr); + if (irq->active) { val |= GICH_LR_ACTIVE_BIT; if (vgic_irq_is_sgi(irq->intid)) @@ -194,6 +198,8 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) /* The GICv2 LR only holds five bits of priority. */ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + irq->on_lr = true; + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 1b6c3071ec80..e3f4b27e0225 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -97,6 +97,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); + irq->on_lr = false; + raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } @@ -111,6 +113,8 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) u64 val = irq->intid; bool allow_pending = true, is_v2_sgi; + WARN_ON(irq->on_lr); + is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2); @@ -185,6 +189,8 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + + irq->on_lr = true; } void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index e84a1bc5cf17..ec349c5a4a8b 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -149,6 +149,7 @@ struct vgic_irq { bool enabled:1; bool active:1; bool hw:1; /* Tied to HW IRQ */ + bool on_lr:1; /* Present in a CPU LR */ refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ From 0dc433e79ad031801842e3d8bc5d9729e14f5067 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:59 +0000 Subject: [PATCH 187/260] KVM: arm64: Add LR overflow handling documentation Add a bit of documentation describing how we are dealing with LR overflow. This is mostly a braindump of how things are expected to work. For now anyway. 
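As a loose illustration only (not the actual flush path, which is reworked later in the series), the rules below boil down to maintenance-interrupt trigger choices of roughly this shape, with the counts assumed to come from an ap_list walk:

	if (nr_pending > kvm_vgic_global_state.nr_lr)
		hcr |= ICH_HCR_EL2_NPIE;	/* refill once no pending IRQs sit in the LRs */
	if (nr_active_outside_lrs)
		hcr |= ICH_HCR_EL2_LRENPIE;	/* catch out-of-LR deactivations */
	if (nr_pending + nr_active > kvm_vgic_global_state.nr_lr)
		hcr |= ICH_HCR_EL2_UIE;		/* refill once most LRs are empty */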
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-10-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 81 +++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 6dd5a10081e2..7ee253a9fb77 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -825,7 +825,86 @@ static int compute_ap_list_depth(struct kvm_vcpu *vcpu, return count; } -/* Requires the VCPU's ap_list_lock to be held. */ +/* + * Dealing with LR overflow is close to black magic -- dress accordingly. + * + * We have to present an almost infinite number of interrupts through a very + * limited number of registers. Therefore crucial decisions must be made to + * ensure we feed the most relevant interrupts into the LRs, and yet have + * some facilities to let the guest interact with those that are not there. + * + * All considerations below are in the context of interrupts targeting a + * single vcpu with non-idle state (either pending, active, or both), + * colloquially called the ap_list: + * + * - Pending interrupts must have priority over active interrupts. This also + * excludes pending+active interrupts. This ensures that a guest can + * perform priority drops on any number of interrupts, and yet be + * presented the next pending one. + * + * - Deactivation of interrupts outside of the LRs must be tracked using + * the EOIcount-driven maintenance interrupt, and sometimes by + * trapping the DIR register. + * + * - For EOImode=0, a non-zero EOIcount means walking the ap_list past the + * point that made it into the LRs, and deactivating interrupts that would + * have made it onto the LRs if we had the space. + * + * - The MI-generation bits must be used to try and force an exit when the + * guest has done enough changes to the LRs that we want to reevaluate the + * situation: + * + * - if the total number of pending interrupts exceeds the number of + * LRs, NPIE must be set in order to exit once no pending interrupts + * are present in the LRs, allowing us to populate the next batch. + * + * - if there are active interrupts outside of the LRs, then LRENPIE + * must be set so that we exit on deactivation of one of these, and + * work out which one is to be deactivated. Note that this is not + * enough to deal with EOImode=1, see below. + * + * - if the overall number of interrupts exceeds the number of LRs, + * then UIE must be set to allow refilling of the LRs once the + * majority of them has been processed. + * + * - as usual, MI triggers are only an optimisation, since we cannot + * rely on the MI being delivered in a timely manner... + * + * - EOImode=1 creates some additional problems: + * + * - deactivation can happen in any order, and we cannot rely on + * EOImode=0's coupling of priority-drop and deactivation which + * imposes strict reverse Ack order. This means that DIR must + * trap if we have active interrupts outside of the LRs. + * + * - deactivation of SPIs can occur on any CPU, while the SPI is only + * present in the ap_list of the CPU that actually ack-ed it. In that + * case, EOIcount doesn't provide enough information, and we must + * resort to trapping DIR even if we don't overflow the LRs. Bonus + * point for not trapping DIR when no SPIs are pending or active in + * the whole VM.
+ * + * - LPIs do not suffer the same problem as SPIs on deactivation, as we + * have to essentially discard the active state, see below. + * + * - Virtual LPIs have an active state (surprise!), which gets removed on + * priority drop (EOI). However, EOIcount doesn't get bumped when the LPI + * is not present in the LR (surprise again!). Special care must therefore + * be taken to remove the active state from any activated LPI when exiting + * from the guest. This is in a way no different from what happens on the + * physical side. We still rely on the running priority to have been + * removed from the APRs, irrespective of the LPI being present in the LRs + * or not. + * + * - Virtual SGIs directly injected via GICv4.1 must not affect EOIcount, as + * they are not managed in SW and don't have a true active state. So only + * set vSGIEOICount when no SGIs are in the ap_list. + * + * - GICv2 SGIs with multiple sources are injected one source at a time, as + * if they were made pending sequentially. This may mean that we don't + * always present the HPPI if other interrupts with lower priority are + * pending in the LRs. Big deal. */ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; From 73c9726975af1c2bf8d062017c67bcf4fb8821d5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:00 +0000 Subject: [PATCH 188/260] KVM: arm64: GICv3: Drop LPI active state when folding LRs Despite LPIs not having an active state, *virtual* LPIs do have one, which gets cleared on EOI. So far, so good. However, this leads to a small problem: when an active LPI is not in the LRs, EOImode==0, and the guest EOIs it, EOIcount doesn't get bumped up. Which means that under these conditions, the LPI would stay active forever. Clearly, we can't have that. So if we spot an active LPI, we drop that state. It's pretty pointless anyway, and only serves as a way to trip SW over. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-11-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index e3f4b27e0225..81d22f615fa6 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -72,7 +72,9 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit, note deactivation */ + /* Always preserve the active bit for !LPIs, note deactivation */ + if (irq->intid >= VGIC_MIN_LPI) + val &= ~ICH_LR_ACTIVE_BIT; deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); From f4ded7b0848e6fcc9c882a1fdaa925d921c932f1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:01 +0000 Subject: [PATCH 189/260] KVM: arm64: GICv3: Preserve EOIcount on exit EOIcount is how the virtual CPU interface signals that the guest is deactivating interrupts outside of the LRs when EOImode==0. We therefore need to preserve that information so that we can find out what actually needs deactivating.
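EOIcount lives in ICH_HCR_EL2[31:27]; once stashed into the shadow vgic_hcr by this patch, a consumer can pull it back out along these lines (sketch only, the actual user comes later in the series):

	u8 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpu_if->vgic_hcr);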
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-12-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index e72d436dd6a3..9bfcbfd91118 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -225,6 +225,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) elrsr = read_gicreg(ICH_ELRSR_EL2); + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { + u64 val = read_gicreg(ICH_HCR_EL2); + cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; + cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; + } + write_gicreg(0, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { From 00c6d0d4a80582a43578380f5283940c2e16eec8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:02 +0000 Subject: [PATCH 190/260] KVM: arm64: GICv3: Decouple ICH_HCR_EL2 programming from LRs Not programming ICH_HCR_EL2 while no LRs are populated is a bit of an issue, as we otherwise don't see any maintenance interrupt when the guest interacts with the LRs. Decouple the two and always program the control register, even when we don't have to touch the LRs. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-13-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 9bfcbfd91118..2509b52bbd62 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -219,20 +219,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) } } - if (used_lrs || cpu_if->its_vpe.its_vm) { + if (used_lrs) { int i; u32 elrsr; elrsr = read_gicreg(ICH_ELRSR_EL2); - if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { - u64 val = read_gicreg(ICH_HCR_EL2); - cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; - cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; - } - - write_gicreg(0, ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; @@ -242,6 +234,14 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) __gic_v3_set_lr(0, i); } } + + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { + u64 val = read_gicreg(ICH_HCR_EL2); + cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; + cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; + } + + write_gicreg(0, ICH_HCR_EL2); } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) @@ -249,12 +249,10 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) u64 used_lrs = cpu_if->used_lrs; int i; - if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); + write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } + for (i = 0; i < used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); /* * Ensure that writes to the LRs, and on non-VHE systems ensure that From 438e47b697f7913bbb9f44e48b7b6e98389c9e0e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:03 +0000 Subject: [PATCH 191/260] KVM: arm64: GICv3: Extract LR folding primitive As we are going to need to handle deactivation for interrupts that are not in the LRs, split vgic_v3_fold_lr_state() into a helper that deals with a single interrupt, and the function that 
loops over the used LRs. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-14-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 88 +++++++++++++++++------------------ 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 81d22f615fa6..6b7d7b4048f0 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -33,78 +33,76 @@ static bool lr_signals_eoi_mi(u64 lr_val) !(lr_val & ICH_LR_HW); } -void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - int lr; + struct vgic_irq *irq; + bool is_v2_sgi = false; + bool deactivated; + u32 intid; - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + intid = val & ICH_LR_VIRTUAL_ID_MASK; + } else { + intid = val & GICH_LR_VIRTUALID; + is_v2_sgi = vgic_irq_is_sgi(intid); + } - cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE; + irq = vgic_get_vcpu_irq(vcpu, intid); + if (!irq) /* An LPI could have been unmapped. */ + return; - for (lr = 0; lr < cpuif->used_lrs; lr++) { - u64 val = cpuif->vgic_lr[lr]; - u32 intid, cpuid; - struct vgic_irq *irq; - bool is_v2_sgi = false; - bool deactivated; - - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { - intid = val & ICH_LR_VIRTUAL_ID_MASK; - } else { - intid = val & GICH_LR_VIRTUALID; - is_v2_sgi = vgic_irq_is_sgi(intid); - } - - /* Notify fds when the guest EOI'ed a level-triggered IRQ */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - - irq = vgic_get_vcpu_irq(vcpu, intid); - if (!irq) /* An LPI could have been unmapped. */ - continue; - - raw_spin_lock(&irq->irq_lock); + /* Notify fds when the guest EOI'ed a level-triggered IRQ */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + scoped_guard(raw_spinlock, &irq->irq_lock) { /* Always preserve the active bit for !LPIs, note deactivation */ if (irq->intid >= VGIC_MIN_LPI) val &= ~ICH_LR_ACTIVE_BIT; deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); - if (irq->active && is_v2_sgi) - irq->active_source = cpuid; - /* Edge is the only case where we preserve the pending bit */ if (irq->config == VGIC_CONFIG_EDGE && - (val & ICH_LR_PENDING_BIT)) { + (val & ICH_LR_PENDING_BIT)) irq->pending_latch = true; - if (is_v2_sgi) - irq->source |= (1 << cpuid); - } - /* * Clear soft pending state when level irqs have been acked. 
*/ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; + if (is_v2_sgi) { + u8 cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val); + + if (irq->active) + irq->active_source = cpuid; + + if (val & ICH_LR_PENDING_BIT) + irq->source |= BIT(cpuid); + } + /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); irq->on_lr = false; - - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); } + vgic_put_irq(vcpu->kvm, irq); +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + for (int lr = 0; lr < cpuif->used_lrs; lr++) + vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]); + cpuif->used_lrs = 0; } From 1ae0448ca7976281e7ec1d2cd1c861fbc8f8631e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:04 +0000 Subject: [PATCH 192/260] KVM: arm64: GICv3: Extract LR computing primitive Split vgic_v3_populate_lr() into two, so that we have another primitive that computes the LR from a vgic_irq, but doesn't update anything in the shadow structure. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-15-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 49 +++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 6b7d7b4048f0..bcce7f35a6d6 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -107,7 +107,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) } /* Requires the irq to be locked already */ -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 model = vcpu->kvm->arch.vgic.vgic_model; u64 val = irq->intid; @@ -154,6 +154,35 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (allow_pending && irq_is_pending(irq)) { val |= ICH_LR_PENDING_BIT; + if (is_v2_sgi) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return 0; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + if (irq->source & ~BIT(src - 1)) + val |= ICH_LR_EOI; + } + } + + if (irq->group) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + return val; +} + +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = vgic_v3_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + + if (val & ICH_LR_PENDING_BIT) { if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; @@ -161,16 +190,9 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) model == KVM_DEV_TYPE_ARM_VGIC_V2) { u32 src = ffs(irq->source); - if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", - irq->intid)) - return; - - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { + irq->source &= ~BIT(src - 1); + if (irq->source) irq->pending_latch = true; - val |= ICH_LR_EOI; - } } } @@ -183,13 +205,6 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) 
irq->line_level = false; - if (irq->group) - val |= ICH_LR_GROUP; - - val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; - irq->on_lr = true; } From 5ceb3dac80229684c8e57993f12106cbad23f7ac Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:05 +0000 Subject: [PATCH 193/260] KVM: arm64: GICv2: Preserve EOIcount on exit EOIcount is how the virtual CPU interface signals that the guest is deactivating interrupts outside of the LRs when EOImode==0. We therefore need to preserve that information so that we can find out what actually needs deactivating, just like we already do on GICv3. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-16-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 74efacba38d4..5cfbe5898342 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -437,6 +437,12 @@ void vgic_v2_save_state(struct kvm_vcpu *vcpu) return; if (used_lrs) { + if (vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr & GICH_HCR_LRENPIE) { + u32 val = readl_relaxed(base + GICH_HCR); + + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_EOICOUNT; + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= val & GICH_HCR_EOICOUNT; + } save_lrs(vcpu, base); writel_relaxed(0, base + GICH_HCR); } From a00c88ac1f90992e618cf4737e2d1c551c13aed6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:06 +0000 Subject: [PATCH 194/260] KVM: arm64: GICv2: Decouple GICH_HCR programming from LRs being loaded Not programming GICH_HCR while no LRs are populated is a bit of an issue, as we otherwise don't see any maintenance interrupt when the guest interacts with the LRs. Decouple the two and always program the control register, even when we don't have to touch the LRs. This is very similar to what we are already doing for GICv3. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-17-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 5cfbe5898342..a0d803c5b08a 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -430,22 +430,25 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) void vgic_v2_save_state(struct kvm_vcpu *vcpu) { + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; void __iomem *base = kvm_vgic_global_state.vctrl_base; u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; if (!base) return; - if (used_lrs) { - if (vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr & GICH_HCR_LRENPIE) { - u32 val = readl_relaxed(base + GICH_HCR); - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_EOICOUNT; - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= val & GICH_HCR_EOICOUNT; - } + if (used_lrs) save_lrs(vcpu, base); - writel_relaxed(0, base + GICH_HCR); + + if (cpu_if->vgic_hcr & GICH_HCR_LRENPIE) { + u32 val = readl_relaxed(base + GICH_HCR); + + cpu_if->vgic_hcr &= ~GICH_HCR_EOICOUNT; + cpu_if->vgic_hcr |= val & GICH_HCR_EOICOUNT; } + + writel_relaxed(0, base + GICH_HCR); } void vgic_v2_restore_state(struct kvm_vcpu *vcpu) @@ -458,13 +461,10 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu) if (!base) return; - if (used_lrs) { - writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); - for (i = 0; i < used_lrs; i++) { - writel_relaxed(cpu_if->vgic_lr[i], - base + GICH_LR0 + (i * 4)); - } - } + writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); + + for (i = 0; i < used_lrs; i++) + writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4)); } void vgic_v2_load(struct kvm_vcpu *vcpu) From 3aa9a50c2007e4090b0b5b3c79aed6f63b5e6c49 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:07 +0000 Subject: [PATCH 195/260] KVM: arm64: GICv2: Extract LR folding primitive As we are going to need to handle deactivation for interrupts that are not in the LRs, split vgic_v2_fold_lr_state() into a helper that deals with a single interrupt, and the function that loops over the used LRs. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-18-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 67 +++++++++++++++++------------------ 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index a0d803c5b08a..fb8efdd4196b 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -39,43 +39,23 @@ static bool lr_signals_eoi_mi(u32 lr_val) !(lr_val & GICH_LR_HW); } -/* - * transfer the content of the LRs back into the corresponding ap_list: - * - active bit is transferred as is - * - pending bit is - * - transferred as is in case of edge sensitive IRQs - * - set to the line-level (resample time) for level sensitive IRQs - */ -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; - int lr; + u32 cpuid, intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + bool deactivated; - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + /* Extract the source vCPU id from the LR */ + cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val) & 7; - cpuif->vgic_hcr &= ~GICH_HCR_UIE; + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); - for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) { - u32 val = cpuif->vgic_lr[lr]; - u32 cpuid, intid = val & GICH_LR_VIRTUALID; - struct vgic_irq *irq; - bool deactivated; - - /* Extract the source vCPU id from the LR */ - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - cpuid &= 7; - - /* Notify fds when the guest EOI'ed a level-triggered SPI */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - - irq = vgic_get_vcpu_irq(vcpu, intid); - - raw_spin_lock(&irq->irq_lock); + irq = vgic_get_vcpu_irq(vcpu, intid); + scoped_guard(raw_spinlock, &irq->irq_lock) { /* Always preserve the active bit, note deactivation */ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); irq->active = !!(val & GICH_LR_ACTIVE_BIT); @@ -102,11 +82,28 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); irq->on_lr = false; - - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); } + vgic_put_irq(vcpu->kvm, irq); +} + +/* + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs + */ +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) + vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]); + cpuif->used_lrs = 0; } From 0660bc4a2b70e7158f63ea1777132d1c93188fe8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:08 +0000 Subject: [PATCH 196/260] KVM: arm64: GICv2: Extract LR computing primitive Split vgic_v2_populate_lr() into two helpers, so that we have another primitive that computes the LR from a 
vgic_irq, but doesn't update anything in the shadow structure. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-19-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 63 ++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index fb8efdd4196b..5a2165a8d22c 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -107,18 +107,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->used_lrs = 0; } -/* - * Populates the particular LR with the state of a given IRQ: - * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq - * - for a level sensitive IRQ the pending state value is unchanged; - * it is dictated directly by the input level - * - * If @irq describes an SGI with multiple sources, we choose the - * lowest-numbered source VCPU and clear that bit in the source bitmap. - * - * The irq_lock must be held by the caller. - */ -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 val = irq->intid; bool allow_pending = true; @@ -164,22 +153,52 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (allow_pending && irq_is_pending(irq)) { val |= GICH_LR_PENDING_BIT; + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return 0; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + if (irq->source & ~BIT(src - 1)) + val |= GICH_LR_EOI; + } + } + + /* The GICv2 LR only holds five bits of priority. */ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + return val; +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear that bit in the source bitmap. + * + * The irq_lock must be held by the caller. 
+ */ +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 val = vgic_v2_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; + + if (val & GICH_LR_PENDING_BIT) { if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; if (vgic_irq_is_sgi(irq->intid)) { u32 src = ffs(irq->source); - if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", - irq->intid)) - return; - - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { + irq->source &= ~BIT(src - 1); + if (irq->source) irq->pending_latch = true; - val |= GICH_LR_EOI; - } } } @@ -196,8 +215,6 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; irq->on_lr = true; - - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; } void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) From dd598fc1139f7181118719574a4e270e51e0a0eb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:09 +0000 Subject: [PATCH 197/260] KVM: arm64: Compute vgic state irrespective of the number of interrupts As we are going to rely on the [G]ICH_HCR{,_EL2} register to be programmed with MI information at all times, slightly de-optimise the flush/sync code to always be called. This is rather lightweight when no interrupts are in flight. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-20-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 7ee253a9fb77..39346baa2677 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -985,8 +985,6 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu) /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - int used_lrs; - /* If nesting, emulate the HW effect from L0 to L1 */ if (vgic_state_is_nested(vcpu)) { vgic_v3_sync_nested(vcpu); @@ -996,20 +994,10 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) if (vcpu_has_nv(vcpu)) vgic_v3_nested_update_mi(vcpu); - /* An empty ap_list_head implies used_lrs == 0 */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) - return; - if (can_access_vgic_from_kernel()) vgic_save_state(vcpu); - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; - else - used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; - - if (used_lrs) - vgic_fold_lr_state(vcpu); + vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); } @@ -1053,29 +1041,10 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (vcpu_has_nv(vcpu)) vgic_v3_nested_update_mi(vcpu); - /* - * If there are no virtual interrupts active or pending for this - * VCPU, then there is no work to do and we can bail out without - * taking any lock. There is a potential race with someone injecting - * interrupts to the VCPU, but it is a benign race as the VCPU will - * either observe the new interrupt before or after doing this check, - * and introducing additional synchronization mechanism doesn't change - * this. - * - * Note that we still need to go through the whole thing if anything - * can be directly injected (GICv4). 
- */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) && - !vgic_supports_direct_irqs(vcpu->kvm)) - return; - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) { - raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock) vgic_flush_lr_state(vcpu); - raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); - } if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); From cf72ee63711916ad808f82eb054dd9d69727a5bf Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:10 +0000 Subject: [PATCH 198/260] KVM: arm64: Eagerly save VMCR on exit We currently save/restore the VMCR register in a pretty lazy way (on load/put, consistently with what we do with the APRs). However, we are going to need the group-enable bits that are backed by VMCR on each entry (so that we can avoid injecting interrupts for disabled groups). Move the synchronisation from put to sync, which results in some minor churn in the nVHE hypercalls to simplify things. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-21-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_asm.h | 2 +- arch/arm64/include/asm/kvm_hyp.h | 2 +- arch/arm64/kvm/arm.c | 3 +-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 7 ++++--- arch/arm64/kvm/hyp/vgic-v3-sr.c | 15 +++------------ arch/arm64/kvm/vgic/vgic-v2.c | 2 +- arch/arm64/kvm/vgic/vgic-v3-nested.c | 2 +- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- 8 files changed, 13 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9da54d4ee49e..f8adbd535b4a 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -79,7 +79,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_range, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, - __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm, __KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm, diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index e6be1f5d0967..dbf16a9f6772 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -82,7 +82,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..733195ef183e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -659,8 +659,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { if (is_protected_kvm_enabled()) { - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, - &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3); kvm_call_hyp_nvhe(__pkvm_vcpu_put); } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 29430c031095..a7c689152f68 100644 --- 
a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -157,6 +157,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr; for (i = 0; i < hyp_cpu_if->used_lrs; ++i) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } @@ -464,11 +465,11 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) __vgic_v3_init_lrs(); } -static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); } static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) @@ -616,7 +617,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_tlb_flush_vmid_range), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), + HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), HANDLE_FUNC(__pkvm_reserve_vm), HANDLE_FUNC(__pkvm_unreserve_vm), diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 2509b52bbd62..cafbb41b4c33 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -235,6 +235,8 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) } } + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { u64 val = read_gicreg(ICH_HCR_EL2); cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; @@ -332,10 +334,6 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { u64 val; - if (!cpu_if->vgic_sre) { - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - } - /* Only restore SRE if the host implements the GICv2 interface */ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { val = read_gicreg(ICC_SRE_EL2); @@ -357,7 +355,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) write_gicreg(0, ICH_HCR_EL2); } -static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -518,13 +516,6 @@ static void __vgic_v3_write_vmcr(u32 vmcr) write_gicreg(vmcr, ICH_VMCR_EL2); } -void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) -{ - __vgic_v3_save_aprs(cpu_if); - if (cpu_if->vgic_sre) - cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); -} - void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) { __vgic_v3_compat_mode_enable(); diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 5a2165a8d22c..07e93acafd04 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -451,6 +451,7 @@ void vgic_v2_save_state(struct kvm_vcpu *vcpu) if (!base) return; + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); if (used_lrs) save_lrs(vcpu, base); @@ -495,6 +496,5 @@ void vgic_v2_put(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 1fc9e0780abe..1531e4907c65 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ 
b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -340,7 +340,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) u64 val; int i; - __vgic_v3_save_vmcr_aprs(s_cpu_if); + __vgic_v3_save_aprs(s_cpu_if); __vgic_v3_deactivate_traps(s_cpu_if); __vgic_v3_save_state(s_cpu_if); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index bcce7f35a6d6..5b276e303aab 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -815,7 +815,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) } if (likely(!is_protected_kvm_enabled())) - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); + kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) From 6780a756044c396f59e98befed537dbba4a085db Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:11 +0000 Subject: [PATCH 199/260] KVM: arm64: Revamp vgic maintenance interrupt configuration We currently don't use the maintenance interrupt very much, apart from EOI on level interrupts, and for LR underflow in limited cases. However, as we are moving toward a setup where active interrupts can live outside of the LRs, we need to use the MIs in a more diverse set of cases. Add a new helper that produces a digest of the ap_list, and use that summary to set the various control bits as required. This slightly changes the way v2 SGIs are handled, as they used to count for more than one interrupt, but not anymore. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-22-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 12 ++++- arch/arm64/kvm/vgic/vgic-v3.c | 18 ++++++- arch/arm64/kvm/vgic/vgic.c | 91 ++++++++++++----------------------- arch/arm64/kvm/vgic/vgic.h | 19 +++++++- 4 files changed, 74 insertions(+), 66 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 07e93acafd04..f53bc5528897 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -26,11 +26,19 @@ void vgic_v2_init_lrs(void) vgic_v2_write_lr(i, 0); } -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - cpuif->vgic_hcr |= GICH_HCR_UIE; + cpuif->vgic_hcr = GICH_HCR_EN; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_UIE; } static bool lr_signals_eoi_mi(u32 lr_val) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 5b276e303aab..81f1de9e3897 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -20,11 +20,25 @@ static bool common_trap; static bool dir_trap; static bool gicv4_enable; -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + cpuif->vgic_hcr = ICH_HCR_EL2_En; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; + + if (!als->nr_sgi) + cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount; } static bool lr_signals_eoi_mi(u64 
lr_val) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 39346baa2677..7e6f02d48fff 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -791,38 +791,30 @@ static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) vgic_v3_clear_lr(vcpu, lr); } -static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_underflow(vcpu); - else - vgic_v3_set_underflow(vcpu); -} - -/* Requires the ap_list_lock to be held. */ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu, - bool *multi_sgi) +static void summarize_ap_list(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; - int count = 0; - - *multi_sgi = false; lockdep_assert_held(&vgic_cpu->ap_list_lock); + *als = (typeof(*als)){}; + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - int w; + guard(raw_spinlock)(&irq->irq_lock); - raw_spin_lock(&irq->irq_lock); - /* GICv2 SGIs can count for more than one... */ - w = vgic_irq_get_lr_count(irq); - raw_spin_unlock(&irq->irq_lock); + if (unlikely(vgic_target_oracle(irq) != vcpu)) + continue; - count += w; - *multi_sgi |= (w > 1); + if (!irq->active) + als->nr_pend++; + else + als->nr_act++; + + if (irq->intid < VGIC_NR_SGIS) + als->nr_sgi++; } - return count; } /* @@ -908,60 +900,39 @@ static int compute_ap_list_depth(struct kvm_vcpu *vcpu, static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct ap_list_summary als; struct vgic_irq *irq; - int count; - bool multi_sgi; - u8 prio = 0xff; - int i = 0; + int count = 0; lockdep_assert_held(&vgic_cpu->ap_list_lock); - count = compute_ap_list_depth(vcpu, &multi_sgi); - if (count > kvm_vgic_global_state.nr_lr || multi_sgi) + summarize_ap_list(vcpu, &als); + + if (irqs_outside_lrs(&als)) vgic_sort_ap_list(vcpu); - count = 0; - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - raw_spin_lock(&irq->irq_lock); + scoped_guard(raw_spinlock, &irq->irq_lock) { + if (likely(vgic_target_oracle(irq) == vcpu)) { + vgic_populate_lr(vcpu, irq, count++); + } + } - /* - * If we have multi-SGIs in the pipeline, we need to - * guarantee that they are all seen before any IRQ of - * lower priority. In that case, we need to filter out - * these interrupts by exiting early. This is easy as - * the AP list has been sorted already. 
- */ - if (multi_sgi && irq->priority > prio) { - raw_spin_unlock(&irq->irq_lock); + if (count == kvm_vgic_global_state.nr_lr) break; - } - - if (likely(vgic_target_oracle(irq) == vcpu)) { - vgic_populate_lr(vcpu, irq, count++); - - if (irq->source) - prio = irq->priority; - } - - raw_spin_unlock(&irq->irq_lock); - - if (count == kvm_vgic_global_state.nr_lr) { - if (!list_is_last(&irq->ap_list, - &vgic_cpu->ap_list_head)) - vgic_set_underflow(vcpu); - break; - } } /* Nuke remaining LRs */ - for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + for (int i = count ; i < kvm_vgic_global_state.nr_lr; i++) vgic_clear_lr(vcpu, i); - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count; - else + vgic_v2_configure_hcr(vcpu, &als); + } else { vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count; + vgic_v3_configure_hcr(vcpu, &als); + } } static inline bool can_access_vgic_from_kernel(void) diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 0ecadfa00397..4a0733869cb5 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -236,6 +236,21 @@ struct its_ite { u32 event_id; }; +struct ap_list_summary { + unsigned int nr_pend; /* purely pending, not active */ + unsigned int nr_act; /* active, or active+pending */ + unsigned int nr_sgi; /* any SGI */ +}; + +#define irqs_outside_lrs(s) \ + (((s)->nr_pend + (s)->nr_act) > kvm_vgic_global_state.nr_lr) + +#define irqs_pending_outside_lrs(s) \ + ((s)->nr_pend > kvm_vgic_global_state.nr_lr) + +#define irqs_active_outside_lrs(s) \ + ((s)->nr_act && irqs_outside_lrs(s)) + int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, @@ -262,7 +277,7 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); @@ -302,7 +317,7 @@ static inline void vgic_get_irq_ref(struct vgic_irq *irq) void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); From f04b8a5a83dbaff310ff919190123db238d35952 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:12 +0000 Subject: [PATCH 200/260] KVM: arm64: Turn kvm_vgic_vcpu_enable() into kvm_vgic_vcpu_reset() Now that we always reconfigure the vgic HCR register on entry, the "enable" part of kvm_vgic_vcpu_enable() is pretty useless. Removing the enable bits from these functions makes it plain that they are just about computing the reset state. Just rename the functions accordingly. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-23-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-init.c | 8 ++++---- arch/arm64/kvm/vgic/vgic-v2.c | 5 +---- arch/arm64/kvm/vgic/vgic-v3.c | 5 +---- arch/arm64/kvm/vgic/vgic.h | 4 ++-- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1796b1a22a72..6d5e5d708f23 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -353,12 +353,12 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) return ret; } -static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) +static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu) { if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_enable(vcpu); + vgic_v2_reset(vcpu); else - vgic_v3_enable(vcpu); + vgic_v3_reset(vcpu); } /* @@ -405,7 +405,7 @@ int vgic_init(struct kvm *kvm) } kvm_for_each_vcpu(idx, vcpu, kvm) - kvm_vgic_vcpu_enable(vcpu); + kvm_vgic_vcpu_reset(vcpu); ret = kvm_vgic_setup_default_irq_routing(kvm); if (ret) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index f53bc5528897..18856186be7b 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -285,7 +285,7 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; } -void vgic_v2_enable(struct kvm_vcpu *vcpu) +void vgic_v2_reset(struct kvm_vcpu *vcpu) { /* * By forcing VMCR to zero, the GIC will restore the binary @@ -293,9 +293,6 @@ void vgic_v2_enable(struct kvm_vcpu *vcpu) * anyway. */ vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - - /* Get the show on the road... */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; } /* check for overlapping regions and for regions crossing the end of memory */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 81f1de9e3897..780cc92c79e0 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -293,7 +293,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) -void vgic_v3_enable(struct kvm_vcpu *vcpu) +void vgic_v3_reset(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; @@ -323,9 +323,6 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) kvm_vgic_global_state.ich_vtr_el2); vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, kvm_vgic_global_state.ich_vtr_el2) + 1; - - /* Get the show on the road... 
*/ - vgic_v3->vgic_hcr = ICH_HCR_EL2_En; } void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 4a0733869cb5..e48294521541 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -285,7 +285,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_enable(struct kvm_vcpu *vcpu); +void vgic_v2_reset(struct kvm_vcpu *vcpu); int vgic_v2_probe(const struct gic_kvm_info *info); int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, @@ -320,7 +320,7 @@ void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_enable(struct kvm_vcpu *vcpu); +void vgic_v3_reset(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); From 76b2eda65cccb452a2d112809a2995ee7533f963 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:13 +0000 Subject: [PATCH 201/260] KVM: arm64: Make vgic_target_oracle() globally available Make the internal crystal ball global, so that implementation-specific code can use it. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-24-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 2 +- arch/arm64/kvm/vgic/vgic.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 7e6f02d48fff..004010104659 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -237,7 +237,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) * * Requires the IRQ lock to be held. */ -static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) { lockdep_assert_held(&irq->irq_lock); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index e48294521541..037efb620082 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -261,6 +261,7 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid); struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); From 05984ba67eb6fe554afb355368a037f9eec1dd43 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:14 +0000 Subject: [PATCH 202/260] KVM: arm64: Invert ap_list sorting to push active interrupts out Having established that pending interrupts should have priority to be moved into the LRs over the active interrupts, implement this in the ap_list sorting. 
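A worked example of the resulting order may help. Below is a standalone approximation of the new comparator (hypothetical structure, no locking, kernel types swapped for standard C ones); the authoritative version is in the diff that follows:

#include <stdbool.h>

struct ex_irq {
	bool enabled, pending, active, hw;
	unsigned char priority;		/* lower value == higher priority */
};

/* Negative return: 'a' sorts before 'b' */
static int ex_irq_cmp(const struct ex_irq *a, const struct ex_irq *b)
{
	int penda = a->enabled && a->pending && !a->active;
	int pendb = b->enabled && b->pending && !b->active;

	if (penda != pendb)			/* deliverable-pending IRQs first */
		return pendb - penda;
	if (a->priority != b->priority)		/* then lowest priority value */
		return (int)a->priority - (int)b->priority;
	return (int)b->hw - (int)a->hw;		/* HW-backed IRQs win the tie */
}

With A = {pending, prio 0x80}, B = {active} and C = {pending, prio 0x20}, this yields C, A, B: pending interrupts claim the LRs first, and the active B can now be pushed out, which the EOIcount handling introduced earlier makes safe.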
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-25-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 004010104659..c7a5454ac4c9 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -270,10 +270,7 @@ struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) * well, the first items in the list being the first things populated in the * LRs. * - * A hard rule is that active interrupts can never be pushed out of the LRs - * (and therefore take priority) since we cannot reliably trap on deactivation - * of IRQs and therefore they have to be present in the LRs. - * + * Pending, non-active interrupts must be placed at the head of the list. * Otherwise things should be sorted by the priority field and the GIC * hardware support will take care of preemption of priority groups etc. * @@ -298,21 +295,21 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, raw_spin_lock(&irqa->irq_lock); raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); - if (irqa->active || irqb->active) { - ret = (int)irqb->active - (int)irqa->active; + penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active; + pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active; + + ret = (int)pendb - (int)penda; + if (ret) goto out; - } - penda = irqa->enabled && irq_is_pending(irqa); - pendb = irqb->enabled && irq_is_pending(irqb); - - if (!penda || !pendb) { - ret = (int)pendb - (int)penda; + /* Both pending and enabled, sort by priority (lower number first) */ + ret = (int)irqa->priority - (int)irqb->priority; + if (ret) goto out; - } - /* Both pending and enabled, sort by priority */ - ret = irqa->priority - irqb->priority; + /* Finally, HW bit active interrupts have priority over non-HW ones */ + ret = (int)irqb->hw - (int)irqa->hw; + out: raw_spin_unlock(&irqb->irq_lock); raw_spin_unlock(&irqa->irq_lock); From 33c1f60b3213c766f434f1be1988d8b211b106a9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:15 +0000 Subject: [PATCH 203/260] KVM: arm64: Move undeliverable interrupts to the end of ap_list Interrupts in the ap_list that cannot be acted upon because they are not enabled, or that their group is not enabled, shouldn't make it into the LRs if we are space-constrained. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-26-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index c7a5454ac4c9..abe01c9c6b36 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -265,6 +265,11 @@ struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) return NULL; } +struct vgic_sort_info { + struct kvm_vcpu *vcpu; + struct vgic_vmcr vmcr; +}; + /* * The order of items in the ap_lists defines how we'll pack things in LRs as * well, the first items in the list being the first things populated in the @@ -273,6 +278,7 @@ struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) * Pending, non-active interrupts must be placed at the head of the list. 
* Otherwise things should be sorted by the priority field and the GIC * hardware support will take care of preemption of priority groups etc. + * Interrupts that are not deliverable should be at the end of the list. * * Return negative if "a" sorts before "b", 0 to preserve order, and positive * to sort "b" before "a". @@ -282,6 +288,8 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, { struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); + struct vgic_sort_info *info = priv; + struct kvm_vcpu *vcpu = info->vcpu; bool penda, pendb; int ret; @@ -295,6 +303,17 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, raw_spin_lock(&irqa->irq_lock); raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); + /* Undeliverable interrupts should be last */ + ret = (int)(vgic_target_oracle(irqb) == vcpu) - (int)(vgic_target_oracle(irqa) == vcpu); + if (ret) + goto out; + + /* Same thing for interrupts targeting a disabled group */ + ret = (int)(irqb->group ? info->vmcr.grpen1 : info->vmcr.grpen0); + ret -= (int)(irqa->group ? info->vmcr.grpen1 : info->vmcr.grpen0); + if (ret) + goto out; + penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active; pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active; @@ -320,10 +339,12 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_sort_info info = { .vcpu = vcpu, }; lockdep_assert_held(&vgic_cpu->ap_list_lock); - list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); + vgic_get_vmcr(vcpu, &info.vmcr); + list_sort(&info, &vgic_cpu->ap_list_head, vgic_irq_cmp); } /* From a69e2d6f8934bdb9d08a6740ca6c7a44525e2e95 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:16 +0000 Subject: [PATCH 204/260] KVM: arm64: Use MI to detect groups being enabled/disabled Add the maintenance interrupt to force an exit when the guest enables/disables individual groups, so that we can resort the ap_list accordingly. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-27-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 5 +++++ arch/arm64/kvm/vgic/vgic-v3.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 18856186be7b..9a2de03f74c3 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -39,6 +39,11 @@ void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, cpuif->vgic_hcr |= GICH_HCR_LRENPIE; if (irqs_outside_lrs(als)) cpuif->vgic_hcr |= GICH_HCR_UIE; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP0_MASK) ? + GICH_HCR_VGrp0DIE : GICH_HCR_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP1_MASK) ? + GICH_HCR_VGrp1DIE : GICH_HCR_VGrp1EIE; } static bool lr_signals_eoi_mi(u32 lr_val) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 780cc92c79e0..312226cc2565 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -39,6 +39,11 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, if (!als->nr_sgi) cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ? + ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ? 
+ ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; } static bool lr_signals_eoi_mi(u64 lr_val) From 3cfd59f81e0f3fbdf8a1b2f576bdc63ab6cc3277 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:17 +0000 Subject: [PATCH 205/260] KVM: arm64: GICv3: Handle LR overflow when EOImode==0 Now that we can identify interrupts that have not made it into the LRs, it becomes relatively easy to use EOIcount to walk the overflow list. What is a bit odd is that we compute a fake LR for the original state of the interrupt, clear the active bit, and feed it into the existing logic for processing. In a way, this is what would have happened if the interrupt was in an LR. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-28-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 46 +++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 312226cc2565..d4f27f451c8f 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -112,16 +112,62 @@ static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) vgic_put_irq(vcpu->kvm, irq); } +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + +static void vgic_v3_deactivate_phys(u32 intid) +{ + if (cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) + gic_insn(intid | FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, 1), CDDI); + else + gic_write_dir(intid); +} + void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr); + struct vgic_irq *irq; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); for (int lr = 0; lr < cpuif->used_lrs; lr++) vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]); + /* + * EOIMode=0: use EOIcount to emulate deactivation. We are + * guaranteed to deactivate in reverse order of the activation, so + * just pick one active interrupt after the other in the ap_list, + * and replay the deactivation as if the CPU was doing it. We also + * rely on priority drop to have taken place, and the list to be + * sorted by priority. + */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u64 lr; + + /* + * I would have loved to write this using a scoped_guard(), + * but using 'continue' here is a total train wreck. + */ + if (!eoicount) { + break; + } else { + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; + } + + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + + vgic_v3_fold_lr(vcpu, lr); + eoicount--; + } + cpuif->used_lrs = 0; } From cd4f6ee99b28f10692c2444c8dc0bab77357a25e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:18 +0000 Subject: [PATCH 206/260] KVM: arm64: GICv3: Handle deactivation via ICV_DIR_EL1 traps Deactivation via ICV_DIR_EL1 is both relatively straightforward (we have the interrupt that needs deactivation) and really awkward. The main issue is that the interrupt may either be in an LR on another CPU, or outside of any LR. In the former case, we process the deactivation as if it was a write to GICD_ICACTIVERn, which is already implemented as a big hammer IPI'ing all vcpus. In the latter case, we just perform a normal deactivation, similar to what we do for EOImode==0.
Another annoying aspect is that we need to tell the CPU owning the interrupt that its ap_list needs laundering. We use a brand new vcpu request to that effect. Note that this doesn't address deactivation via the GICV MMIO view, which will be taken care of in a later change. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-29-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 4 ++ arch/arm64/kvm/hyp/vgic-v3-sr.c | 3 ++ arch/arm64/kvm/sys_regs.c | 19 ++++++- arch/arm64/kvm/vgic/vgic-v3.c | 85 +++++++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic.c | 11 ++++ arch/arm64/kvm/vgic/vgic.h | 1 + include/kvm/arm_vgic.h | 1 + 8 files changed, 123 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 64302c438355..7501a2ee4dd4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -54,6 +54,7 @@ #define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9) #define KVM_REQ_MAP_L1_VNCR_EL2 KVM_ARCH_REQ(10) +#define KVM_REQ_VGIC_PROCESS_UPDATE KVM_ARCH_REQ(11) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 733195ef183e..fe13f9777f9c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1041,6 +1041,10 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) */ kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); + /* Process interrupts deactivated through a trap */ + if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu)) + kvm_vgic_process_async_update(vcpu); + if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) kvm_update_stolen_time(vcpu); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index cafbb41b4c33..f2f585455144 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -1247,6 +1247,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; + /* Full exit if required to handle overflow deactivation...
*/ + if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) + return 0; fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e67eb39ddc11..1b69d6e2d720 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -666,6 +666,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, +static bool access_gic_dir(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + + if (!p->is_write) + return undef_access(vcpu, p, r); + + vgic_v3_deactivate(vcpu, p->regval); + + return true; +} + static bool trap_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -3370,7 +3385,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, - { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, @@ -4495,7 +4510,7 @@ static const struct sys_reg_desc cp15_regs[] = { { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, - { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index d4f27f451c8f..d83edf02d072 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -12,6 +12,7 @@ #include #include +#include "vgic-mmio.h" #include "vgic.h" static bool group0_trap; @@ -171,6 +172,90 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->used_lrs = 0; } +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + struct kvm_vcpu *target_vcpu = NULL; + struct vgic_irq *irq; + unsigned long flags; + bool mmio = false; + u64 lr = 0; + + /* + * We only deal with DIR when EOIMode==1, and only for SGI, + * PPI or SPI. + */ + if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) || + val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* + * EOIMode=1: we must rely on traps to handle deactivation of + * overflowing interrupts, as there is no ordering guarantee and + * EOIcount isn't being incremented. Priority drop will have taken + * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs. + * + * Three possibilities: + * + * - The irq is not queued on any CPU, and there is nothing to + * do, + * + * - Or the irq is in an LR, meaning that its state is not + * directly observable. Treat it bluntly by making it as if + * this was a write to GICD_ICACTIVER, which will force an + * exit on all vcpus. If it hurts, don't do that.
+ * + * - Or the irq is active, but not in an LR, and we can + * directly deactivate it by building a pseudo-LR, fold it, + * and queue a request to prune the resulting ap_list, + */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; + + /* + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. + */ + if (irq->on_lr) { + mmio = true; + goto put; + } + + /* (with a Dalek voice) DEACTIVATE!!!! */ + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; + } + + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + + vgic_v3_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); +} + /* Requires the irq to be locked already */ static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index abe01c9c6b36..cbba6c2988d1 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -990,6 +990,17 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) vgic_prune_ap_list(vcpu); } +/* Sync interrupts that were deactivated through a DIR trap */ +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + vgic_prune_ap_list(vcpu); + local_irq_restore(flags); +} + static inline void vgic_restore_state(struct kvm_vcpu *vcpu) { if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 037efb620082..01ff6d4aa9da 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -318,6 +318,7 @@ static inline void vgic_get_irq_ref(struct vgic_irq *irq) void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val); void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index ec349c5a4a8b..b798546755a3 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -421,6 +421,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid); +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1); From 295b69216558367e5e833eb6d92ab2b476a8ad64 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:19 +0000 Subject: [PATCH 207/260] KVM: arm64: GICv3: Add GICv2 SGI handling to deactivation primitive The GICv2 SGIs require additional handling for deactivation, as they are effectively multiple interrupts muxed into one. Make sure we check for the source CPU when deactivating.
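For reference, the CPUID snapshot added below follows the ICC_DIR_EL1 encoding for GICv2 SGIs, where bits [12:10] of the written value carry the source CPU. A minimal sketch of the split (the helper name is made up):

static u8 dir_val_to_v2_sgi_source(u64 *val)
{
	/* bits [12:10] hold the originating vCPU for a GICv2 SGI... */
	u8 cpuid = FIELD_GET(GENMASK_ULL(12, 10), *val);

	/* ...and must be stripped to recover the plain INTID */
	*val &= ~GENMASK_ULL(12, 10);

	return cpuid;
}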
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-30-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index d83edf02d072..9fcee5121fe5 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -176,11 +176,20 @@ void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false, is_v2_sgi; struct vgic_irq *irq; unsigned long flags; - bool mmio = false; u64 lr = 0; + u8 cpuid; + + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); + + is_v2_sgi = (model == KVM_DEV_TYPE_ARM_VGIC_V2 && + val < VGIC_NR_SGIS); /* * We only deal with DIR when EOIMode==1, and only for SGI, @@ -216,6 +225,9 @@ void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) * - Or the irq is active, but not in an LR, and we can * directly deactivate it by building a pseudo-LR, fold it, * and queue a request to prune the resulting ap_list, + * + * Special care must be taken to match the source CPUID when + * deactivating a GICv2 SGI. */ scoped_guard(raw_spinlock, &irq->irq_lock) { target_vcpu = irq->vcpu; @@ -233,6 +245,12 @@ void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) goto put; } + /* GICv2 SGI: check that the cpuid matches */ + if (is_v2_sgi && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } + /* (with a Dalek voice) DEACTIVATE!!!! */ lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; } From 70fd60bdedc9ff4c4830a8b379fb65e6ba1e819f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:20 +0000 Subject: [PATCH 208/260] KVM: arm64: GICv3: Set ICH_HCR_EL2.TDIR when interrupts overflow LR capacity Now that we are ready to handle deactivation through ICV_DIR_EL1, set the trap bit if we have active interrupts outside of the LRs. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-31-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 9fcee5121fe5..09f86bf6fe7b 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -45,6 +45,13 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE; cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ? ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; + + /* + * Note that we set the trap irrespective of EOIMode, as that + * can change behind our back without any warning... + */ + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR; } static bool lr_signals_eoi_mi(u64 lr_val) From 1c3b3cadcd69f7415e8b3b1b1e81459e0e8c9f33 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:21 +0000 Subject: [PATCH 209/260] KVM: arm64: GICv3: Add SPI tracking to handle asymmetric deactivation SPIs are especially annoying, as they can be activated on a CPU and deactivated on another. Which means that when an SPI is in flight anywhere, all CPUs need to have their TDIR trap bit set.
This translates into broadcasting an IPI across all CPUs to make sure they set their trap bit. The number of in-flight SPIs is kept in an atomic variable so that CPUs can turn the trap bit off as soon as possible. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-32-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-init.c | 1 + arch/arm64/kvm/vgic/vgic-v3.c | 21 +++++++++++++++------ arch/arm64/kvm/vgic/vgic.c | 25 +++++++++++++++++++++++-- include/kvm/arm_vgic.h | 3 +++ 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 6d5e5d708f23..52de99c0f01c 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -188,6 +188,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; + dist->active_spis = (atomic_t)ATOMIC_INIT(0); dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!dist->spis) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 09f86bf6fe7b..55847fbad4d0 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -47,10 +47,17 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; /* + * Dealing with EOImode=1 is a massive source of headache. Not + * only do we need to track that we have active interrupts + * outside of the LRs and force DIR to be trapped, we also + * need to deal with SPIs that can be deactivated on another + * CPU. + * * Note that we set the trap irrespective of EOIMode, as that * can change behind our back without any warning... */ - if (irqs_active_outside_lrs(als)) + if (irqs_active_outside_lrs(als) || + atomic_read(&vcpu->kvm->arch.vgic.active_spis)) cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR; } @@ -78,11 +85,6 @@ static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) if (!irq) /* An LPI could have been unmapped. */ return; - /* Notify fds when the guest EOI'ed a level-triggered IRQ */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - scoped_guard(raw_spinlock, &irq->irq_lock) { /* Always preserve the active bit for !LPIs, note deactivation */ if (irq->intid >= VGIC_MIN_LPI) @@ -117,6 +119,13 @@ static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) irq->on_lr = false; } + /* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */ + if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) { + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis); + } + vgic_put_irq(vcpu->kvm, irq); } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index cbba6c2988d1..83969c18ef03 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -367,6 +367,17 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner) return false; } +static bool vgic_model_needs_bcst_kick(struct kvm *kvm) +{ + /* + * A GICv3 (or GICv3-like) system exposing a GICv3 to the + * guest needs a broadcast kick to set TDIR globally, even if + * the bit doesn't really exist (we still need to check for + * the shadow bit in the DIR emulation fast-path.
+	 */
+	return (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3);
+}
+
 /*
  * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
  * Do the queuing if necessary, taking the right locks in the right order.
@@ -379,6 +390,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
 			   unsigned long flags) __releases(&irq->irq_lock)
 {
 	struct kvm_vcpu *vcpu;
+	bool bcast;

 	lockdep_assert_held(&irq->irq_lock);

@@ -453,11 +465,20 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
 	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
 	irq->vcpu = vcpu;

+	/* A new SPI may result in deactivation trapping on all vcpus */
+	bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) &&
+		 vgic_valid_spi(vcpu->kvm, irq->intid) &&
+		 atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0);
+
 	raw_spin_unlock(&irq->irq_lock);
 	raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);

-	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-	kvm_vcpu_kick(vcpu);
+	if (!bcast) {
+		kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+		kvm_vcpu_kick(vcpu);
+	} else {
+		kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING);
+	}

 	return true;
 }
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index b798546755a3..6a4d3d205596 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -263,6 +263,9 @@ struct vgic_dist {
 	/* The GIC maintenance IRQ for nested hypervisors. */
 	u32			mi_intid;

+	/* Track the number of in-flight active SPIs */
+	atomic_t		active_spis;
+
 	/* base addresses in guest physical address space: */
 	gpa_t			vgic_dist_base;	/* distributor */
 	union {

From ca3c34da3644a24daf248be5dba72783c338dad4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:22 +0000
Subject: [PATCH 210/260] KVM: arm64: GICv3: Handle in-LR deactivation when
 possible

Even when we have either an LR overflow or SPIs in flight, it is
extremely likely that the interrupt being deactivated is still in the
LRs, and that going all the way back to the generic trap handling code
is a waste of time.

Instead, try and deactivate in place when possible, and only if this
fails, perform a full exit.
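As a rough illustration of the "deactivate in place" idea, here is a
standalone sketch with made-up LR constants (not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Illustrative LR layout, assumed for this sketch only. */
#define LR_ACTIVE   (1ULL << 63)
#define LR_VID_MASK 0xffffffffULL
#define NR_LRS      4

static uint64_t lrs[NR_LRS];

/* Returns 1 if the INTID was found active in an LR and deactivated
 * there (the trapped instruction can be skipped), 0 if a full exit
 * is needed so the generic code can handle it. */
static int try_deactivate_in_lrs(uint32_t vid)
{
	for (int i = 0; i < NR_LRS; i++) {
		if ((lrs[i] & LR_VID_MASK) == vid && (lrs[i] & LR_ACTIVE)) {
			lrs[i] &= ~LR_ACTIVE;
			return 1;	/* handled in place */
		}
	}
	return 0;			/* slow path */
}

int main(void)
{
	int first, second;

	lrs[1] = LR_ACTIVE | 42;
	first = try_deactivate_in_lrs(42);	/* 1: was in an LR */
	second = try_deactivate_in_lrs(42);	/* 0: already gone */
	printf("%d %d\n", first, second);
	return 0;
}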
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-33-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 38 ++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index f2f585455144..71199e1a9294 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -792,7 +792,7 @@ static void __vgic_v3_bump_eoicount(void) write_gicreg(hcr, ICH_HCR_EL2); } -static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; @@ -800,19 +800,25 @@ static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* EOImode == 0, nothing to be done here */ if (!(vmcr & ICH_VMCR_EOIM_MASK)) - return; + return 1; /* No deactivate to be performed on an LPI */ if (vid >= VGIC_MIN_LPI) - return; + return 1; lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; + if (lr != -1) { + __vgic_v3_clear_active_lr(lr, lr_val); + return 1; } - __vgic_v3_clear_active_lr(lr, lr_val); + return 0; +} + +static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + if (!___vgic_v3_write_dir(vcpu, vmcr, rt)) + __vgic_v3_bump_eoicount(); } static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) @@ -1247,9 +1253,21 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; - /* Full exit if required to handle overflow deactivation... */ - if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) - return 0; + /* + * Full exit if required to handle overflow deactivation, + * unless we can emulate it in the LRs (likely the majority + * of the cases). + */ + if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) { + int ret; + + ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(), + kvm_vcpu_sys_get_rt(vcpu)); + if (ret) + __kvm_skip_instr(vcpu); + + return ret; + } fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: From 84792050e0392fbc1f285f9d9a0266b8480f6f06 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:23 +0000 Subject: [PATCH 211/260] KVM: arm64: GICv3: Avoid broadcast kick on CPUs lacking TDIR CPUs lacking TDIR always trap ICV_DIR_EL1, no matter what, since we have ICH_HCR_EL2.TC set permanently. For these CPUs, it is useless to use a broadcast kick on SPI injection, as the sole purpose of this is to set TDIR. We can therefore skip this on these CPUs, which are challenged enough not to be burdened by extra IPIs. As a consequence, permanently set the TDIR bit in the shadow state to notify the fast-path emulation code of the exit reason. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-34-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 6 +++++- arch/arm64/kvm/vgic/vgic.c | 13 ++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 55847fbad4d0..968aa9d89be6 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -53,10 +53,14 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, * need to deal with SPIs that can be deactivated on another * CPU. 
 *
+	 * On systems that do not implement TDIR, force the bit in the
+	 * shadow state anyway to avoid IPI-ing on these poor sods.
+	 *
 	 * Note that we set the trap irrespective of EOIMode, as that
 	 * can change behind our back without any warning...
 	 */
-	if (irqs_active_outside_lrs(als) ||
+	if (!cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) ||
+	    irqs_active_outside_lrs(als) ||
 	    atomic_read(&vcpu->kvm->arch.vgic.active_spis))
 		cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR;
 }
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 83969c18ef03..693ec005c996 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -370,12 +370,15 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne
 static bool vgic_model_needs_bcst_kick(struct kvm *kvm)
 {
 	/*
-	 * A GICv3 (or GICv3-like) system exposing a GICv3 to the
-	 * guest needs a broadcast kick to set TDIR globally, even if
-	 * the bit doesn't really exist (we still need to check for
-	 * the shadow bit in the DIR emulation fast-path).
+	 * A GICv3 (or GICv3-like) system exposing a GICv3 to the guest
+	 * needs a broadcast kick to set TDIR globally.
+	 *
+	 * For systems that do not have TDIR (ARM's own v8.0 CPUs), the
+	 * shadow TDIR bit is always set, and so is the register's TC bit,
+	 * so no need to kick the CPUs.
 	 */
-	return (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3);
+	return (cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) &&
+		kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3);
 }

From eb33ffa2bd3f1842d2960aff7484869fc64aa2fb Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:24 +0000
Subject: [PATCH 212/260] KVM: arm64: GICv3: nv: Resync LRs/VMCR/HCR early for
 better MI emulation

The current approach to nested GICv3 support is to not do anything
while L2 is running, wait for a transition from L2 to L1 to resync LRs,
VMCR and HCR, and only then evaluate the state to decide whether to
generate a maintenance interrupt.

This doesn't provide a good quality of emulation, and it would be far
preferable to find out early that we need to perform a switch.

Move the LRs/VMCR and HCR resync into vgic_v3_sync_nested(), so that we
have most of the state available. As we are turning the vgic off at
this stage to avoid a screaming host MI, add a new helper
vgic_v3_flush_nested() that switches the vgic on again. The MI can then
be directly injected as required.
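A minimal sketch of the LR state propagation this resync performs,
assuming only that a STATE field covers the pending/active bits (the
constant below is a stand-in, not the architecture header's):

#include <stdint.h>
#include <stdio.h>

/* Assumed state field for the sketch (pending+active bits of an LR). */
#define LR_STATE (3ULL << 62)

/* Keep everything L1 programmed into the LR, but adopt the state bits
 * the HW left behind after running L2. */
static uint64_t resync_lr(uint64_t l1_lr, uint64_t host_lr)
{
	return (l1_lr & ~LR_STATE) | (host_lr & LR_STATE);
}

int main(void)
{
	uint64_t l1 = (2ULL << 62) | 0x1badf00d;	/* active + payload */
	uint64_t hw = 0x1badf00d;			/* HW deactivated it */

	printf("%llx\n", (unsigned long long)resync_lr(l1, hw));
	return 0;
}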
Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-35-maz@kernel.org
Signed-off-by: Oliver Upton
---
 arch/arm64/include/asm/kvm_hyp.h     |  1 +
 arch/arm64/kvm/hyp/vgic-v3-sr.c      |  2 +-
 arch/arm64/kvm/vgic/vgic-v3-nested.c | 69 ++++++++++++++++------------
 arch/arm64/kvm/vgic/vgic.c           |  6 ++-
 arch/arm64/kvm/vgic/vgic.h           |  1 +
 5 files changed, 46 insertions(+), 33 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index dbf16a9f6772..76ce2b94bd97 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -77,6 +77,7 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
 int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);

 u64 __gic_v3_get_lr(unsigned int lr);
+void __gic_v3_set_lr(u64 val, int lr);

 void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if);
 void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if);
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 71199e1a9294..99342c13e179 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -60,7 +60,7 @@ u64 __gic_v3_get_lr(unsigned int lr)
 	unreachable();
 }

-static void __gic_v3_set_lr(u64 val, int lr)
+void __gic_v3_set_lr(u64 val, int lr)
 {
 	switch (lr & 0xf) {
 	case 0:
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index 1531e4907c65..40f7a37e0685 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -70,13 +70,14 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
 * - on L2 put: perform the inverse transformation, so that the result of L2
 *   running becomes visible to L1 in the VNCR-accessible registers.
 *
- * - there is nothing to do on L2 entry, as everything will have happened
- *   on load. However, this is the point where we detect that an interrupt
- *   targeting L1 and prepare the grand switcheroo.
+ * - there is nothing to do on L2 entry apart from enabling the vgic, as
+ *   everything will have happened on load. However, this is the point where
+ *   we detect that an interrupt targets L1 and prepare the grand
+ *   switcheroo.
 *
- * - on L2 exit: emulate the HW bit, and deactivate corresponding the L1
- *   interrupt. The L0 active state will be cleared by the HW if the L1
- *   interrupt was itself backed by a HW interrupt.
+ * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate
+ *   the corresponding L1 interrupt. The L0 active state will be cleared by
+ *   the HW if the L1 interrupt was itself backed by a HW interrupt.
* * Maintenance Interrupt (MI) management: * @@ -265,15 +266,30 @@ static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); } +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2); +} + void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); int i; for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { - u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + u64 val, host_lr, lr; struct vgic_irq *irq; + host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); + + /* Propagate the new LR state */ + lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + val = lr & ~ICH_LR_STATE; + val |= host_lr & ICH_LR_STATE; + __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); + if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) continue; @@ -286,12 +302,21 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ continue; - lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); - if (!(lr & ICH_LR_STATE)) + if (!(host_lr & ICH_LR_STATE)) irq->active = false; vgic_put_irq(vcpu->kvm, irq); } + + /* We need these to be synchronised to generate the MI */ + __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2)); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount); + + write_sysreg_s(0, SYS_ICH_HCR_EL2); + isb(); + + vgic_v3_nested_update_mi(vcpu); } static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, @@ -324,7 +349,8 @@ void vgic_v3_load_nested(struct kvm_vcpu *vcpu) __vgic_v3_restore_vmcr_aprs(cpu_if); __vgic_v3_activate_traps(cpu_if); - __vgic_v3_restore_state(cpu_if); + for (int i = 0; i < cpu_if->used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); /* * Propagate the number of used LRs for the benefit of the HYP @@ -337,36 +363,19 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif; - u64 val; int i; __vgic_v3_save_aprs(s_cpu_if); - __vgic_v3_deactivate_traps(s_cpu_if); - __vgic_v3_save_state(s_cpu_if); - - /* - * Translate the shadow state HW fields back to the virtual ones - * before copying the shadow struct back to the nested one. 
- */ - val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); - val &= ~ICH_HCR_EL2_EOIcount_MASK; - val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK); - __vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val); - __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr); for (i = 0; i < 4; i++) { __vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]); __vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]); } - for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { - val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + for (i = 0; i < s_cpu_if->used_lrs; i++) + __gic_v3_set_lr(0, i); - val &= ~ICH_LR_STATE; - val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE; - - __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - } + __vgic_v3_deactivate_traps(s_cpu_if); vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 693ec005c996..892595fdbbff 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -1049,8 +1049,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) * abort the entry procedure and inject the exception at the * beginning of the run loop. * - * - Otherwise, do exactly *NOTHING*. The guest state is - * already loaded, and we can carry on with running it. + * - Otherwise, do exactly *NOTHING* apart from enabling the virtual + * CPU interface. The guest state is already loaded, and we can + * carry on with running it. * * If we have NV, but are not in a nested state, compute the * maintenance interrupt state, as it may fire. @@ -1059,6 +1060,7 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (kvm_vgic_vcpu_pending_irq(vcpu)) kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu); + vgic_v3_flush_nested(vcpu); return; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 01ff6d4aa9da..e93bdb485f07 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -445,6 +445,7 @@ static inline bool kvm_has_gicv3(struct kvm *kvm) return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu); void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); void vgic_v3_load_nested(struct kvm_vcpu *vcpu); void vgic_v3_put_nested(struct kvm_vcpu *vcpu); From 6dd333c8942b2e5bb5927af843b56ec2857db7c7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:25 +0000 Subject: [PATCH 213/260] KVM: arm64: GICv3: nv: Plug L1 LR sync into deactivation primitive Pretty much like the rest of the LR handling, deactivation of an L2 interrupt gets reflected in the L1 LRs, and therefore must be propagated into the L1 shadow state if the interrupt is HW-bound. Instead of directly handling the active state (which looks a bit off as it ignores locking and L1->L0 HW propagation), use the new deactivation primitive to perform the deactivation and deal with the required maintenance. 
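The three-way HW-deactivation condition can be illustrated with a
standalone predicate; the bit positions below are assumed for the
sketch and are not taken from the architecture headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LR_HW    (1ULL << 61)	/* sketch-only bit positions */
#define LR_STATE (3ULL << 62)

/* A HW-backed LR that was pending and/or active before the run and is
 * now invalid means L2 deactivated it, so the L1 interrupt must be
 * deactivated as well. */
static bool hw_deactivated(uint64_t before, uint64_t after)
{
	return (before & LR_HW) && (before & LR_STATE) &&
	       !(after & LR_STATE);
}

int main(void)
{
	uint64_t b = LR_HW | (2ULL << 62);	/* HW + active */

	printf("%d %d\n", hw_deactivated(b, 0), hw_deactivated(b, b));
	return 0;
}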
Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-36-maz@kernel.org
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/vgic/vgic-v3-nested.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index 40f7a37e0685..15e7033a7937 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -280,7 +280,6 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)

 	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
 		u64 val, host_lr, lr;
-		struct vgic_irq *irq;

 		host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));

@@ -290,7 +289,14 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
 		val |= host_lr & ICH_LR_STATE;
 		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);

-		if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
+		/*
+		 * Deactivation of a HW interrupt: the LR must have the HW
+		 * bit set, have been in a non-invalid state before the run,
+		 * and now be in an invalid state. If any of that doesn't
+		 * hold, we're done with this LR.
+		 */
+		if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) &&
+		      !(host_lr & ICH_LR_STATE)))
 			continue;

 		/*
@@ -298,14 +304,7 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
 		 * need to emulate the HW effect between the guest hypervisor
 		 * and the nested guest.
 		 */
-		irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
-		if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
-			continue;
-
-		if (!(host_lr & ICH_LR_STATE))
-			irq->active = false;
-
-		vgic_put_irq(vcpu->kvm, irq);
+		vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
 	}

 	/* We need these to be synchronised to generate the MI */

From 78ffc28456f5981f0e54007fe124e20610abd0ea Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:26 +0000
Subject: [PATCH 214/260] KVM: arm64: GICv3: Force exit to sync ICH_HCR_EL2.En

FEAT_NV2 is pretty terrible for anything that tries to enforce
immediate effects, and writing to ICH_HCR_EL2 in the hope of disabling
a maintenance interrupt is in vain. This only hits memory, and the
guest hasn't cleared anything -- the MI will fire.

For example, running the vgic_irq test under NV results in about 800
maintenance interrupts being actually handled by the L1 guest, when
none were expected.

As a cheap workaround, read back ICH_MISR_EL2 after writing 0 to
ICH_HCR_EL2. This is very cheap on real HW, and causes a trap to the
host in NV, giving it the opportunity to retire the pending MI. With
this, the above test runs to completion without any MI being actually
handled.

Yes, this is really poor...

Tested-by: Fuad Tabba
Reviewed-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-37-maz@kernel.org
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/hyp/vgic-v3-sr.c      | 7 +++++++
 arch/arm64/kvm/vgic/vgic-v3-nested.c | 6 ++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 99342c13e179..0b670a033fd8 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -244,6 +244,13 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
 	}

 	write_gicreg(0, ICH_HCR_EL2);
+
+	/*
+	 * Hack alert: On NV, this results in a trap so that the above write
+	 * actually takes effect... No synchronisation is necessary, as we
+	 * only care about the effects when this traps.
+ */ + read_gicreg(ICH_MISR_EL2); } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 15e7033a7937..61b44f3f2bf1 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -94,8 +94,10 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) * * - because most of the ICH_*_EL2 registers live in the VNCR page, the * quality of emulation is poor: L1 can setup the vgic so that an MI would - * immediately fire, and not observe anything until the next exit. Trying - * to read ICH_MISR_EL2 would do the trick, for example. + * immediately fire, and not observe anything until the next exit. + * Similarly, a pending MI is not immediately disabled by clearing + * ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for + * example. * * System register emulation: * From 281c6c06e2a7bc331cbe02ad21f1390820d28d59 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:27 +0000 Subject: [PATCH 215/260] KVM: arm64: GICv2: Handle LR overflow when EOImode==0 Similarly to the GICv3 version, handle the EOIcount-driven deactivation by walking the overflow list. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-38-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 9a2de03f74c3..bbd4d003fde8 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -100,6 +100,8 @@ static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val) vgic_put_irq(vcpu->kvm, irq); } +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + /* * transfer the content of the LRs back into the corresponding ap_list: * - active bit is transferred as is @@ -111,12 +113,37 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr); + struct vgic_irq *irq; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]); + /* See the GICv3 equivalent for the EOIcount handling rationale */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u32 lr; + + if (!eoicount) { + break; + } else { + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + vgic_v2_fold_lr(vcpu, lr); + eoicount--; + } + cpuif->used_lrs = 0; } From 255de897e7fb918a34845167c572b5bf8e1d9d79 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:28 +0000 Subject: [PATCH 216/260] KVM: arm64: GICv2: Handle deactivation via GICV_DIR traps Add the plumbing of GICv2 interrupt deactivation via GICV_DIR. This requires adding a new device so that we can easily decode the DIR address. The deactivation itself is very similar to the GICv3 version. 
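For reference, a small standalone sketch of the GICC_DIR decode this
relies on, restating the GICv2 layout (INTID in bits [9:0], source
CPUID in bits [12:10]) with invented helper names:

#include <stdint.h>
#include <stdio.h>

#define DIR_CPUID_SHIFT	10
#define DIR_CPUID_MASK	(0x7u << DIR_CPUID_SHIFT)
#define NR_SGIS		16

struct dir_write {
	uint32_t intid;
	uint8_t  cpuid;		/* only meaningful for SGIs */
};

static struct dir_write decode_dir(uint32_t val)
{
	struct dir_write d = {
		.cpuid = (val & DIR_CPUID_MASK) >> DIR_CPUID_SHIFT,
		.intid = val & ~DIR_CPUID_MASK,
	};
	return d;
}

int main(void)
{
	/* SGI7 deactivated on behalf of source CPU3 */
	struct dir_write d = decode_dir((3u << 10) | 7);

	printf("intid=%u cpuid=%u sgi=%d\n",
	       d.intid, d.cpuid, d.intid < NR_SGIS);
	return 0;
}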
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-39-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-mmio-v2.c | 24 +++++++++ arch/arm64/kvm/vgic/vgic-mmio.h | 1 + arch/arm64/kvm/vgic/vgic-v2.c | 85 ++++++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic.h | 1 + include/kvm/arm_vgic.h | 1 + 5 files changed, 112 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index f25fccb1f8e6..406845b3117c 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -359,6 +359,16 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, vgic_set_vmcr(vcpu, &vmcr); } +static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_deactivate(vcpu, val); + else + vgic_v3_deactivate(vcpu, val); +} + static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -482,6 +492,10 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = { REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE, + vgic_mmio_read_raz, vgic_mmio_write_dir, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, + 4, VGIC_ACCESS_32bit), }; unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) @@ -494,6 +508,16 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) return SZ_4K; } +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v2_cpu_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return KVM_VGIC_V2_CPU_SIZE; +} + int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) { const struct vgic_register_region *region; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h index 5b490a4dfa5e..50dc80220b0f 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.h +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -213,6 +213,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, const u32 val); unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index bbd4d003fde8..bc52d44a573d 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -9,6 +9,7 @@ #include #include +#include "vgic-mmio.h" #include "vgic.h" static inline void vgic_v2_write_lr(int lr, u32 val) @@ -147,6 +148,79 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->used_lrs = 0; } +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false; + struct vgic_irq *irq; + unsigned long flags; + u64 lr = 0; + u8 cpuid; + + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); + + /* We only deal with DIR when EOIMode==1 */ + if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK)) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = 
vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* See the corresponding v3 code for the rationale */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; + + /* + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. + */ + if (irq->on_lr) { + mmio = true; + goto put; + } + + /* SGI: check that the cpuid matches */ + if (val < VGIC_NR_SGIS && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } + + /* (with a Dalek voice) DEACTIVATE!!!! */ + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + + vgic_v2_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); +} + static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 val = irq->intid; @@ -346,6 +420,7 @@ static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) int vgic_v2_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned int len; int ret = 0; if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || @@ -369,6 +444,16 @@ int vgic_v2_map_resources(struct kvm *kvm) return ret; } + len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev); + dist->cpuif_iodev.base_addr = dist->vgic_cpu_base; + dist->cpuif_iodev.iodev_type = IODEV_CPUIF; + dist->cpuif_iodev.redist_vcpu = NULL; + + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base, + len, &dist->cpuif_iodev.dev); + if (ret) + return ret; + if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, kvm_vgic_global_state.vcpu_base, diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index e93bdb485f07..5f0fc96b4dc2 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -277,6 +277,7 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 6a4d3d205596..b261fb3968d0 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -287,6 +287,7 @@ struct vgic_dist { struct vgic_irq *spis; struct vgic_io_device dist_iodev; + struct vgic_io_device cpuif_iodev; bool has_its; bool table_write_in_progress; From 07bb1c5622a54e2fd3f5c5a86969a2e7ad7f7376 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:29 +0000 Subject: [PATCH 217/260] KVM: arm64: GICv2: Always trap GICV_DIR register Since we can't decide to trap the DIR register on a per-vcpu basis, always trap the second page of the GIC CPU interface. Yes, this is costly. On the bright side, no sane SW should use EOImode==1 on GICv2... 
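A compilable sketch of the resulting page-split decision, assuming a
two-page GICv2 CPU interface with GICC_DIR in the second page (the
constants are restated for the sketch):

#include <stdint.h>
#include <stdio.h>

#define SZ_4K              0x1000ULL
#define GICV2_CPUIF_SIZE   (2 * SZ_4K)	/* assumed KVM_VGIC_V2_CPU_SIZE */
#define GIC_CPU_DEACTIVATE 0x1000ULL	/* GICC_DIR lives in page two */

/* Faults in the first page are handled in place by the proxy; faults
 * in the second (DIR) page force a normal exit to the host. */
static int needs_full_exit(uint64_t fault_ipa, uint64_t cpuif_base)
{
	return (fault_ipa - cpuif_base) >= GIC_CPU_DEACTIVATE;
}

int main(void)
{
	uint64_t base = 0x8010000;

	/* Only GICV2_CPUIF_SIZE - SZ_4K bytes stay mapped through to the
	 * real GICV, matching the map_resources change. */
	printf("%d %d\n", needs_full_exit(base + 0x10, base),
	       needs_full_exit(base + SZ_4K, base));
	return 0;
}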
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-40-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 4 ++++ arch/arm64/kvm/vgic/vgic-v2.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 78579b31a420..5fd99763b54d 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -63,6 +63,10 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) return -1; } + /* Handle deactivation as a normal exit */ + if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE) + return 0; + rd = kvm_vcpu_dabt_get_rd(vcpu); addr = kvm_vgic_global_state.vcpu_hyp_va; addr += fault_ipa - vgic->vgic_cpu_base; diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index bc52d44a573d..585491fbda80 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -457,7 +457,7 @@ int vgic_v2_map_resources(struct kvm *kvm) if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, kvm_vgic_global_state.vcpu_base, - KVM_VGIC_V2_CPU_SIZE, true); + KVM_VGIC_V2_CPU_SIZE - SZ_4K, true); if (ret) { kvm_err("Unable to remap VGIC CPU to VCPU\n"); return ret; From a1650de7c160aace941d27d39b60c38f6f795aa1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:30 +0000 Subject: [PATCH 218/260] KVM: arm64: selftests: gic_v3: Add irq group setting helper Being able to set the group of an interrupt is pretty useful. Add such a helper. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-41-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/include/arm64/gic.h | 1 + tools/testing/selftests/kvm/lib/arm64/gic.c | 6 ++++++ .../testing/selftests/kvm/lib/arm64/gic_private.h | 1 + tools/testing/selftests/kvm/lib/arm64/gic_v3.c | 15 +++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/tools/testing/selftests/kvm/include/arm64/gic.h b/tools/testing/selftests/kvm/include/arm64/gic.h index baeb3c859389..cc7a7f34ed37 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic.h +++ b/tools/testing/selftests/kvm/include/arm64/gic.h @@ -57,6 +57,7 @@ void gic_irq_set_pending(unsigned int intid); void gic_irq_clear_pending(unsigned int intid); bool gic_irq_get_pending(unsigned int intid); void gic_irq_set_config(unsigned int intid, bool is_edge); +void gic_irq_set_group(unsigned int intid, bool group); void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, vm_paddr_t pend_table); diff --git a/tools/testing/selftests/kvm/lib/arm64/gic.c b/tools/testing/selftests/kvm/lib/arm64/gic.c index 7abbf8866512..b023868fe0b8 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic.c @@ -155,3 +155,9 @@ void gic_irq_set_config(unsigned int intid, bool is_edge) GUEST_ASSERT(gic_common_ops); gic_common_ops->gic_irq_set_config(intid, is_edge); } + +void gic_irq_set_group(unsigned int intid, bool group) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_group(intid, group); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_private.h b/tools/testing/selftests/kvm/lib/arm64/gic_private.h index d24e9ecc96c6..b6a7e30c3eb1 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_private.h +++ 
b/tools/testing/selftests/kvm/lib/arm64/gic_private.h
@@ -25,6 +25,7 @@ struct gic_common_ops {
 	void (*gic_irq_clear_pending)(uint32_t intid);
 	bool (*gic_irq_get_pending)(uint32_t intid);
 	void (*gic_irq_set_config)(uint32_t intid, bool is_edge);
+	void (*gic_irq_set_group)(uint32_t intid, bool group);
 };

 extern const struct gic_common_ops gicv3_ops;
diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c
index 66d05506f78b..3e4e1a6a4f7c 100644
--- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c
+++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c
@@ -293,6 +293,20 @@ static void gicv3_enable_redist(volatile void *redist_base)
 	}
 }

+static void gicv3_set_group(uint32_t intid, bool grp)
+{
+	uint32_t cpu_or_dist;
+	uint32_t val;
+
+	cpu_or_dist = (get_intid_range(intid) == SPI_RANGE) ? DIST_BIT : guest_get_vcpuid();
+	val = gicv3_reg_readl(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4);
+	if (grp)
+		val |= BIT(intid % 32);
+	else
+		val &= ~BIT(intid % 32);
+	gicv3_reg_writel(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4, val);
+}
+
 static void gicv3_cpu_init(unsigned int cpu)
 {
 	volatile void *sgi_base;
@@ -400,6 +414,7 @@ const struct gic_common_ops gicv3_ops = {
 	.gic_irq_clear_pending = gicv3_irq_clear_pending,
 	.gic_irq_get_pending = gicv3_irq_get_pending,
 	.gic_irq_set_config = gicv3_irq_set_config,
+	.gic_irq_set_group = gicv3_set_group,
 };

 void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size,

From 2366295c76c2e09b969b4a5a0829d750bb1ab062 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:31 +0000
Subject: [PATCH 219/260] KVM: arm64: selftests: gic_v3: Disable Group-0
 interrupts by default

Make sure G0 is disabled at the point of initialising the GIC.

Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-42-maz@kernel.org
Signed-off-by: Oliver Upton
---
 tools/testing/selftests/kvm/lib/arm64/gic_v3.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c
index 3e4e1a6a4f7c..5b0fd95c6b48 100644
--- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c
+++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c
@@ -342,6 +342,8 @@ static void gicv3_cpu_init(unsigned int cpu)
 	/* Set a default priority threshold */
 	write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1);

+	/* Disable Group-0 interrupts */
+	write_sysreg_s(0, SYS_ICC_IGRPEN0_EL1);
 	/* Enable non-secure Group-1 interrupts */
 	write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1);
 }

From 27392612c8823f4b65240949eb0dc77de946285d Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:32 +0000
Subject: [PATCH 220/260] KVM: arm64: selftests: vgic_irq: Fix
 GUEST_ASSERT_IAR_EMPTY() helper

No, 0 is not a spurious INTID. Never been, never was.
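A tiny standalone illustration of the invariant the fixed assert relies
on (1023 is the GICv3 spurious INTID; 0 is just SGI0):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IAR_SPURIOUS 1023u

/* Only the special INTID signals "nothing to acknowledge"; INTID 0 is
 * a perfectly valid SGI and must not be treated as empty. */
static bool iar_is_empty(uint32_t intid)
{
	return intid == IAR_SPURIOUS;
}

int main(void)
{
	printf("%d %d\n", iar_is_empty(0), iar_is_empty(IAR_SPURIOUS));
	return 0;
}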
Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-43-maz@kernel.org
Signed-off-by: Oliver Upton
---
 tools/testing/selftests/kvm/arm64/vgic_irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c
index 6338f5bbdb70..a77562b2976a 100644
--- a/tools/testing/selftests/kvm/arm64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c
@@ -205,7 +205,7 @@ static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid,
 	do { 									\
 		uint32_t _intid;						\
 		_intid = gic_get_and_ack_irq();					\
-		GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS);		\
+		GUEST_ASSERT(_intid == IAR_SPURIOUS);				\
 	} while (0)

 #define CAT_HELPER(a, b) a ## b

From 8b7888c5114d280b071f341c072775ee222178b1 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:33 +0000
Subject: [PATCH 221/260] KVM: arm64: selftests: vgic_irq: Change configuration
 before enabling interrupt

The architecture is pretty clear that changing the configuration of an
enabled interrupt is not OK. It doesn't really matter here, but doing
the right thing is not more expensive.

Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-44-maz@kernel.org
Signed-off-by: Oliver Upton
---
 tools/testing/selftests/kvm/arm64/vgic_irq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c
index a77562b2976a..a8919ef3cea2 100644
--- a/tools/testing/selftests/kvm/arm64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c
@@ -473,12 +473,12 @@ static void guest_code(struct test_args *args)

 	gic_init(GIC_V3, 1);

-	for (i = 0; i < nr_irqs; i++)
-		gic_irq_enable(i);
-
 	for (i = MIN_SPI; i < nr_irqs; i++)
 		gic_irq_set_config(i, !level_sensitive);

+	for (i = 0; i < nr_irqs; i++)
+		gic_irq_enable(i);
+
 	gic_set_eoi_split(args->eoi_split);

 	reset_priorities(args);

From 5053c2ab92a1e7cbfd3705be2f4371bf843aad2c Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:34 +0000
Subject: [PATCH 222/260] KVM: arm64: selftests: vgic_irq: Exclude
 timer-controlled interrupts

The PPI injection API is clear that you can't inject the timer PPIs
from userspace, since they are controlled by the timers themselves.

Add an exclusion list for this purpose.

Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-45-maz@kernel.org
Signed-off-by: Oliver Upton
---
 tools/testing/selftests/kvm/arm64/vgic_irq.c | 31 ++++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c
index a8919ef3cea2..b0415bdb8952 100644
--- a/tools/testing/selftests/kvm/arm64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c
@@ -359,8 +359,9 @@ static uint32_t wait_for_and_activate_irq(void)
 * interrupts for the whole test.
*/ static void test_inject_preemption(struct test_args *args, - uint32_t first_intid, int num, - kvm_inject_cmd cmd) + uint32_t first_intid, int num, + const unsigned long *exclude, + kvm_inject_cmd cmd) { uint32_t intid, prio, step = KVM_PRIO_STEPS; int i; @@ -379,6 +380,10 @@ static void test_inject_preemption(struct test_args *args, for (i = 0; i < num; i++) { uint32_t tmp; intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + KVM_INJECT(cmd, intid); /* Each successive IRQ will preempt the previous one. */ tmp = wait_for_and_activate_irq(); @@ -390,6 +395,10 @@ static void test_inject_preemption(struct test_args *args, /* finish handling the IRQs starting with the highest priority one. */ for (i = 0; i < num; i++) { intid = num - i - 1 + first_intid; + + if (exclude && test_bit(intid - first_intid, exclude)) + continue; + gic_set_eoi(intid); if (args->eoi_split) gic_set_dir(intid); @@ -397,8 +406,12 @@ static void test_inject_preemption(struct test_args *args, local_irq_enable(); - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { + if (exclude && test_bit(i, exclude)) + continue; + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + } GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); GUEST_ASSERT_IAR_EMPTY(); @@ -442,14 +455,20 @@ static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) * number of concurrently active IRQs. The number of LRs implemented is * IMPLEMENTATION DEFINED, however, it seems that most implement 4. */ + /* Timer PPIs cannot be injected from userspace */ + static const unsigned long ppi_exclude = (BIT(27 - MIN_PPI) | + BIT(30 - MIN_PPI) | + BIT(28 - MIN_PPI) | + BIT(26 - MIN_PPI)); + if (f->sgi) - test_inject_preemption(args, MIN_SGI, 4, f->cmd); + test_inject_preemption(args, MIN_SGI, 4, NULL, f->cmd); if (f->ppi) - test_inject_preemption(args, MIN_PPI, 4, f->cmd); + test_inject_preemption(args, MIN_PPI, 4, &ppi_exclude, f->cmd); if (f->spi) - test_inject_preemption(args, MIN_SPI, 4, f->cmd); + test_inject_preemption(args, MIN_SPI, 4, NULL, f->cmd); } static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) From fd5fa1c8d09a77c0986158af5b522f6d35830329 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:35 +0000 Subject: [PATCH 223/260] KVM: arm64: selftests: vgic_irq: Remove LR-bound limitation Good news: our GIC emulation is not completely broken, and we can activate as many interrupts as we want. Bump the test to cover all the SGIs, all the allowed PPIs, and 31 SPIs. Yes, 31, because we have 31 available priorities, and the test is not happy with having two interrupts with the same priority. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-46-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index b0415bdb8952..9d4761f1a320 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -449,12 +449,6 @@ static void test_injection_failure(struct test_args *args, static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) { - /* - * Test up to 4 levels of preemption. The reason is that KVM doesn't - * currently implement the ability to have more than the number-of-LRs - * number of concurrently active IRQs. 
The number of LRs implemented is
-	 * IMPLEMENTATION DEFINED, however, it seems that most implement 4.
-	 */
 	/* Timer PPIs cannot be injected from userspace */
 	static const unsigned long ppi_exclude = (BIT(27 - MIN_PPI) |
 						  BIT(30 - MIN_PPI) |
@@ -462,26 +456,25 @@ static void test_preemption(struct test_args *args, struct kvm_inject_desc *f)
 						  BIT(26 - MIN_PPI));

 	if (f->sgi)
-		test_inject_preemption(args, MIN_SGI, 4, NULL, f->cmd);
+		test_inject_preemption(args, MIN_SGI, 16, NULL, f->cmd);

 	if (f->ppi)
-		test_inject_preemption(args, MIN_PPI, 4, &ppi_exclude, f->cmd);
+		test_inject_preemption(args, MIN_PPI, 16, &ppi_exclude, f->cmd);

 	if (f->spi)
-		test_inject_preemption(args, MIN_SPI, 4, NULL, f->cmd);
+		test_inject_preemption(args, MIN_SPI, 31, NULL, f->cmd);
 }

 static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f)
 {
-	/* Test up to 4 active IRQs. Same reason as in test_preemption. */
 	if (f->sgi)
-		guest_restore_active(args, MIN_SGI, 4, f->cmd);
+		guest_restore_active(args, MIN_SGI, 16, f->cmd);

 	if (f->ppi)
-		guest_restore_active(args, MIN_PPI, 4, f->cmd);
+		guest_restore_active(args, MIN_PPI, 16, f->cmd);

 	if (f->spi)
-		guest_restore_active(args, MIN_SPI, 4, f->cmd);
+		guest_restore_active(args, MIN_SPI, 31, f->cmd);
 }

 static void guest_code(struct test_args *args)

From b6c68612ab4171e07a7c2ba8864b967207fc3add Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:36 +0000
Subject: [PATCH 224/260] KVM: arm64: selftests: vgic_irq: Perform EOImode==1
 deactivation in ack order

When EOImode==1, perform the deactivation in the order of activation,
just to make things a bit worse for KVM.

Yes, I'm nasty.

Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-47-maz@kernel.org
Signed-off-by: Oliver Upton
---
 tools/testing/selftests/kvm/arm64/vgic_irq.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c
index 9d4761f1a320..72f7bb0d201e 100644
--- a/tools/testing/selftests/kvm/arm64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c
@@ -400,8 +400,18 @@ static void test_inject_preemption(struct test_args *args,
 			continue;

 		gic_set_eoi(intid);
-		if (args->eoi_split)
-			gic_set_dir(intid);
+	}
+
+	if (args->eoi_split) {
+		for (i = 0; i < num; i++) {
+			intid = i + first_intid;
+
+			if (exclude && test_bit(i, exclude))
+				continue;
+
+			if (args->eoi_split)
+				gic_set_dir(intid);
+		}
 	}

 	local_irq_enable();

From d2dee2e849834564293ec9c33165df56dd441399 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:37 +0000
Subject: [PATCH 225/260] KVM: arm64: selftests: vgic_irq: Add asymmetric SPI
 deactivation test

Add a new test case that makes an interrupt pending on a vcpu,
activates it, does the priority drop, and then gets *another* vcpu to
do the deactivation.

Special care is taken not to trigger an exit in the process, so that we
are sure that the active interrupt is in an LR.

Joy.
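A userspace model of the cross-vCPU handshake the test performs, with
pthreads standing in for the two vCPUs (the gic_* calls are elided into
comments; the names are the sketch's own):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define MIN_SPI 32

/* vcpu0 activates and priority-drops the SPI, then waits for vcpu1 to
 * deactivate it from the other side. */
static atomic_int shared_data;

static void *vcpu1(void *arg)
{
	while (atomic_load(&shared_data) != MIN_SPI)
		;				/* wait for vcpu0's EOI */
	/* gic_set_dir(MIN_SPI) would go here in the real test */
	atomic_store(&shared_data, 0);		/* deactivation done */
	return NULL;
}

int main(void)
{
	pthread_t thr;

	pthread_create(&thr, NULL, vcpu1, NULL);
	/* ... activate MIN_SPI, gic_set_eoi(MIN_SPI) ... */
	atomic_store(&shared_data, MIN_SPI);
	while (atomic_load(&shared_data) == MIN_SPI)
		;				/* wait for the remote DIR */
	pthread_join(thr, NULL);
	printf("asymmetric deactivation handshake done\n");
	return 0;
}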
Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-48-maz@kernel.org
Signed-off-by: Oliver Upton
---
 tools/testing/selftests/kvm/arm64/vgic_irq.c | 105 +++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c
index 72f7bb0d201e..a53ab809fe8a 100644
--- a/tools/testing/selftests/kvm/arm64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c
@@ -29,6 +29,7 @@ struct test_args {
 	bool level_sensitive; /* 1 is level, 0 is edge */
 	int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */
 	bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */
+	uint32_t shared_data;
 };

 /*
@@ -801,6 +802,109 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split)
 	kvm_vm_free(vm);
 }

+static void guest_code_asym_dir(struct test_args *args, int cpuid)
+{
+	gic_init(GIC_V3, 2);
+
+	gic_set_eoi_split(1);
+	gic_set_priority_mask(CPU_PRIO_MASK);
+
+	if (cpuid == 0) {
+		uint32_t intid;
+
+		local_irq_disable();
+
+		gic_set_priority(MIN_SPI, IRQ_DEFAULT_PRIO);
+		gic_irq_enable(MIN_SPI);
+		gic_irq_set_pending(MIN_SPI);
+
+		intid = wait_for_and_activate_irq();
+		GUEST_ASSERT_EQ(intid, MIN_SPI);
+
+		gic_set_eoi(intid);
+		isb();
+
+		WRITE_ONCE(args->shared_data, MIN_SPI);
+		dsb(ishst);
+
+		do {
+			dsb(ishld);
+		} while (READ_ONCE(args->shared_data) == MIN_SPI);
+		GUEST_ASSERT(!gic_irq_get_active(MIN_SPI));
+	} else {
+		do {
+			dsb(ishld);
+		} while (READ_ONCE(args->shared_data) != MIN_SPI);
+
+		gic_set_dir(MIN_SPI);
+		isb();
+
+		WRITE_ONCE(args->shared_data, 0);
+		dsb(ishst);
+	}
+
+	GUEST_DONE();
+}
+
+static void *test_vcpu_run(void *arg)
+{
+	struct kvm_vcpu *vcpu = arg;
+	struct ucall uc;
+
+	while (1) {
+		vcpu_run(vcpu);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		case UCALL_DONE:
+			return NULL;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+
+	return NULL;
+}
+
+static void test_vgic_two_cpus(void *gcode)
+{
+	pthread_t thr[2];
+	struct kvm_vcpu *vcpus[2];
+	struct test_args args = {};
+	struct kvm_vm *vm;
+	vm_vaddr_t args_gva;
+	int gic_fd, ret;
+
+	vm = vm_create_with_vcpus(2, gcode, vcpus);
+
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(vcpus[0]);
+	vcpu_init_descriptor_tables(vcpus[1]);
+
+	/* Setup the guest args page (so it gets the args).
*/
+	args_gva = vm_vaddr_alloc_page(vm);
+	memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args));
+	vcpu_args_set(vcpus[0], 2, args_gva, 0);
+	vcpu_args_set(vcpus[1], 2, args_gva, 1);
+
+	gic_fd = vgic_v3_setup(vm, 2, 64);
+
+	ret = pthread_create(&thr[0], NULL, test_vcpu_run, vcpus[0]);
+	if (ret)
+		TEST_FAIL("Can't create thread for vcpu 0 (%d)\n", ret);
+	ret = pthread_create(&thr[1], NULL, test_vcpu_run, vcpus[1]);
+	if (ret)
+		TEST_FAIL("Can't create thread for vcpu 1 (%d)\n", ret);
+
+	pthread_join(thr[0], NULL);
+	pthread_join(thr[1], NULL);
+
+	close(gic_fd);
+	kvm_vm_free(vm);
+}
+
 static void help(const char *name)
 {
 	printf(
@@ -857,6 +961,7 @@ int main(int argc, char **argv)
 		test_vgic(nr_irqs, false /* level */, true /* eoi_split */);
 		test_vgic(nr_irqs, true /* level */, false /* eoi_split */);
 		test_vgic(nr_irqs, true /* level */, true /* eoi_split */);
+		test_vgic_two_cpus(guest_code_asym_dir);
 	} else {
 		test_vgic(nr_irqs, level_sensitive, eoi_split);
 	}

From 1c9c71ac1b9f86b3d1841c703e3e928b2ec224c7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 20 Nov 2025 17:25:38 +0000
Subject: [PATCH 226/260] KVM: arm64: selftests: vgic_irq: Add Group-0 enable
 test

Add a new test case that injects a Group-0 interrupt together with a
bunch of Group-1 interrupts, acks/EOIs the G1 interrupts, and only then
enables G0, expecting to get the G0 interrupt.

Tested-by: Fuad Tabba
Signed-off-by: Marc Zyngier
Tested-by: Mark Brown
Link: https://msgid.link/20251120172540.2267180-49-maz@kernel.org
Signed-off-by: Oliver Upton
---
 tools/testing/selftests/kvm/arm64/vgic_irq.c | 49 ++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c
index a53ab809fe8a..ff2c75749f5c 100644
--- a/tools/testing/selftests/kvm/arm64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c
@@ -846,6 +846,54 @@ static void guest_code_asym_dir(struct test_args *args, int cpuid)
 	GUEST_DONE();
 }

+static void guest_code_group_en(struct test_args *args, int cpuid)
+{
+	uint32_t intid;
+
+	gic_init(GIC_V3, 2);
+
+	gic_set_eoi_split(0);
+	gic_set_priority_mask(CPU_PRIO_MASK);
+	/* SGI0 is G0, which is disabled */
+	gic_irq_set_group(0, 0);
+
+	/* Configure all SGIs with decreasing priority */
+	for (intid = 0; intid < MIN_PPI; intid++) {
+		gic_set_priority(intid, (intid + 1) * 8);
+		gic_irq_enable(intid);
+		gic_irq_set_pending(intid);
+	}
+
+	/* Ack and EOI all G1 interrupts */
+	for (int i = 1; i < MIN_PPI; i++) {
+		intid = wait_for_and_activate_irq();
+
+		GUEST_ASSERT(intid < MIN_PPI);
+		gic_set_eoi(intid);
+		isb();
+	}
+
+	/*
+	 * Check that SGI0 is still pending, inactive, and that we cannot
+	 * ack anything.
+ */ + GUEST_ASSERT(gic_irq_get_pending(0)); + GUEST_ASSERT(!gic_irq_get_active(0)); + GUEST_ASSERT_IAR_EMPTY(); + GUEST_ASSERT(read_sysreg_s(SYS_ICC_IAR0_EL1) == IAR_SPURIOUS); + + /* Open the G0 gates, and verify we can ack SGI0 */ + write_sysreg_s(1, SYS_ICC_IGRPEN0_EL1); + isb(); + + do { + intid = read_sysreg_s(SYS_ICC_IAR0_EL1); + } while (intid == IAR_SPURIOUS); + + GUEST_ASSERT(intid == 0); + GUEST_DONE(); +} + static void *test_vcpu_run(void *arg) { struct kvm_vcpu *vcpu = arg; @@ -962,6 +1010,7 @@ int main(int argc, char **argv) test_vgic(nr_irqs, true /* level */, false /* eoi_split */); test_vgic(nr_irqs, true /* level */, true /* eoi_split */); test_vgic_two_cpus(guest_code_asym_dir); + test_vgic_two_cpus(guest_code_group_en); } else { test_vgic(nr_irqs, level_sensitive, eoi_split); } From de8842327728d07b5d836688a66ae5fa56902527 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:39 +0000 Subject: [PATCH 227/260] KVM: arm64: selftests: vgic_irq: Add timer deactivation test Add a new test case that triggers the HW deactivation emulation path when trapping ICV_DIR_EL1. This is obviously tied to the way KVM works now, but the test follows the expected architectural behaviour. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-50-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 65 ++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index ff2c75749f5c..9858187c7b6e 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -894,6 +894,70 @@ static void guest_code_group_en(struct test_args *args, int cpuid) GUEST_DONE(); } +static void guest_code_timer_spi(struct test_args *args, int cpuid) +{ + uint32_t intid; + u64 val; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + /* Add a pending SPI so that KVM starts trapping DIR */ + gic_set_priority(MIN_SPI + cpuid, IRQ_DEFAULT_PRIO); + gic_irq_set_pending(MIN_SPI + cpuid); + + /* Configure the timer with a higher priority, make it pending */ + gic_set_priority(27, IRQ_DEFAULT_PRIO - 8); + + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* Enable both interrupts */ + gic_irq_enable(MIN_SPI + cpuid); + gic_irq_enable(27); + + /* The timer must fire */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + /* Check that we can deassert it */ + write_sysreg(0, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(!gic_irq_get_pending(27)); + + /* + * Priority drop, deactivation -- we expect that the host + * deactivation will have been effective + */ + gic_set_eoi(27); + gic_set_dir(27); + + GUEST_ASSERT(!gic_irq_get_active(27)); + + /* Do it one more time */ + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* The timer must fire again */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + GUEST_DONE(); +} + static void *test_vcpu_run(void *arg) { struct kvm_vcpu *vcpu = arg; @@ -1011,6 +1075,7 @@ int main(int argc, char **argv) test_vgic(nr_irqs, true /* level */, true /* eoi_split */); test_vgic_two_cpus(guest_code_asym_dir); 
test_vgic_two_cpus(guest_code_group_en);
+		test_vgic_two_cpus(guest_code_timer_spi);
 	} else {
 		test_vgic(nr_irqs, level_sensitive, eoi_split);
 	}

From 32bd348be3fa07b26c5ea6b818a161c142dcc2f2 Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Mon, 10 Nov 2025 11:32:27 +0800
Subject: [PATCH 228/260] KVM: Fix last_boosted_vcpu index assignment bug

In kvm_vcpu_on_spin(), the loop counter 'i' is incorrectly written to
last_boosted_vcpu instead of the actual vCPU index 'idx'.

This causes last_boosted_vcpu to store the loop iteration count rather
than the vCPU index, leading to incorrect round-robin behavior in
subsequent directed yield operations.

Fix this by using 'idx' instead of 'i' in the assignment.

Signed-off-by: Wanpeng Li
Reviewed-by: Sean Christopherson
Message-ID: <20251110033232.12538-7-kernellwp@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b7a0ae2a7b20..cde1eddbaa91 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4026,7 +4026,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)

 		yielded = kvm_vcpu_yield_to(vcpu);
 		if (yielded > 0) {
-			WRITE_ONCE(kvm->last_boosted_vcpu, i);
+			WRITE_ONCE(kvm->last_boosted_vcpu, idx);
 			break;
 		} else if (yielded < 0 && !--try) {
 			break;

From 64d67e7add109bfc54eac454558a4355af879ba7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Tue, 25 Nov 2025 16:01:44 +0000
Subject: [PATCH 229/260] KVM: arm64: Convert ICH_HCR_EL2_TDIR cap to
 EARLY_LOCAL_CPU_FEATURE

Suzuki noticed that making the ICH_HCR_EL2_TDIR capability a system one
isn't a very good idea, should we end up with CPUs that have asymmetric
TDIR support (somewhat unlikely, but you never know what level of
stupidity vendors are up to).

For this hypothetical setup, making this an "EARLY_LOCAL_CPU_FEATURE"
is a much better option. This is actually consistent with what we
already do with the GICv5 legacy interface, so flip the capability
over.
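A toy model of the SYSTEM vs EARLY_LOCAL distinction this relies on
(entirely invented data, for illustration only):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* A SYSTEM feature holds only if every CPU has it; an EARLY_LOCAL
 * feature is evaluated per CPU, which is what asymmetric TDIR support
 * would need. */
static bool cpu_has_tdir[NR_CPUS] = { true, true, false, true };

static bool system_has_cap(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		if (!cpu_has_tdir[i])
			return false;
	return true;
}

static bool this_cpu_has_cap(int cpu)
{
	return cpu_has_tdir[cpu];
}

int main(void)
{
	printf("system=%d cpu1=%d cpu2=%d\n",
	       system_has_cap(), this_cpu_has_cap(1), this_cpu_has_cap(2));
	return 0;
}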
Reported-by: Suzuki K Poulose
Signed-off-by: Marc Zyngier
Fixes: 2a28810cbb8b2 ("KVM: arm64: GICv3: Detect and work around the lack of ICV_DIR_EL1 trapping")
Link: https://lore.kernel.org/r/5df713d4-8b79-4456-8fd1-707ca89a61b6@arm.com
Reviewed-by: Suzuki K Poulose
Link: https://msgid.link/20251125160144.1086511-1-maz@kernel.org
Signed-off-by: Oliver Upton
---
 arch/arm64/kernel/cpufeature.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 5de51cb1b8fe..75fb9a0efcc8 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2325,14 +2325,14 @@ static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry,
 	BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF);
 	BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY);

-	if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF) &&
+	if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) &&
 	    !is_midr_in_range_list(has_vgic_v3))
 		return false;

 	if (!is_hyp_mode_available())
 		return false;

-	if (cpus_have_cap(ARM64_HAS_GICV5_LEGACY))
+	if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY))
 		return true;

 	if (is_kernel_in_hyp_mode())
@@ -2863,7 +2863,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		 */
 		.desc = "ICV_DIR_EL1 trapping",
 		.capability = ARM64_HAS_ICH_HCR_EL2_TDIR,
-		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE,
 		.matches = can_trap_icv_dir_el1,
 	},
 #ifdef CONFIG_ARM64_E0PD

From 74087611f0ba7b336dcdde855ffca7d15b2ebe0f Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Thu, 27 Nov 2025 11:00:18 +0800
Subject: [PATCH 230/260] LoongArch: KVM: Get VM PMU capability from HW GCFG
 register

Currently the VM PMU capability is taken directly from the host PMU
capability; however, bit 23 of the HW GCFG CSR register also indicates
PMU capability for the VM. It is better to take it from the HW GCFG
CSR register rather than just the host PMU capability, especially when
the LVZ feature is emulated in TCG mode, in which case there is no PMU
capability.
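As a rough sketch of the bookkeeping this introduces (plain C with a
stubbed read_csr_gcfg() and an illustrative feature index, not the
kernel sources):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT(n)          (1UL << (n))
    #define CSR_GCFG_GPMP   BIT(23) /* HW GCFG: PMU available for guests */
    #define VM_FEAT_PMU     2       /* illustrative feature index */

    /* Stub: pretend the hardware GCFG CSR advertises guest PMU support. */
    static uint64_t read_csr_gcfg(void) { return CSR_GCFG_GPMP; }

    struct vm_arch { uint64_t kvm_features; };

    static void vm_init_features(struct vm_arch *arch)
    {
        /* Derive PMU support from the GCFG bit, not the host PMU alone. */
        if (read_csr_gcfg() & CSR_GCFG_GPMP)
            arch->kvm_features |= BIT(VM_FEAT_PMU);
    }

    static bool vm_supports(struct vm_arch *arch, int feature)
    {
        return !!(arch->kvm_features & BIT(feature));
    }

    int main(void)
    {
        struct vm_arch arch = { 0 };

        vm_init_features(&arch);
        printf("PMU: %d\n", vm_supports(&arch, VM_FEAT_PMU));
        return 0;
    }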
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_host.h | 8 ++++++ arch/loongarch/include/asm/loongarch.h | 2 ++ arch/loongarch/kvm/vm.c | 40 ++++++++++++++++---------- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 0cecbd038bb3..e4fe5b8e8149 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -126,6 +126,8 @@ struct kvm_arch { struct kvm_phyid_map *phyid_map; /* Enabled PV features */ unsigned long pv_features; + /* Supported KVM features */ + unsigned long kvm_features; s64 time_offset; struct kvm_context __percpu *vmcs; @@ -293,6 +295,12 @@ static inline int kvm_get_pmu_num(struct kvm_vcpu_arch *arch) return (arch->cpucfg[6] & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT; } +/* Check whether KVM support this feature (VMM may disable it) */ +static inline bool kvm_vm_support(struct kvm_arch *arch, int feature) +{ + return !!(arch->kvm_features & BIT_ULL(feature)); +} + bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu); /* Debug: dump vcpu state */ diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 3de03cb864b2..58a4a3b6b035 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -511,6 +511,8 @@ #define CSR_GCFG_GPERF_SHIFT 24 #define CSR_GCFG_GPERF_WIDTH 3 #define CSR_GCFG_GPERF (_ULCAST_(0x7) << CSR_GCFG_GPERF_SHIFT) +#define CSR_GCFG_GPMP_SHIFT 23 +#define CSR_GCFG_GPMP (_ULCAST_(0x1) << CSR_GCFG_GPMP_SHIFT) #define CSR_GCFG_GCI_SHIFT 20 #define CSR_GCFG_GCI_WIDTH 2 #define CSR_GCFG_GCI (_ULCAST_(0x3) << CSR_GCFG_GCI_SHIFT) diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index a49b1c1a3dd1..e4f480342020 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -24,6 +25,23 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; +static void kvm_vm_init_features(struct kvm *kvm) +{ + unsigned long val; + + val = read_csr_gcfg(); + if (val & CSR_GCFG_GPMP) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PMU); + + /* Enable all PV features by default */ + kvm->arch.pv_features = BIT(KVM_FEATURE_IPI); + kvm->arch.kvm_features = BIT(KVM_LOONGARCH_VM_FEAT_PV_IPI); + if (kvm_pvtime_supported()) { + kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME); + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PV_STEALTIME); + } +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int i; @@ -42,11 +60,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) spin_lock_init(&kvm->arch.phyid_map_lock); kvm_init_vmcs(kvm); - - /* Enable all PV features by default */ - kvm->arch.pv_features = BIT(KVM_FEATURE_IPI); - if (kvm_pvtime_supported()) - kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME); + kvm_vm_init_features(kvm); /* * cpu_vabits means user address space only (a half of total). 
@@ -136,20 +150,16 @@ static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr
 		if (cpu_has_lbt_mips)
 			return 0;
 		return -ENXIO;
-	case KVM_LOONGARCH_VM_FEAT_PMU:
-		if (cpu_has_pmp)
-			return 0;
-		return -ENXIO;
-	case KVM_LOONGARCH_VM_FEAT_PV_IPI:
-		return 0;
-	case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME:
-		if (kvm_pvtime_supported())
-			return 0;
-		return -ENXIO;
 	case KVM_LOONGARCH_VM_FEAT_PTW:
 		if (cpu_has_ptw)
 			return 0;
 		return -ENXIO;
+	case KVM_LOONGARCH_VM_FEAT_PMU:
+	case KVM_LOONGARCH_VM_FEAT_PV_IPI:
+	case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME:
+		if (kvm_vm_support(&kvm->arch, attr->attr))
+			return 0;
+		return -ENXIO;
 	default:
 		return -ENXIO;
 	}

From 7bcd8d0b2237fd0f698bcd0c90badf7c46270d9b Mon Sep 17 00:00:00 2001
From: Song Gao
Date: Thu, 27 Nov 2025 11:00:18 +0800
Subject: [PATCH 231/260] LoongArch: KVM: Add AVEC basic support

Check whether the host CPU supports AVEC, and save/restore CSR_MSGIS0-
CSR_MSGIS3 when necessary.

Reviewed-by: Bibo Mao
Signed-off-by: Song Gao
Signed-off-by: Huacai Chen
---
 arch/loongarch/include/asm/kvm_vcpu.h |  1 +
 arch/loongarch/include/uapi/asm/kvm.h |  1 +
 arch/loongarch/kvm/interrupt.c        | 15 +++++++++++++--
 arch/loongarch/kvm/vcpu.c             | 19 +++++++++++++++++--
 arch/loongarch/kvm/vm.c               |  4 ++++
 5 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h
index f1efd7cfbc20..3784ab4ccdb5 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -15,6 +15,7 @@
 #define CPU_PMU		(_ULCAST_(1) << 10)
 #define CPU_TIMER	(_ULCAST_(1) << 11)
 #define CPU_IPI		(_ULCAST_(1) << 12)
+#define CPU_AVEC	(_ULCAST_(1) << 14)

 /* Controlled by 0x52 guest exception VIP aligned to estat bit 5~12 */
 #define CPU_IP0		(_ULCAST_(1))
diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h
index 57ba1a563bb1..de6c3f18e40a 100644
--- a/arch/loongarch/include/uapi/asm/kvm.h
+++ b/arch/loongarch/include/uapi/asm/kvm.h
@@ -104,6 +104,7 @@ struct kvm_fpu {
 #define KVM_LOONGARCH_VM_FEAT_PV_IPI		6
 #define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME	7
 #define KVM_LOONGARCH_VM_FEAT_PTW		8
+#define KVM_LOONGARCH_VM_FEAT_MSGINT		9

 /* Device Control API on vcpu fd */
 #define KVM_LOONGARCH_VCPU_CPUCFG	0
diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c
index 8462083f0301..a6d42d399a59 100644
--- a/arch/loongarch/kvm/interrupt.c
+++ b/arch/loongarch/kvm/interrupt.c
@@ -21,6 +21,7 @@ static unsigned int priority_to_irq[EXCCODE_INT_NUM] = {
 	[INT_HWI5]	= CPU_IP5,
 	[INT_HWI6]	= CPU_IP6,
 	[INT_HWI7]	= CPU_IP7,
+	[INT_AVEC]	= CPU_AVEC,
 };

 static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
@@ -31,6 +32,11 @@ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 	if (priority < EXCCODE_INT_NUM)
 		irq = priority_to_irq[priority];

+	if (cpu_has_msgint && (priority == INT_AVEC)) {
+		set_gcsr_estat(irq);
+		return 1;
+	}
+
 	switch (priority) {
 	case INT_TI:
 	case INT_IPI:
@@ -58,6 +64,11 @@ static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority)
 	if (priority < EXCCODE_INT_NUM)
 		irq = priority_to_irq[priority];

+	if (cpu_has_msgint && (priority == INT_AVEC)) {
+		clear_gcsr_estat(irq);
+		return 1;
+	}
+
 	switch (priority) {
 	case INT_TI:
 	case INT_IPI:
@@ -83,10 +94,10 @@ void kvm_deliver_intr(struct kvm_vcpu *vcpu)
 	unsigned long *pending = &vcpu->arch.irq_pending;
 	unsigned long *pending_clr = &vcpu->arch.irq_clear;

-	for_each_set_bit(priority, pending_clr, INT_IPI + 1)
+	
for_each_set_bit(priority, pending_clr, EXCCODE_INT_NUM)
 		kvm_irq_clear(vcpu, priority);

-	for_each_set_bit(priority, pending, INT_IPI + 1)
+	for_each_set_bit(priority, pending, EXCCODE_INT_NUM)
 		kvm_irq_deliver(vcpu, priority);
 }

diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 1245a6b35896..cd5f8d3c3c37 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -659,8 +659,7 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v)
 		*v = GENMASK(31, 0);
 		return 0;
 	case LOONGARCH_CPUCFG1:
-		/* CPUCFG1_MSGINT is not supported by KVM */
-		*v = GENMASK(25, 0);
+		*v = GENMASK(26, 0);
 		return 0;
 	case LOONGARCH_CPUCFG2:
 		/* CPUCFG2 features unconditionally supported by KVM */
@@ -728,6 +727,10 @@ static int kvm_check_cpucfg(int id, u64 val)
 		return -EINVAL;

 	switch (id) {
+	case LOONGARCH_CPUCFG1:
+		if ((val & CPUCFG1_MSGINT) && !cpu_has_msgint)
+			return -EINVAL;
+		return 0;
 	case LOONGARCH_CPUCFG2:
 		if (!(val & CPUCFG2_LLFTP))
 			/* Guests must have a constant timer */
@@ -1657,6 +1660,12 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2);
 	kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3);
 	kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL);
+	if (cpu_has_msgint) {
+		kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR0);
+		kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR1);
+		kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR2);
+		kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR3);
+	}

 	/* Restore Root.GINTC from unused Guest.GINTC register */
 	write_csr_gintc(csr->csrs[LOONGARCH_CSR_GINTC]);
@@ -1746,6 +1755,12 @@ static int _kvm_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
 	kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN1);
 	kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2);
 	kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3);
+	if (cpu_has_msgint) {
+		kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR0);
+		kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR1);
+		kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR2);
+		kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR3);
+	}

 	vcpu->arch.aux_inuse |= KVM_LARCH_SWCSR_LATEST;

diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c
index e4f480342020..194ccbcdc3b3 100644
--- a/arch/loongarch/kvm/vm.c
+++ b/arch/loongarch/kvm/vm.c
@@ -154,6 +154,10 @@ static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr
 		if (cpu_has_ptw)
 			return 0;
 		return -ENXIO;
+	case KVM_LOONGARCH_VM_FEAT_MSGINT:
+		if (cpu_has_msgint)
+			return 0;
+		return -ENXIO;
 	case KVM_LOONGARCH_VM_FEAT_PMU:
 	case KVM_LOONGARCH_VM_FEAT_PV_IPI:
 	case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME:

From d3e43a1f34acbb9a814337fc5624765538e5a274 Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Thu, 27 Nov 2025 11:00:18 +0800
Subject: [PATCH 232/260] LoongArch: KVM: Use 64-bit register definition for
 EIOINTC

With the in-kernel emulated eiointc driver, hardware registers can be
accessed with different access sizes, so every EIOINTC register is
declared as a reg_u8/reg_u16/reg_u32/reg_u64 union. Use a plain 64-bit
type for the register definitions and remove the union types, since
most registers are accessed with the 64-bit method anyway. This makes
the emulated EIOINTC driver simpler.
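To see why dropping the unions is safe for the remaining sub-word
accesses, consider this stand-alone sketch: on a little-endian machine
(as LoongArch is), casting a u64 array to a byte pointer yields exactly
the layout the old reg_u8 union member provided:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint64_t u64;
    typedef uint8_t u8;

    #define EIOINTC_IRQS 256

    int main(void)
    {
        union {
            u64 reg_u64[EIOINTC_IRQS / 8];
            u8  reg_u8[EIOINTC_IRQS];
        } old;
        u64 new[EIOINTC_IRQS / 8];

        memset(&old, 0, sizeof(old));
        memset(new, 0, sizeof(new));

        /* Route irq 42 to cpuid 3 in both representations. */
        old.reg_u8[42] = 3;
        ((u8 *)new)[42] = 3;

        /* On little-endian hosts both views agree byte for byte. */
        printf("match: %d\n", !memcmp(old.reg_u64, new, sizeof(new)));
        return 0;
    }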
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_eiointc.h | 55 +++------------- arch/loongarch/kvm/intc/eiointc.c | 80 ++++++++++++------------ 2 files changed, 48 insertions(+), 87 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_eiointc.h b/arch/loongarch/include/asm/kvm_eiointc.h index a3a40aba8acf..8b7a2fa3f7f8 100644 --- a/arch/loongarch/include/asm/kvm_eiointc.h +++ b/arch/loongarch/include/asm/kvm_eiointc.h @@ -10,10 +10,7 @@ #define EIOINTC_IRQS 256 #define EIOINTC_ROUTE_MAX_VCPUS 256 -#define EIOINTC_IRQS_U8_NUMS (EIOINTC_IRQS / 8) -#define EIOINTC_IRQS_U16_NUMS (EIOINTC_IRQS_U8_NUMS / 2) -#define EIOINTC_IRQS_U32_NUMS (EIOINTC_IRQS_U8_NUMS / 4) -#define EIOINTC_IRQS_U64_NUMS (EIOINTC_IRQS_U8_NUMS / 8) +#define EIOINTC_IRQS_U64_NUMS (EIOINTC_IRQS / 64) /* map to ipnum per 32 irqs */ #define EIOINTC_IRQS_NODETYPE_COUNT 16 @@ -64,54 +61,18 @@ struct loongarch_eiointc { uint32_t status; /* hardware state */ - union nodetype { - u64 reg_u64[EIOINTC_IRQS_NODETYPE_COUNT / 4]; - u32 reg_u32[EIOINTC_IRQS_NODETYPE_COUNT / 2]; - u16 reg_u16[EIOINTC_IRQS_NODETYPE_COUNT]; - u8 reg_u8[EIOINTC_IRQS_NODETYPE_COUNT * 2]; - } nodetype; + u64 nodetype[EIOINTC_IRQS_NODETYPE_COUNT / 4]; /* one bit shows the state of one irq */ - union bounce { - u64 reg_u64[EIOINTC_IRQS_U64_NUMS]; - u32 reg_u32[EIOINTC_IRQS_U32_NUMS]; - u16 reg_u16[EIOINTC_IRQS_U16_NUMS]; - u8 reg_u8[EIOINTC_IRQS_U8_NUMS]; - } bounce; - - union isr { - u64 reg_u64[EIOINTC_IRQS_U64_NUMS]; - u32 reg_u32[EIOINTC_IRQS_U32_NUMS]; - u16 reg_u16[EIOINTC_IRQS_U16_NUMS]; - u8 reg_u8[EIOINTC_IRQS_U8_NUMS]; - } isr; - union coreisr { - u64 reg_u64[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U64_NUMS]; - u32 reg_u32[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U32_NUMS]; - u16 reg_u16[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U16_NUMS]; - u8 reg_u8[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U8_NUMS]; - } coreisr; - union enable { - u64 reg_u64[EIOINTC_IRQS_U64_NUMS]; - u32 reg_u32[EIOINTC_IRQS_U32_NUMS]; - u16 reg_u16[EIOINTC_IRQS_U16_NUMS]; - u8 reg_u8[EIOINTC_IRQS_U8_NUMS]; - } enable; + u64 bounce[EIOINTC_IRQS_U64_NUMS]; + u64 isr[EIOINTC_IRQS_U64_NUMS]; + u64 coreisr[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U64_NUMS]; + u64 enable[EIOINTC_IRQS_U64_NUMS]; /* use one byte to config ipmap for 32 irqs at once */ - union ipmap { - u64 reg_u64; - u32 reg_u32[EIOINTC_IRQS_U32_NUMS / 4]; - u16 reg_u16[EIOINTC_IRQS_U16_NUMS / 4]; - u8 reg_u8[EIOINTC_IRQS_U8_NUMS / 4]; - } ipmap; + u64 ipmap; /* use one byte to config coremap for one irq */ - union coremap { - u64 reg_u64[EIOINTC_IRQS / 8]; - u32 reg_u32[EIOINTC_IRQS / 4]; - u16 reg_u16[EIOINTC_IRQS / 2]; - u8 reg_u8[EIOINTC_IRQS]; - } coremap; + u64 coremap[EIOINTC_IRQS / 8]; DECLARE_BITMAP(sw_coreisr[EIOINTC_ROUTE_MAX_VCPUS][LOONGSON_IP_NUM], EIOINTC_IRQS); uint8_t sw_coremap[EIOINTC_IRQS]; diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index a1cc116b4dac..29886876143f 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -13,19 +13,19 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s) struct kvm_vcpu *vcpu; for (irq = 0; irq < EIOINTC_IRQS; irq++) { - ipnum = s->ipmap.reg_u8[irq / 32]; + ipnum = (s->ipmap >> (irq / 32 * 8)) & 0xff; if (!(s->status & BIT(EIOINTC_ENABLE_INT_ENCODE))) { ipnum = count_trailing_zeros(ipnum); ipnum = (ipnum >= 0 && ipnum < 4) ? 
ipnum : 0; } - cpuid = s->coremap.reg_u8[irq]; + cpuid = ((u8 *)s->coremap)[irq]; vcpu = kvm_get_vcpu_by_cpuid(s->kvm, cpuid); if (!vcpu) continue; cpu = vcpu->vcpu_id; - if (test_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu])) + if (test_bit(irq, (unsigned long *)s->coreisr[cpu])) __set_bit(irq, s->sw_coreisr[cpu][ipnum]); else __clear_bit(irq, s->sw_coreisr[cpu][ipnum]); @@ -38,7 +38,7 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level) struct kvm_vcpu *vcpu; struct kvm_interrupt vcpu_irq; - ipnum = s->ipmap.reg_u8[irq / 32]; + ipnum = (s->ipmap >> (irq / 32 * 8)) & 0xff; if (!(s->status & BIT(EIOINTC_ENABLE_INT_ENCODE))) { ipnum = count_trailing_zeros(ipnum); ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0; @@ -53,13 +53,13 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level) if (level) { /* if not enable return false */ - if (!test_bit(irq, (unsigned long *)s->enable.reg_u32)) + if (!test_bit(irq, (unsigned long *)s->enable)) return; - __set_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]); + __set_bit(irq, (unsigned long *)s->coreisr[cpu]); found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS); __set_bit(irq, s->sw_coreisr[cpu][ipnum]); } else { - __clear_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]); + __clear_bit(irq, (unsigned long *)s->coreisr[cpu]); __clear_bit(irq, s->sw_coreisr[cpu][ipnum]); found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS); } @@ -94,7 +94,7 @@ static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s, if (s->sw_coremap[irq + i] == cpu) continue; - if (notify && test_bit(irq + i, (unsigned long *)s->isr.reg_u8)) { + if (notify && test_bit(irq + i, (unsigned long *)s->isr)) { /* lower irq at old cpu and raise irq at new cpu */ eiointc_update_irq(s, irq + i, 0); s->sw_coremap[irq + i] = cpu; @@ -108,7 +108,7 @@ static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s, void eiointc_set_irq(struct loongarch_eiointc *s, int irq, int level) { unsigned long flags; - unsigned long *isr = (unsigned long *)s->isr.reg_u8; + unsigned long *isr = (unsigned long *)s->isr; spin_lock_irqsave(&s->lock, flags); level ? __set_bit(irq, isr) : __clear_bit(irq, isr); @@ -127,27 +127,27 @@ static int loongarch_eiointc_read(struct kvm_vcpu *vcpu, struct loongarch_eioint switch (offset) { case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: index = (offset - EIOINTC_NODETYPE_START) >> 3; - data = s->nodetype.reg_u64[index]; + data = s->nodetype[index]; break; case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: index = (offset - EIOINTC_IPMAP_START) >> 3; - data = s->ipmap.reg_u64; + data = s->ipmap; break; case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: index = (offset - EIOINTC_ENABLE_START) >> 3; - data = s->enable.reg_u64[index]; + data = s->enable[index]; break; case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: index = (offset - EIOINTC_BOUNCE_START) >> 3; - data = s->bounce.reg_u64[index]; + data = s->bounce[index]; break; case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: index = (offset - EIOINTC_COREISR_START) >> 3; - data = s->coreisr.reg_u64[vcpu->vcpu_id][index]; + data = s->coreisr[vcpu->vcpu_id][index]; break; case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: index = (offset - EIOINTC_COREMAP_START) >> 3; - data = s->coremap.reg_u64[index]; + data = s->coremap[index]; break; default: ret = -EINVAL; @@ -223,26 +223,26 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, switch (offset) { case EIOINTC_NODETYPE_START ... 
EIOINTC_NODETYPE_END: index = (offset - EIOINTC_NODETYPE_START) >> 3; - old = s->nodetype.reg_u64[index]; - s->nodetype.reg_u64[index] = (old & ~mask) | data; + old = s->nodetype[index]; + s->nodetype[index] = (old & ~mask) | data; break; case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: /* * ipmap cannot be set at runtime, can be set only at the beginning * of irqchip driver, need not update upper irq level */ - old = s->ipmap.reg_u64; - s->ipmap.reg_u64 = (old & ~mask) | data; + old = s->ipmap; + s->ipmap = (old & ~mask) | data; break; case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: index = (offset - EIOINTC_ENABLE_START) >> 3; - old = s->enable.reg_u64[index]; - s->enable.reg_u64[index] = (old & ~mask) | data; + old = s->enable[index]; + s->enable[index] = (old & ~mask) | data; /* * 1: enable irq. * update irq when isr is set. */ - data = s->enable.reg_u64[index] & ~old & s->isr.reg_u64[index]; + data = s->enable[index] & ~old & s->isr[index]; while (data) { irq = __ffs(data); eiointc_update_irq(s, irq + index * 64, 1); @@ -252,7 +252,7 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, * 0: disable irq. * update irq when isr is set. */ - data = ~s->enable.reg_u64[index] & old & s->isr.reg_u64[index]; + data = ~s->enable[index] & old & s->isr[index]; while (data) { irq = __ffs(data); eiointc_update_irq(s, irq + index * 64, 0); @@ -262,16 +262,16 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: /* do not emulate hw bounced irq routing */ index = (offset - EIOINTC_BOUNCE_START) >> 3; - old = s->bounce.reg_u64[index]; - s->bounce.reg_u64[index] = (old & ~mask) | data; + old = s->bounce[index]; + s->bounce[index] = (old & ~mask) | data; break; case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: index = (offset - EIOINTC_COREISR_START) >> 3; /* use attrs to get current cpu index */ cpu = vcpu->vcpu_id; - old = s->coreisr.reg_u64[cpu][index]; + old = s->coreisr[cpu][index]; /* write 1 to clear interrupt */ - s->coreisr.reg_u64[cpu][index] = old & ~data; + s->coreisr[cpu][index] = old & ~data; data &= old; while (data) { irq = __ffs(data); @@ -281,9 +281,9 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, break; case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: index = (offset - EIOINTC_COREMAP_START) >> 3; - old = s->coremap.reg_u64[index]; - s->coremap.reg_u64[index] = (old & ~mask) | data; - data = s->coremap.reg_u64[index]; + old = s->coremap[index]; + s->coremap[index] = (old & ~mask) | data; + data = s->coremap[index]; eiointc_update_sw_coremap(s, index * 8, data, sizeof(data), true); break; default: @@ -451,10 +451,10 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, break; case KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED: eiointc_set_sw_coreisr(s); - for (i = 0; i < (EIOINTC_IRQS / 4); i++) { - start_irq = i * 4; + for (i = 0; i < (EIOINTC_IRQS / 8); i++) { + start_irq = i * 8; eiointc_update_sw_coremap(s, start_irq, - s->coremap.reg_u32[i], sizeof(u32), false); + s->coremap[i], sizeof(u64), false); } break; default: @@ -481,34 +481,34 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, switch (addr) { case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: offset = (addr - EIOINTC_NODETYPE_START) / 4; - p = &s->nodetype.reg_u32[offset]; + p = s->nodetype + offset * 4; break; case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: offset = (addr - EIOINTC_IPMAP_START) / 4; - p = &s->ipmap.reg_u32[offset]; + p = &s->ipmap + offset * 4; break; case EIOINTC_ENABLE_START ... 
EIOINTC_ENABLE_END:
 		offset = (addr - EIOINTC_ENABLE_START) / 4;
-		p = &s->enable.reg_u32[offset];
+		p = s->enable + offset * 4;
 		break;
 	case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END:
 		offset = (addr - EIOINTC_BOUNCE_START) / 4;
-		p = &s->bounce.reg_u32[offset];
+		p = s->bounce + offset * 4;
 		break;
 	case EIOINTC_ISR_START ... EIOINTC_ISR_END:
 		offset = (addr - EIOINTC_ISR_START) / 4;
-		p = &s->isr.reg_u32[offset];
+		p = s->isr + offset * 4;
 		break;
 	case EIOINTC_COREISR_START ... EIOINTC_COREISR_END:
 		if (cpu >= s->num_cpu)
 			return -EINVAL;
 		offset = (addr - EIOINTC_COREISR_START) / 4;
-		p = &s->coreisr.reg_u32[cpu][offset];
+		p = s->coreisr[cpu] + offset * 4;
 		break;
 	case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END:
 		offset = (addr - EIOINTC_COREMAP_START) / 4;
-		p = &s->coremap.reg_u32[offset];
+		p = s->coremap + offset * 4;
 		break;
 	default:
 		kvm_err("%s: unknown eiointc register, addr = %d\n", __func__, addr);

From 985a96983bedf04fa61315e68806f3468450c8a1 Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Thu, 27 Nov 2025 11:00:18 +0800
Subject: [PATCH 233/260] KVM: LoongArch: selftests: Add system registers
 save/restore on exception

When the system returns from an exception with the ertn instruction,
the PC comes from LOONGARCH_CSR_ERA and CSR.CRMD comes from
LOONGARCH_CSR_PRMD. Save the CSR registers CSR.ERA and CSR.PRMD onto
the stack, and then restore them from the stack, so that they can be
modified by exception handlers in the future.

Signed-off-by: Bibo Mao
Signed-off-by: Huacai Chen
---
 tools/testing/selftests/kvm/include/loongarch/processor.h | 5 ++++-
 tools/testing/selftests/kvm/lib/loongarch/exception.S     | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/include/loongarch/processor.h b/tools/testing/selftests/kvm/include/loongarch/processor.h
index 6427a3275e6a..374caddfb0db 100644
--- a/tools/testing/selftests/kvm/include/loongarch/processor.h
+++ b/tools/testing/selftests/kvm/include/loongarch/processor.h
@@ -124,18 +124,21 @@ struct ex_regs {
 	unsigned long pc;
 	unsigned long estat;
 	unsigned long badv;
+	unsigned long prmd;
 };

 #define PC_OFFSET_EXREGS	offsetof(struct ex_regs, pc)
 #define ESTAT_OFFSET_EXREGS	offsetof(struct ex_regs, estat)
 #define BADV_OFFSET_EXREGS	offsetof(struct ex_regs, badv)
+#define PRMD_OFFSET_EXREGS	offsetof(struct ex_regs, prmd)
 #define EXREGS_SIZE		sizeof(struct ex_regs)

 #else
 #define PC_OFFSET_EXREGS	((EXREGS_GPRS + 0) * 8)
 #define ESTAT_OFFSET_EXREGS	((EXREGS_GPRS + 1) * 8)
 #define BADV_OFFSET_EXREGS	((EXREGS_GPRS + 2) * 8)
-#define EXREGS_SIZE		((EXREGS_GPRS + 3) * 8)
+#define PRMD_OFFSET_EXREGS	((EXREGS_GPRS + 3) * 8)
+#define EXREGS_SIZE		((EXREGS_GPRS + 4) * 8)
 #endif

 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/lib/loongarch/exception.S b/tools/testing/selftests/kvm/lib/loongarch/exception.S
index 88bfa505c6f5..3f1e4b67c5ae 100644
--- a/tools/testing/selftests/kvm/lib/loongarch/exception.S
+++ b/tools/testing/selftests/kvm/lib/loongarch/exception.S
@@ -51,9 +51,15 @@ handle_exception:
 	st.d	t0, sp, ESTAT_OFFSET_EXREGS
 	csrrd	t0, LOONGARCH_CSR_BADV
 	st.d	t0, sp, BADV_OFFSET_EXREGS
+	csrrd	t0, LOONGARCH_CSR_PRMD
+	st.d	t0, sp, PRMD_OFFSET_EXREGS

 	or	a0, sp, zero
 	bl	route_exception
+	ld.d	t0, sp, PC_OFFSET_EXREGS
+	csrwr	t0, LOONGARCH_CSR_ERA
+	ld.d	t0, sp, PRMD_OFFSET_EXREGS
+	csrwr	t0, LOONGARCH_CSR_PRMD
 	restore_gprs sp
 	csrrd	sp, LOONGARCH_CSR_KS0
 	ertn

From 1c5d3a1eab32db1ebb0d5d30736e9236e8a7014f Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Thu, 27 Nov 2025 11:00:18 +0800
Subject: [PATCH 234/260] KVM: LoongArch:
selftests: Add basic interfaces

Add some basic function interfaces, such as CSR register access and
local IRQ enable/disable APIs.

Signed-off-by: Bibo Mao
Signed-off-by: Huacai Chen
---
 .../kvm/include/loongarch/processor.h         | 50 +++++++++++++++++++
 .../selftests/kvm/lib/loongarch/processor.c   |  5 ++
 2 files changed, 55 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/loongarch/processor.h b/tools/testing/selftests/kvm/include/loongarch/processor.h
index 374caddfb0db..f1bfc06a2264 100644
--- a/tools/testing/selftests/kvm/include/loongarch/processor.h
+++ b/tools/testing/selftests/kvm/include/loongarch/processor.h
@@ -113,6 +113,28 @@
 #define CSR_TLBREHI_PS_SHIFT	0
 #define CSR_TLBREHI_PS		(0x3fUL << CSR_TLBREHI_PS_SHIFT)

+#define csr_read(csr)					\
+({							\
+	register unsigned long __v;			\
+	__asm__ __volatile__(				\
+		"csrrd %[val], %[reg]\n\t"		\
+		: [val] "=r" (__v)			\
+		: [reg] "i" (csr)			\
+		: "memory");				\
+	__v;						\
+})
+
+#define csr_write(v, csr)				\
+({							\
+	register unsigned long __v = v;			\
+	__asm__ __volatile__ (				\
+		"csrwr %[val], %[reg]\n\t"		\
+		: [val] "+r" (__v)			\
+		: [reg] "i" (csr)			\
+		: "memory");				\
+	__v;						\
+})
+
 #define EXREGS_GPRS	(32)

 #ifndef __ASSEMBLER__
@@ -133,6 +155,34 @@ struct ex_regs {
 #define PRMD_OFFSET_EXREGS	offsetof(struct ex_regs, prmd)
 #define EXREGS_SIZE		sizeof(struct ex_regs)

+static inline void cpu_relax(void)
+{
+	asm volatile("nop" ::: "memory");
+}
+
+static inline void local_irq_enable(void)
+{
+	unsigned int flags = CSR_CRMD_IE;
+	register unsigned int mask asm("$t0") = CSR_CRMD_IE;
+
+	__asm__ __volatile__(
+		"csrxchg %[val], %[mask], %[reg]\n\t"
+		: [val] "+r" (flags)
+		: [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD)
+		: "memory");
+}
+
+static inline void local_irq_disable(void)
+{
+	unsigned int flags = 0;
+	register unsigned int mask asm("$t0") = CSR_CRMD_IE;
+
+	__asm__ __volatile__(
+		"csrxchg %[val], %[mask], %[reg]\n\t"
+		: [val] "+r" (flags)
+		: [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD)
+		: "memory");
+}
 #else
 #define PC_OFFSET_EXREGS	((EXREGS_GPRS + 0) * 8)
 #define ESTAT_OFFSET_EXREGS	((EXREGS_GPRS + 1) * 8)
diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c
index 0ac1abcb71cb..08b4cef48e44 100644
--- a/tools/testing/selftests/kvm/lib/loongarch/processor.c
+++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c
@@ -192,6 +192,11 @@ void route_exception(struct ex_regs *regs)
 	while (1) ;
 }

+uint32_t guest_get_vcpuid(void)
+{
+	return csr_read(LOONGARCH_CSR_CPUID);
+}
+
 void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
 {
 	int i;

From d84fe2f30b0a0cbe08260c00a84ffe42161c95ce Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Thu, 27 Nov 2025 11:00:18 +0800
Subject: [PATCH 235/260] KVM: LoongArch: selftests: Add exception handler
 register interface

Add an interrupt and exception handler registration interface. When an
exception happens, execute the registered exception handler if one
exists; otherwise report an error.
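Typical usage in a test then looks like the following (an illustrative
fragment; guest_irq_handler and test_setup are placeholder names, while
the two helpers and EXCCODE_INT are the ones added by this patch, and
the arch_timer test added later in this series uses exactly this
pattern):

    /* Hypothetical test setup using the registration interface below. */
    static void guest_irq_handler(struct ex_regs *regs)
    {
        /* e.g. acknowledge a timer interrupt and count it */
    }

    static void test_setup(struct kvm_vm *vm)
    {
        vm_init_descriptor_tables(vm);
        vm_install_exception_handler(vm, EXCCODE_INT, guest_irq_handler);
    }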
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- .../kvm/include/loongarch/processor.h | 16 ++++++++++ .../selftests/kvm/lib/loongarch/processor.c | 29 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/tools/testing/selftests/kvm/include/loongarch/processor.h b/tools/testing/selftests/kvm/include/loongarch/processor.h index f1bfc06a2264..a1930f28e044 100644 --- a/tools/testing/selftests/kvm/include/loongarch/processor.h +++ b/tools/testing/selftests/kvm/include/loongarch/processor.h @@ -84,6 +84,11 @@ #define LOONGARCH_CSR_EUEN 0x2 #define LOONGARCH_CSR_ECFG 0x4 #define LOONGARCH_CSR_ESTAT 0x5 /* Exception status */ +#define CSR_ESTAT_EXC_SHIFT 16 +#define CSR_ESTAT_EXC_WIDTH 6 +#define CSR_ESTAT_EXC (0x3f << CSR_ESTAT_EXC_SHIFT) +#define EXCCODE_INT 0 /* Interrupt */ +#define INT_TI 11 /* Timer interrupt*/ #define LOONGARCH_CSR_ERA 0x6 /* ERA */ #define LOONGARCH_CSR_BADV 0x7 /* Bad virtual address */ #define LOONGARCH_CSR_EENTRY 0xc @@ -155,6 +160,17 @@ struct ex_regs { #define PRMD_OFFSET_EXREGS offsetof(struct ex_regs, prmd) #define EXREGS_SIZE sizeof(struct ex_regs) +#define VECTOR_NUM 64 + +typedef void(*handler_fn)(struct ex_regs *); + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM]; +}; + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vm_install_exception_handler(struct kvm_vm *vm, int vector, handler_fn handler); + static inline void cpu_relax(void) { asm volatile("nop" ::: "memory"); diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c index 08b4cef48e44..b2a1fa7b18da 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/processor.c +++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c @@ -11,6 +11,7 @@ #define LOONGARCH_GUEST_STACK_VADDR_MIN 0x200000 static vm_paddr_t invalid_pgtable[4]; +static vm_vaddr_t exception_handlers; static uint64_t virt_pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) { @@ -183,7 +184,14 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) void route_exception(struct ex_regs *regs) { + int vector; unsigned long pc, estat, badv; + struct handlers *handlers; + + handlers = (struct handlers *)exception_handlers; + vector = (regs->estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; + if (handlers && handlers->exception_handlers[vector]) + return handlers->exception_handlers[vector](regs); pc = regs->pc; badv = regs->badv; @@ -192,6 +200,27 @@ void route_exception(struct ex_regs *regs) while (1) ; } +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + void *addr; + + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), + LOONGARCH_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA); + + addr = addr_gva2hva(vm, vm->handlers); + memset(addr, 0, vm->page_size); + exception_handlers = vm->handlers; + sync_global_to_guest(vm, exception_handlers); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, handler_fn handler) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector] = handler; +} + uint32_t guest_get_vcpuid(void) { return csr_read(LOONGARCH_CSR_CPUID); From c067847c52e26eceed9f8a938c04456880c486fa Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Wed, 26 Nov 2025 16:33:10 +1100 Subject: [PATCH 236/260] KVM: s390: Add signal_exits counter Add a signal_exits counter for s390, as exists on arm64, loongarch, mips, powerpc, riscv and x86. 
This is used by kvm_handle_signal_exit(), which we will start using
when we later enable CONFIG_VIRT_XFER_TO_GUEST_WORK.

Signed-off-by: Andrew Donnellan
Reviewed-by: Janosch Frank
Signed-off-by: Janosch Frank
---
 arch/s390/include/asm/kvm_host.h | 1 +
 arch/s390/kvm/kvm-s390.c         | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 1e4829c70216..ae1223264d3c 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -146,6 +146,7 @@ struct kvm_vcpu_stat {
 	u64 instruction_diagnose_500;
 	u64 instruction_diagnose_other;
 	u64 pfault_sync;
+	u64 signal_exits;
 };

 #define PGM_OPERATION	0x01
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 56d4730b7c41..8db37e508a71 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -185,7 +185,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
 	STATS_DESC_COUNTER(VCPU, instruction_diagnose_308),
 	STATS_DESC_COUNTER(VCPU, instruction_diagnose_500),
 	STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
-	STATS_DESC_COUNTER(VCPU, pfault_sync)
+	STATS_DESC_COUNTER(VCPU, pfault_sync),
+	STATS_DESC_COUNTER(VCPU, signal_exits)
 };

 const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -5251,6 +5252,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)

 	if (signal_pending(current) && !rc) {
 		kvm_run->exit_reason = KVM_EXIT_INTR;
+		vcpu->stat.signal_exits++;
 		rc = -EINTR;
 	}

From d0139059e31acd5fea49737558297d801c406638 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Wed, 26 Nov 2025 16:33:11 +1100
Subject: [PATCH 237/260] KVM: s390: Enable and disable interrupts in entry
 code

Move enabling and disabling of interrupts around the SIE instruction
to entry code. Enabling interrupts only after the __TI_sie flag has
been set guarantees that the SIE instruction is not executed if an
interrupt happens between enabling interrupts and the execution of
the SIE instruction. Interrupt handlers and the machine check handler
forward the PSW to the sie_exit label in such cases.

This is a prerequisite for VIRT_XFER_TO_GUEST_WORK, to prevent guest
context from being entered when e.g. a scheduler IPI, indicating that
a reschedule is required, arrives right before the SIE instruction,
which could lead to long delays.
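In rough C-like pseudocode (a sketch of the ordering argument only;
the real code is the entry.S change below):

    /*
     * Pseudocode sketch of the window being closed:
     *
     *	ti->sie = 1;		// __TI_sie set first
     *	local_irq_enable();	// stosm: IRQs on only after the flag
     *	sie();			// the SIE instruction itself
     *	ti->sie = 0;
     * sie_exit:
     *	local_irq_disable();	// stnsm
     *
     * An interrupt arriving between local_irq_enable() and sie() finds
     * ti->sie == 1 and forwards the return PSW to sie_exit, so SIE is
     * never executed with work pending.
     */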
Signed-off-by: Heiko Carstens
Tested-by: Andrew Donnellan
Signed-off-by: Andrew Donnellan
Reviewed-by: Janosch Frank
Signed-off-by: Janosch Frank
---
 arch/s390/include/asm/stacktrace.h | 1 +
 arch/s390/kernel/asm-offsets.c     | 1 +
 arch/s390/kernel/entry.S           | 2 ++
 arch/s390/kvm/kvm-s390.c           | 5 -----
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index 810a6b9d9628..c9ae680a28af 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -66,6 +66,7 @@ struct stack_frame {
 			unsigned long sie_flags;
 			unsigned long sie_control_block_phys;
 			unsigned long sie_guest_asce;
+			unsigned long sie_irq;
 		};
 	};
 	unsigned long gprs[10];
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index a8915663e917..730449f464af 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -64,6 +64,7 @@ int main(void)
 	OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
 	OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys);
 	OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce);
+	OFFSET(__SF_SIE_IRQ, stack_frame, sie_irq);
 	DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
 	BLANK();
 	OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 75b0fbb236d0..e906f4ab6cf3 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -189,6 +189,7 @@ SYM_FUNC_START(__sie64a)
 	mvc	__SF_SIE_FLAGS(8,%r15),__TI_flags(%r14)	# copy thread flags
 	lmg	%r0,%r13,0(%r4)			# load guest gprs 0-13
 	mvi	__TI_sie(%r14),1
+	stosm	__SF_SIE_IRQ(%r15),0x03		# enable interrupts
 	lctlg	%c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce
 	lg	%r14,__SF_SIE_CONTROL(%r15)	# get control block pointer
 	oi	__SIE_PROG0C+3(%r14),1		# we are going into SIE now
@@ -212,6 +213,7 @@ SYM_FUNC_START(__sie64a)
 	lg	%r14,__LC_CURRENT(%r14)
 	mvi	__TI_sie(%r14),0
 SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL)
+	stnsm	__SF_SIE_IRQ(%r15),0xfc		# disable interrupts
 	lg	%r14,__SF_SIE_SAVEAREA(%r15)	# load guest register save area
 	stmg	%r0,%r13,0(%r14)		# save guest gprs 0-13
 	xgr	%r0,%r0				# clear guest registers to
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8db37e508a71..4d13601ec217 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4962,13 +4962,8 @@ int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb,
 	 * The guest_state_{enter,exit}_irqoff() functions inform lockdep and
 	 * tracing that entry to the guest will enable host IRQs, and exit from
 	 * the guest will disable host IRQs.
-	 *
-	 * We must not use lockdep/tracing/RCU in this critical section, so we
-	 * use the low-level arch_local_irq_*() helpers to enable/disable IRQs.
 	 */
-	arch_local_irq_enable();
 	ret = sie64a(scb, gprs, gasce);
-	arch_local_irq_disable();

 	guest_state_exit_irqoff();

From df41742343fad11fde06e085096003d64599785f Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Fri, 28 Nov 2025 14:49:44 +0800
Subject: [PATCH 238/260] KVM: LoongArch: selftests: Add timer interrupt test
 case

Add a timer test case based on the common arch_timer code; the timer
interrupt is tested in both one-shot and periodic mode.
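The core of the new timer helpers is just composing CSR.TCFG; a
stand-alone sketch of the same computation (mirroring the defines added
below, with the (100UL << 20) frequency constant taken from the new
header):

    #include <stdint.h>
    #include <stdio.h>

    #define BIT_ULL(n)      (1ULL << (n))
    #define TIMER_FREQ      (100UL << 20)   /* as defined by the test header */
    #define CSR_TCFG_VAL    (BIT_ULL(48) - BIT_ULL(2))
    #define CSR_TCFG_PERIOD BIT_ULL(1)
    #define CSR_TCFG_EN     BIT_ULL(0)

    static uint64_t msec_to_cycles(unsigned int msec)
    {
        return TIMER_FREQ * (unsigned long)msec / 1000;
    }

    /* Value that timer_set_next_cmp_ms() would write into CSR.TCFG. */
    static uint64_t tcfg_value(unsigned int msec, int period)
    {
        uint64_t val = msec_to_cycles(msec) & CSR_TCFG_VAL;

        val |= CSR_TCFG_EN;
        if (period)
            val |= CSR_TCFG_PERIOD;
        return val;
    }

    int main(void)
    {
        printf("one-shot 10ms: 0x%llx\n", (unsigned long long)tcfg_value(10, 0));
        printf("periodic 10ms: 0x%llx\n", (unsigned long long)tcfg_value(10, 1));
        return 0;
    }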
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../kvm/include/loongarch/arch_timer.h | 85 ++++++++++++ .../kvm/include/loongarch/processor.h | 10 ++ .../selftests/kvm/lib/loongarch/processor.c | 4 +- .../selftests/kvm/loongarch/arch_timer.c | 130 ++++++++++++++++++ 5 files changed, 228 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/loongarch/arch_timer.h create mode 100644 tools/testing/selftests/kvm/loongarch/arch_timer.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..9d01f4d0e3f9 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -210,6 +210,7 @@ TEST_GEN_PROGS_riscv += mmu_stress_test TEST_GEN_PROGS_riscv += rseq_test TEST_GEN_PROGS_riscv += steal_time +TEST_GEN_PROGS_loongarch = arch_timer TEST_GEN_PROGS_loongarch += coalesced_io_test TEST_GEN_PROGS_loongarch += demand_paging_test TEST_GEN_PROGS_loongarch += dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/include/loongarch/arch_timer.h b/tools/testing/selftests/kvm/include/loongarch/arch_timer.h new file mode 100644 index 000000000000..2ed106b32c81 --- /dev/null +++ b/tools/testing/selftests/kvm/include/loongarch/arch_timer.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * LoongArch Constant Timer specific interface + */ +#ifndef SELFTEST_KVM_ARCH_TIMER_H +#define SELFTEST_KVM_ARCH_TIMER_H + +#include "processor.h" + +/* LoongArch timer frequency is constant 100MHZ */ +#define TIMER_FREQ (100UL << 20) +#define msec_to_cycles(msec) (TIMER_FREQ * (unsigned long)(msec) / 1000) +#define usec_to_cycles(usec) (TIMER_FREQ * (unsigned long)(usec) / 1000000) +#define cycles_to_usec(cycles) ((unsigned long)(cycles) * 1000000 / TIMER_FREQ) + +static inline unsigned long timer_get_cycles(void) +{ + unsigned long val = 0; + + __asm__ __volatile__( + "rdtime.d %0, $zero\n\t" + : "=r"(val) + : + ); + + return val; +} + +static inline unsigned long timer_get_cfg(void) +{ + return csr_read(LOONGARCH_CSR_TCFG); +} + +static inline unsigned long timer_get_val(void) +{ + return csr_read(LOONGARCH_CSR_TVAL); +} + +static inline void disable_timer(void) +{ + csr_write(0, LOONGARCH_CSR_TCFG); +} + +static inline void timer_irq_enable(void) +{ + unsigned long val; + + val = csr_read(LOONGARCH_CSR_ECFG); + val |= ECFGF_TIMER; + csr_write(val, LOONGARCH_CSR_ECFG); +} + +static inline void timer_irq_disable(void) +{ + unsigned long val; + + val = csr_read(LOONGARCH_CSR_ECFG); + val &= ~ECFGF_TIMER; + csr_write(val, LOONGARCH_CSR_ECFG); +} + +static inline void timer_set_next_cmp_ms(unsigned int msec, bool period) +{ + unsigned long val; + + val = msec_to_cycles(msec) & CSR_TCFG_VAL; + val |= CSR_TCFG_EN; + if (period) + val |= CSR_TCFG_PERIOD; + csr_write(val, LOONGARCH_CSR_TCFG); +} + +static inline void __delay(uint64_t cycles) +{ + uint64_t start = timer_get_cycles(); + + while ((timer_get_cycles() - start) < cycles) + cpu_relax(); +} + +static inline void udelay(unsigned long usec) +{ + __delay(usec_to_cycles(usec)); +} +#endif /* SELFTEST_KVM_ARCH_TIMER_H */ diff --git a/tools/testing/selftests/kvm/include/loongarch/processor.h b/tools/testing/selftests/kvm/include/loongarch/processor.h index a1930f28e044..76840ddda57d 100644 --- a/tools/testing/selftests/kvm/include/loongarch/processor.h +++ b/tools/testing/selftests/kvm/include/loongarch/processor.h @@ -83,6 +83,8 @@ #define LOONGARCH_CSR_PRMD 0x1 #define 
LOONGARCH_CSR_EUEN 0x2 #define LOONGARCH_CSR_ECFG 0x4 +#define ECFGB_TIMER 11 +#define ECFGF_TIMER (BIT_ULL(ECFGB_TIMER)) #define LOONGARCH_CSR_ESTAT 0x5 /* Exception status */ #define CSR_ESTAT_EXC_SHIFT 16 #define CSR_ESTAT_EXC_WIDTH 6 @@ -111,6 +113,14 @@ #define LOONGARCH_CSR_KS1 0x31 #define LOONGARCH_CSR_TMID 0x40 #define LOONGARCH_CSR_TCFG 0x41 +#define CSR_TCFG_VAL (BIT_ULL(48) - BIT_ULL(2)) +#define CSR_TCFG_PERIOD_SHIFT 1 +#define CSR_TCFG_PERIOD (0x1UL << CSR_TCFG_PERIOD_SHIFT) +#define CSR_TCFG_EN (0x1UL) +#define LOONGARCH_CSR_TVAL 0x42 +#define LOONGARCH_CSR_TINTCLR 0x44 /* Timer interrupt clear */ +#define CSR_TINTCLR_TI_SHIFT 0 +#define CSR_TINTCLR_TI (1 << CSR_TINTCLR_TI_SHIFT) /* TLB refill exception entry */ #define LOONGARCH_CSR_TLBRENTRY 0x88 #define LOONGARCH_CSR_TLBRSAVE 0x8b diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c index b2a1fa7b18da..a1b16140942b 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/processor.c +++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c @@ -276,8 +276,8 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - /* user mode and page enable mode */ - val = PLV_USER | CSR_CRMD_PG; + /* kernel mode and page enable mode */ + val = PLV_KERN | CSR_CRMD_PG; loongarch_set_csr(vcpu, LOONGARCH_CSR_CRMD, val); loongarch_set_csr(vcpu, LOONGARCH_CSR_PRMD, val); loongarch_set_csr(vcpu, LOONGARCH_CSR_EUEN, 1); diff --git a/tools/testing/selftests/kvm/loongarch/arch_timer.c b/tools/testing/selftests/kvm/loongarch/arch_timer.c new file mode 100644 index 000000000000..6cc671671663 --- /dev/null +++ b/tools/testing/selftests/kvm/loongarch/arch_timer.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The test validates periodic/one-shot constant timer IRQ using + * CSR.TCFG and CSR.TVAL registers. 
+ */ +#include "arch_timer.h" +#include "kvm_util.h" +#include "processor.h" +#include "timer_test.h" +#include "ucall_common.h" + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid; + uint32_t cpu = guest_get_vcpuid(); + uint64_t xcnt, val, cfg, xcnt_diff_us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + intid = !!(regs->estat & BIT(INT_TI)); + + /* Make sure we are dealing with the correct timer IRQ */ + GUEST_ASSERT_EQ(intid, 1); + + cfg = timer_get_cfg(); + if (cfg & CSR_TCFG_PERIOD) { + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter - 1); + if (shared_data->nr_iter == 0) + disable_timer(); + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + return; + } + + /* + * On real machine, value of LOONGARCH_CSR_TVAL is BIT_ULL(48) - 1 + * On virtual machine, its value counts down from BIT_ULL(48) - 1 + */ + val = timer_get_val(); + xcnt = timer_get_cycles(); + xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); + + /* Basic 'timer condition met' check */ + __GUEST_ASSERT(val > cfg, + "val = 0x%lx, cfg = 0x%lx, xcnt_diff_us = 0x%lx", + val, cfg, xcnt_diff_us); + + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); +} + +static void guest_test_period_timer(uint32_t cpu) +{ + uint32_t irq_iter, config_iter; + uint64_t us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + shared_data->nr_iter = test_args.nr_iter; + shared_data->xcnt = timer_get_cycles(); + us = msecs_to_usecs(test_args.timer_period_ms) + test_args.timer_err_margin_us; + timer_set_next_cmp_ms(test_args.timer_period_ms, true); + + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + /* Setup a timeout for the interrupt to arrive */ + udelay(us); + } + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(irq_iter == 0, + "irq_iter = 0x%x.\n" + " Guest period timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + irq_iter); +} + +static void guest_test_oneshot_timer(uint32_t cpu) +{ + uint32_t irq_iter, config_iter; + uint64_t us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + shared_data->nr_iter = 0; + shared_data->guest_stage = 0; + us = msecs_to_usecs(test_args.timer_period_ms) + test_args.timer_err_margin_us; + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + shared_data->xcnt = timer_get_cycles(); + + /* Setup the next interrupt */ + timer_set_next_cmp_ms(test_args.timer_period_ms, false); + /* Setup a timeout for the interrupt to arrive */ + udelay(us); + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(config_iter + 1 == irq_iter, + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" + " Guest timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + config_iter + 1, irq_iter); + } +} + +static void guest_code(void) +{ + uint32_t cpu = guest_get_vcpuid(); + + timer_irq_enable(); + local_irq_enable(); + guest_test_period_timer(cpu); + guest_test_oneshot_timer(cpu); + + GUEST_DONE(); +} + +struct kvm_vm *test_vm_create(void) +{ + struct kvm_vm *vm; + int nr_vcpus = test_args.nr_vcpus; + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + vm_init_descriptor_tables(vm); + vm_install_exception_handler(vm, EXCCODE_INT, guest_irq_handler); + + /* Make all the test's cmdline args visible to the guest */ + sync_global_to_guest(vm, test_args); + + 
return vm;
+}
+
+void test_vm_cleanup(struct kvm_vm *vm)
+{
+	kvm_vm_free(vm);
+}

From 4e8824094069b04e3b3583d855c975ccb6a9bec5 Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Fri, 28 Nov 2025 14:49:47 +0800
Subject: [PATCH 239/260] KVM: LoongArch: selftests: Add SW emulated timer
 test case

This test case sets up a one-shot timer and then immediately executes
the idle instruction to give up the CPU; the hypervisor emulates a SW
hrtimer and wakes up the vCPU when the SW hrtimer fires.

Signed-off-by: Bibo Mao
Signed-off-by: Huacai Chen
---
 .../selftests/kvm/loongarch/arch_timer.c      | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tools/testing/selftests/kvm/loongarch/arch_timer.c b/tools/testing/selftests/kvm/loongarch/arch_timer.c
index 6cc671671663..baa30fd296f5 100644
--- a/tools/testing/selftests/kvm/loongarch/arch_timer.c
+++ b/tools/testing/selftests/kvm/loongarch/arch_timer.c
@@ -9,6 +9,21 @@
 #include "timer_test.h"
 #include "ucall_common.h"

+static void do_idle(void)
+{
+	unsigned int intid;
+	unsigned long estat;
+
+	__asm__ __volatile__("idle 0" : : : "memory");
+
+	estat = csr_read(LOONGARCH_CSR_ESTAT);
+	intid = !!(estat & BIT(INT_TI));
+
+	/* Make sure pending timer IRQ arrived */
+	GUEST_ASSERT_EQ(intid, 1);
+	csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR);
+}
+
 static void guest_irq_handler(struct ex_regs *regs)
 {
 	unsigned int intid;
@@ -97,6 +112,30 @@ static void guest_test_oneshot_timer(uint32_t cpu)
 	}
 }

+static void guest_test_emulate_timer(uint32_t cpu)
+{
+	uint32_t config_iter;
+	uint64_t xcnt_diff_us, us;
+	struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu];
+
+	local_irq_disable();
+	shared_data->nr_iter = 0;
+	us = msecs_to_usecs(test_args.timer_period_ms);
+	for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) {
+		shared_data->xcnt = timer_get_cycles();
+
+		/* Setup the next interrupt */
+		timer_set_next_cmp_ms(test_args.timer_period_ms, false);
+		do_idle();
+
+		xcnt_diff_us = cycles_to_usec(timer_get_cycles() - shared_data->xcnt);
+		__GUEST_ASSERT(xcnt_diff_us >= us,
+			       "xcnt_diff_us = 0x%lx, us = 0x%lx.\n",
+			       xcnt_diff_us, us);
+	}
+	local_irq_enable();
+}
+
 static void guest_code(void)
 {
 	uint32_t cpu = guest_get_vcpuid();
@@ -105,6 +144,7 @@ static void guest_code(void)
 	local_irq_enable();
 	guest_test_period_timer(cpu);
 	guest_test_oneshot_timer(cpu);
+	guest_test_emulate_timer(cpu);

 	GUEST_DONE();
 }

From 0f90fa6e2e9d98349492d9968c11ceaf2f958c98 Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Fri, 28 Nov 2025 14:49:48 +0800
Subject: [PATCH 240/260] KVM: LoongArch: selftests: Add time counter test case

The time counter test verifies that the time count starts from 0 and
then increases monotonically.

Signed-off-by: Bibo Mao
Signed-off-by: Huacai Chen
---
 .../selftests/kvm/lib/loongarch/processor.c   |  9 ++++++
 .../selftests/kvm/loongarch/arch_timer.c      | 30 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c
index a1b16140942b..07c103369ddb 100644
--- a/tools/testing/selftests/kvm/lib/loongarch/processor.c
+++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c
@@ -3,6 +3,7 @@

 #include
 #include
+#include
 #include "kvm_util.h"
 #include "processor.h"
 #include "ucall_common.h"
@@ -245,6 +246,11 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
vcpu_regs_set(vcpu, ®s); } +static void loongarch_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + __vcpu_set_reg(vcpu, id, val); +} + static void loongarch_get_csr(struct kvm_vcpu *vcpu, uint64_t id, void *addr) { uint64_t csrid; @@ -285,7 +291,10 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) loongarch_set_csr(vcpu, LOONGARCH_CSR_TCFG, 0); loongarch_set_csr(vcpu, LOONGARCH_CSR_ASID, 1); + /* time count start from 0 */ val = 0; + loongarch_set_reg(vcpu, KVM_REG_LOONGARCH_COUNTER, val); + width = vm->page_shift - 3; switch (vm->pgtable_levels) { diff --git a/tools/testing/selftests/kvm/loongarch/arch_timer.c b/tools/testing/selftests/kvm/loongarch/arch_timer.c index baa30fd296f5..355ecac30954 100644 --- a/tools/testing/selftests/kvm/loongarch/arch_timer.c +++ b/tools/testing/selftests/kvm/loongarch/arch_timer.c @@ -136,10 +136,40 @@ static void guest_test_emulate_timer(uint32_t cpu) local_irq_enable(); } +static void guest_time_count_test(uint32_t cpu) +{ + uint32_t config_iter; + unsigned long start, end, prev, us; + + /* Assuming that test case starts to run in 1 second */ + start = timer_get_cycles(); + us = msec_to_cycles(1000); + __GUEST_ASSERT(start <= us, + "start = 0x%lx, us = 0x%lx.\n", + start, us); + + us = msec_to_cycles(test_args.timer_period_ms); + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + start = timer_get_cycles(); + end = start + us; + /* test time count growing up always */ + while (start < end) { + prev = start; + start = timer_get_cycles(); + __GUEST_ASSERT(prev <= start, + "prev = 0x%lx, start = 0x%lx.\n", + prev, start); + } + } +} + static void guest_code(void) { uint32_t cpu = guest_get_vcpuid(); + /* must run at first */ + guest_time_count_test(cpu); + timer_irq_enable(); local_irq_enable(); guest_test_period_timer(cpu); From 2bd1337a1295e012e60008ee21a64375e5234e12 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Wed, 26 Nov 2025 16:33:12 +1100 Subject: [PATCH 241/260] KVM: s390: Use generic VIRT_XFER_TO_GUEST_WORK functions Switch to using the generic infrastructure to check for and handle pending work before transitioning into guest mode. xfer_to_guest_mode_handle_work() does a few more things than the current code does when deciding whether or not to exit the __vcpu_run() loop. The exittime tests from kvm-unit-tests, in my tests, were within a few percent compared to before this series, which is within noise tolerance. Co-developed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Andrew Donnellan Acked-by: Janosch Frank [frankja@linux.ibm.com: Removed semicolon] Signed-off-by: Janosch Frank --- arch/s390/kvm/Kconfig | 1 + arch/s390/kvm/kvm-s390.c | 25 ++++++++++++++++++------- arch/s390/kvm/vsie.c | 18 +++++++++++++----- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..0ca9d6587243 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -30,6 +30,7 @@ config KVM select HAVE_KVM_NO_POLL select KVM_VFIO select MMU_NOTIFIER + select VIRT_XFER_TO_GUEST_WORK help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. 
This should work diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4d13601ec217..d31155e371df 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include +#include #include #include #include @@ -4675,9 +4676,6 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14]; vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15]; - if (need_resched()) - schedule(); - if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); if (rc || guestdbg_exit_pending(vcpu)) @@ -4982,12 +4980,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) */ kvm_vcpu_srcu_read_lock(vcpu); - do { + while (true) { rc = vcpu_pre_run(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); if (rc || guestdbg_exit_pending(vcpu)) break; - kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between * guest_timing_enter_irqoff and guest_timing_exit_irqoff @@ -4999,7 +4997,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) sizeof(sie_page->pv_grregs)); } +xfer_to_guest_mode_check: local_irq_disable(); + xfer_to_guest_mode_prepare(); + if (xfer_to_guest_mode_work_pending()) { + local_irq_enable(); + rc = kvm_xfer_to_guest_mode_handle_work(vcpu); + if (rc) + break; + goto xfer_to_guest_mode_check; + } + guest_timing_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); @@ -5029,9 +5037,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_lock(vcpu); rc = vcpu_post_run(vcpu, exit_reason); - } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); + if (rc || guestdbg_exit_pending(vcpu)) { + kvm_vcpu_srcu_read_unlock(vcpu); + break; + } + } - kvm_vcpu_srcu_read_unlock(vcpu); return rc; } diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d23ab5120888..b526621d2a1b 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1180,12 +1180,23 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) current->thread.gmap_int_code = 0; barrier(); if (!kvm_s390_vcpu_sie_inhibited(vcpu)) { +xfer_to_guest_mode_check: local_irq_disable(); + xfer_to_guest_mode_prepare(); + if (xfer_to_guest_mode_work_pending()) { + local_irq_enable(); + rc = kvm_xfer_to_guest_mode_handle_work(vcpu); + if (rc) + goto skip_sie; + goto xfer_to_guest_mode_check; + } guest_timing_enter_irqoff(); rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce); guest_timing_exit_irqoff(); local_irq_enable(); } + +skip_sie: barrier(); vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE; @@ -1345,13 +1356,11 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * but rewind the PSW to re-enter SIE once that's completed * instead of passing a "no action" intercept to the guest. 
*/ - if (signal_pending(current) || - kvm_s390_vcpu_has_irq(vcpu, 0) || + if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) { kvm_s390_rewind_psw(vcpu, 4); break; } - cond_resched(); } if (rc == -EFAULT) { @@ -1483,8 +1492,7 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (unlikely(scb_addr & 0x1ffUL)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || - kvm_s390_vcpu_sie_inhibited(vcpu)) { + if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) { kvm_s390_rewind_psw(vcpu, 4); return 0; } From 6b49f70022ed607bab66da60c7b332f39cda4ff1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:46 -0800 Subject: [PATCH 242/260] KVM: arm64: Teach ptdump about FEAT_XNX permissions Although KVM doesn't make direct use of the feature, guest hypervisors can use FEAT_XNX which influences the permissions of the shadow stage-2. Update ptdump to separately print the privileged and unprivileged execute permissions. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-5-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/ptdump.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c index dc5acfb00af9..6cbe018fd6fd 100644 --- a/arch/arm64/kvm/ptdump.c +++ b/arch/arm64/kvm/ptdump.c @@ -31,27 +31,46 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = { .val = PTE_VALID, .set = " ", .clear = "F", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, .set = "R", .clear = " ", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, .set = "W", .clear = " ", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, - .val = KVM_PTE_LEAF_ATTR_HI_S2_XN, - .set = "NX", - .clear = "x ", - }, { + .val = 0b00UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px ux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b01UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b10UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNUXN", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b11UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px UXN", + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF, .val = KVM_PTE_LEAF_ATTR_LO_S2_AF, .set = "AF", .clear = " ", - }, { + }, + { .mask = PMD_TYPE_MASK, .val = PMD_TYPE_SECT, .set = "BLK", From 692650bd7b12532798c2022cb53869a07a288fbe Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:47 -0800 Subject: [PATCH 243/260] KVM: arm64: nv: Advertise support for FEAT_XNX Everything is in place to support FEAT_XNX, advertise support. 
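For reference, the four stage-2 XN[1:0] encodings that the ptdump
change above now distinguishes can be decoded with a small helper (a
stand-alone sketch mirroring the strings in the ptdump table):

    #include <stdio.h>

    /*
     * Stage-2 XN[1:0] encodings with FEAT_XNX, matching the strings
     * used by the ptdump change above (px/ux = executable, PXN/UXN = not).
     */
    static const char *s2_xn_str(unsigned int xn)
    {
        switch (xn & 0x3) {
        case 0: return "px ux ";  /* executable at EL1 and EL0 */
        case 1: return "PXNux ";  /* PXN: EL0-only executable */
        case 2: return "PXNUXN";  /* not executable at either EL */
        case 3: return "px UXN";  /* UXN: EL1-only executable */
        }
        return "?";
    }

    int main(void)
    {
        for (unsigned int xn = 0; xn < 4; xn++)
            printf("XN=%u -> %s\n", xn, s2_xn_str(xn));
        return 0;
    }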
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-6-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 92b2a69f0b89..08839a320a45 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1559,7 +1559,6 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) val &= ~(ID_AA64MMFR1_EL1_CMOW | ID_AA64MMFR1_EL1_nTLBPA | ID_AA64MMFR1_EL1_ETS | - ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_HAFDBS); /* FEAT_E2H0 implies no VHE */ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) From cdba9da34b145eb2f3c502279a454f9a1a8346c1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:48 -0800 Subject: [PATCH 244/260] KVM: arm64: Call helper for reading descriptors directly Going through a function pointer doesn't serve much purpose when there's only one implementation. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-7-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 08839a320a45..e4928a6a3672 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -124,7 +124,6 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) } struct s2_walk_info { - int (*read_desc)(phys_addr_t pa, u64 *desc, void *data); void *data; u64 baddr; unsigned int max_oa_bits; @@ -199,6 +198,13 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) return 0; } +static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) +{ + struct kvm_vcpu *vcpu = data; + + return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); +} + /* * This is essentially a C-version of the pseudo code from the ARM ARM * AArch64.TranslationTableWalk function. I strongly recommend looking at @@ -257,7 +263,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, >> (addr_bottom - 3); paddr = base_addr | index; - ret = wi->read_desc(paddr, &desc, wi->data); + ret = read_guest_s2_desc(paddr, &desc, wi->data); if (ret < 0) return ret; @@ -325,13 +331,6 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, return 0; } -static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) -{ - struct kvm_vcpu *vcpu = data; - - return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); -} - static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) { wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; @@ -364,7 +363,6 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, if (!vcpu_has_nv(vcpu)) return 0; - wi.read_desc = read_guest_s2_desc; wi.data = vcpu; wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); From 977d1bf15c5179276d93468abbc00a224908ff72 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:49 -0800 Subject: [PATCH 245/260] KVM: arm64: nv: Stop passing vCPU through void ptr in S2 PTW The stage-2 table walker passes down the vCPU as a void pointer. That might've made sense if the walker was generic although at this point it is clear this will only ever be used in the context of a vCPU. 
Suggested-by: Marc Zyngier Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-8-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index e4928a6a3672..94eff8307aad 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -124,7 +124,6 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) } struct s2_walk_info { - void *data; u64 baddr; unsigned int max_oa_bits; unsigned int pgshift; @@ -198,10 +197,8 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) return 0; } -static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) +static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc) { - struct kvm_vcpu *vcpu = data; - return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); } @@ -212,7 +209,7 @@ static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) * * Must be called with the kvm->srcu read lock held */ -static int walk_nested_s2_pgd(phys_addr_t ipa, +static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, struct s2_walk_info *wi, struct kvm_s2_trans *out) { int first_block_level, level, stride, input_size, base_lower_bound; @@ -263,7 +260,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, >> (addr_bottom - 3); paddr = base_addr | index; - ret = read_guest_s2_desc(paddr, &desc, wi->data); + ret = read_guest_s2_desc(vcpu, paddr, &desc); if (ret < 0) return ret; @@ -363,14 +360,13 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, if (!vcpu_has_nv(vcpu)) return 0; - wi.data = vcpu; wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); vtcr_to_walk_info(vtcr, &wi); wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; - ret = walk_nested_s2_pgd(gipa, &wi, result); + ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result); if (ret) result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC); From fabf321cba4be0d0dcbb39e97c3deb572fec2f8d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:50 -0800 Subject: [PATCH 246/260] KVM: arm64: Handle endianness in read helper for emulated PTW Implementing FEAT_HAFDBS means adding another descriptor accessor that needs to deal with the guest-configured endianness. Prepare by moving the endianness handling into the read accessor and out of the main body of the S1/S2 PTWs. 
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-9-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 25 +++++++++++++++++++------ arch/arm64/kvm/nested.c | 32 ++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index be26d5aa668c..a295a37dd3b1 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -362,6 +362,24 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return -EFAULT; } +static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, + struct s1_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { @@ -414,17 +432,12 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return ret; } - ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi); if (ret) { fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); return ret; } - if (wi->be) - desc = be64_to_cpu((__force __be64)desc); - else - desc = le64_to_cpu((__force __le64)desc); - /* Invalid descriptor */ if (!(desc & BIT(0))) goto transfault; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 94eff8307aad..75d26c0ba3e0 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -197,9 +197,26 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) return 0; } -static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc) +static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc, + struct s2_walk_info *wi) { - return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + /* + * Handle reversedescriptors if endianness differs between the + * host and the guest hypervisor. + */ + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; } /* @@ -260,19 +277,10 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, >> (addr_bottom - 3); paddr = base_addr | index; - ret = read_guest_s2_desc(vcpu, paddr, &desc); + ret = read_guest_s2_desc(vcpu, paddr, &desc, wi); if (ret < 0) return ret; - /* - * Handle reversedescriptors if endianness differs between the - * host and the guest hypervisor. - */ - if (wi->be) - desc = be64_to_cpu((__force __be64)desc); - else - desc = le64_to_cpu((__force __le64)desc); - /* Check for valid descriptor at this point */ if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); From 590e694820bfd70e3de78fcb98b16c98a905230e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:51 -0800 Subject: [PATCH 247/260] KVM: arm64: nv: Use pgtable definitions in stage-2 walk Use the existing page table definitions instead of magic numbers for the stage-2 table walk. 
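At a glance, the magic numbers and the named definitions that replace them in the hunk below (a sketch; the validity check is hoisted out first, so the block-type comparison is equivalent):

	desc & BIT(0)       ->  desc & KVM_PTE_VALID
	(desc & 3) == 1     ->  FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK
	desc & BIT(10)      ->  desc & KVM_PTE_LEAF_ATTR_LO_S2_AF
	desc & (0b01 << 6)  ->  desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R
	desc & (0b10 << 6)  ->  desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W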
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-10-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 75d26c0ba3e0..a096766c6ec3 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -282,14 +282,23 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return ret; /* Check for valid descriptor at this point */ - if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { + if (!(desc & KVM_PTE_VALID)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); out->desc = desc; return 1; } - /* We're at the final level or block translation level */ - if ((desc & 3) == 1 || level == 3) + if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) { + if (level < 3) + break; + + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + /* We're at the final level */ + if (level == 3) break; if (check_output_size(wi, desc)) { @@ -316,7 +325,7 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return 1; } - if (!(desc & BIT(10))) { + if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); out->desc = desc; return 1; @@ -329,8 +338,8 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, (ipa & GENMASK_ULL(addr_bottom - 1, 0)); out->output = paddr; out->block_size = 1UL << ((3 - level) * stride + wi->pgshift); - out->readable = desc & (0b01 << 6); - out->writable = desc & (0b10 << 6); + out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; out->level = level; out->desc = desc; return 0; From f6927b41d57390c597a126063e2e518911976878 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:52 -0800 Subject: [PATCH 248/260] KVM: arm64: Add helper for swapping guest descriptor Implementing FEAT_HAFDBS in KVM's software PTWs requires the ability to CAS a descriptor to update the in-memory value. Add an accessor to do exactly that, coping with the fact that guest descriptors are in user memory (duh). While FEAT_LSE is required on any system that implements NV, KVM now uses the stage-1 PTW for non-nested use cases meaning an LL/SC implementation is necessary as well.
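A sketch of the expected caller-side pattern (walker context and variable names are illustrative; the real callers land in the HAFDBS patches later in the series):

	/*
	 * CAS the updated descriptor back into guest memory. -EAGAIN
	 * means another agent changed the descriptor in the meantime;
	 * the walk is restarted from scratch rather than retried in
	 * place. Other errors (e.g. -EPERM for a read-only memslot)
	 * are fatal to the walk.
	 */
	ret = __kvm_at_swap_desc(vcpu->kvm, ipa, old_desc, new_desc);
	if (ret)
		return ret;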
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-11-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_nested.h | 2 + arch/arm64/kvm/at.c | 87 +++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 5d967b60414c..6dbc2908aed9 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -403,4 +403,6 @@ void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); (FIX_VNCR - __c); \ }) +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new); + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index a295a37dd3b1..581c4c49d9cd 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1650,3 +1650,90 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) return ret; } } + +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + u64 tmp = old; + int ret = 0; + + uaccess_enable_privileged(); + + asm volatile(__LSE_PREAMBLE + "1: cas %[old], %[new], %[addr]\n" + "2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) + : [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + if (ret) + return ret; + if (tmp != old) + return -EAGAIN; + + return ret; +} + +static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + int ret = 1; + u64 tmp; + + uaccess_enable_privileged(); + + asm volatile("prfm pstl1strm, %[addr]\n" + "1: ldxr %[tmp], %[addr]\n" + "sub %[tmp], %[tmp], %[old]\n" + "cbnz %[tmp], 3f\n" + "2: stlxr %w[ret], %[new], %[addr]\n" + "3:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) + : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp) + : [old] "r" (old), [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + /* STLXR didn't update the descriptor, or the compare failed */ + if (ret == 1) + return -EAGAIN; + + return ret; +} + +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + u64 __user *ptep; + bool writable; + int offset; + gfn_t gfn; + int r; + + lockdep_assert(srcu_read_lock_held(&kvm->srcu)); + + gfn = ipa >> PAGE_SHIFT; + offset = offset_in_page(ipa); + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return -EINVAL; + if (!writable) + return -EPERM; + + ptep = (u64 __user *)hva + offset; + if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) + r = __lse_swap_desc(ptep, old, new); + else + r = __llsc_swap_desc(ptep, old, new); + + if (r < 0) + return r; + + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} From 92c6443222ca4289191d797ac79176c560886998 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:53 -0800 Subject: [PATCH 249/260] KVM: arm64: Propagate PTW errors up to AT emulation KVM's software PTW will soon support 'hardware' updates to the access flag. Similar to fault handling, races to update the descriptor will be handled by restarting the instruction. Prepare for this by propagating errors up to the AT emulation, only retiring the instruction if the walk succeeds. 
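The restart falls out of the sys-reg dispatch: an access handler that returns false leaves the PC untouched, so the guest re-executes the AT instruction and the walk runs again. A simplified sketch of that dispatch (modeled on perform_access() in sys_regs.c):

	/* Retire the instruction only if emulation succeeded */
	if (likely(r->access(vcpu, params, r)))
		kvm_incr_pc(vcpu);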
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-12-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_asm.h | 6 ++--- arch/arm64/kvm/at.c | 43 ++++++++++++++++++++++---------- arch/arm64/kvm/sys_regs.c | 9 ++++--- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9da54d4ee49e..090f7b740bdc 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -246,9 +246,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding); extern void __kvm_timer_set_cntvoff(u64 cntvoff); -extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); -extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); -extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 581c4c49d9cd..2a99380ada6f 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1234,7 +1234,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu, wr->pr &= !pan; } -static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par) { struct s1_walk_result wr = {}; struct s1_walk_info wi = {}; @@ -1259,6 +1259,11 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) srcu_read_unlock(&vcpu->kvm->srcu, idx); + /* + * Race to update a descriptor -- restart the walk. + */ + if (ret == -EAGAIN) + return ret; if (ret) goto compute_par; @@ -1292,7 +1297,8 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); compute_par: - return compute_par_s1(vcpu, &wi, &wr); + *par = compute_par_s1(vcpu, &wi, &wr); + return 0; } /* @@ -1420,9 +1426,10 @@ static bool par_check_s1_access_fault(u64 par) !(par & SYS_PAR_EL1_S)); } -void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + int ret; /* * If PAR_EL1 reports that AT failed on a S1 permission or access @@ -1434,15 +1441,20 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) */ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par) && - !par_check_s1_access_fault(par)) - par = handle_at_slow(vcpu, op, vaddr); + !par_check_s1_access_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } -void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par; + int ret; /* * We've trapped, so everything is live on the CPU. 
As we will be @@ -1489,13 +1501,17 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) } /* We failed the translation, let's replay it in slow motion */ - if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) - par = handle_at_slow(vcpu, op, vaddr); + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } -void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct kvm_s2_trans out = {}; u64 ipa, par; @@ -1522,13 +1538,13 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) break; default: WARN_ON_ONCE(1); - return; + return 0; } __kvm_at_s1e01(vcpu, op, vaddr); par = vcpu_read_sys_reg(vcpu, PAR_EL1); if (par & SYS_PAR_EL1_F) - return; + return 0; /* * If we only have a single stage of translation (EL2&0), exit @@ -1536,14 +1552,14 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) */ if (compute_translation_regime(vcpu, op) == TR_EL20 || !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) - return; + return 0; /* Do the stage-2 translation */ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); out.esr = 0; ret = kvm_walk_nested_s2(vcpu, ipa, &out); if (ret < 0) - return; + return ret; /* Check the access permission */ if (!out.esr && @@ -1552,6 +1568,7 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) par = compute_par_s12(vcpu, par, &out); vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } /* diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e67eb39ddc11..61830eb3607c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3767,7 +3767,8 @@ static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - __kvm_at_s1e01(vcpu, op, p->regval); + if (__kvm_at_s1e01(vcpu, op, p->regval)) + return false; return true; } @@ -3784,7 +3785,8 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return false; } - __kvm_at_s1e2(vcpu, op, p->regval); + if (__kvm_at_s1e2(vcpu, op, p->regval)) + return false; return true; } @@ -3794,7 +3796,8 @@ static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - __kvm_at_s12(vcpu, op, p->regval); + if (__kvm_at_s12(vcpu, op, p->regval)) + return false; return true; } From bff8aa213dee742b09151a34494418050afed948 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:54 -0800 Subject: [PATCH 250/260] KVM: arm64: Implement HW access flag management in stage-1 SW PTW Atomically update the Access flag at stage-1 when the guest has configured the MMU to do so. Make the implementation choice (and liberal interpretation of speculation) that any access type updates the Access flag, including AT and CMO instructions. Restart the entire walk by returning to the exception-generating instruction in the case of a failed Access flag update. 
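Condensed, the update step added to the walker looks like the following (a sketch of the hunk below, with surrounding code elided):

	new_desc = desc;
	...
	if (wi->ha)
		new_desc |= PTE_AF;

	if (new_desc != desc) {
		/* CAS the descriptor; a failed update restarts the walk */
		ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
		if (ret)
			return ret;

		desc = new_desc;
	}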
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-13-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_nested.h | 1 + arch/arm64/kvm/at.c | 33 +++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 6dbc2908aed9..905c658057a4 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -353,6 +353,7 @@ struct s1_walk_info { bool be; bool s2; bool pa52bit; + bool ha; }; struct s1_walk_result { diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 2a99380ada6f..e39f814d247f 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -346,6 +346,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + wi->ha = tcr & TCR_HA; + return 0; addrsz: @@ -380,10 +382,24 @@ static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, return 0; } +static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, + struct s1_walk_info *wi) +{ + if (wi->be) { + old = cpu_to_be64(old); + new = cpu_to_be64(new); + } else { + old = cpu_to_le64(old); + new = cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { - u64 va_top, va_bottom, baddr, desc; + u64 va_top, va_bottom, baddr, desc, new_desc, ipa; int level, stride, ret; level = wi->sl; @@ -393,7 +409,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, va_top = get_ia_size(wi) - 1; while (1) { - u64 index, ipa; + u64 index; va_bottom = (3 - level) * stride + wi->pgshift; index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); @@ -438,6 +454,8 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return ret; } + new_desc = desc; + /* Invalid descriptor */ if (!(desc & BIT(0))) goto transfault; @@ -490,6 +508,17 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) goto addrsz; + if (wi->ha) + new_desc |= PTE_AF; + + if (new_desc != desc) { + ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + if (!(desc & PTE_AF)) { fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); return -EACCES; From e4c7dfac2f1ab848ffd356b3e76827ea404bbd94 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:55 -0800 Subject: [PATCH 251/260] KVM: arm64: nv: Implement HW access flag management in stage-2 SW PTW Give the stage-2 walk similar treatment to stage-1: update the access flag during the table walk and do so for any walk context. 
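On the stage-2 abort path, a failed descriptor update surfaces as -EAGAIN and simply re-enters the guest so the access is replayed; a condensed view of the mmu.c hunk below:

	ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
	if (ret == -EAGAIN) {
		ret = 1;	/* back to the guest, replay the access */
		goto out_unlock;
	}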
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-14-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 5 +++++ arch/arm64/kvm/nested.c | 44 ++++++++++++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 96f1786c72fe..b9aebca90f59 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2012,6 +2012,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) u32 esr; ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret == -EAGAIN) { + ret = 1; + goto out_unlock; + } + if (ret) { esr = kvm_s2_trans_esr(&nested_trans); kvm_inject_s2_fault(vcpu, esr); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index a096766c6ec3..6495442f400a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -124,12 +124,13 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) } struct s2_walk_info { - u64 baddr; - unsigned int max_oa_bits; - unsigned int pgshift; - unsigned int sl; - unsigned int t0sz; - bool be; + u64 baddr; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int sl; + unsigned int t0sz; + bool be; + bool ha; }; static u32 compute_fsc(int level, u32 fsc) @@ -219,6 +220,20 @@ static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc, return 0; } +static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new, + struct s2_walk_info *wi) +{ + if (wi->be) { + old = cpu_to_be64(old); + new = cpu_to_be64(new); + } else { + old = cpu_to_le64(old); + new = cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + /* * This is essentially a C-version of the pseudo code from the ARM ARM * AArch64.TranslationTableWalk function. I strongly recommend looking at @@ -232,7 +247,7 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, int first_block_level, level, stride, input_size, base_lower_bound; phys_addr_t base_addr; unsigned int addr_top, addr_bottom; - u64 desc; /* page table entry */ + u64 desc, new_desc; /* page table entry */ int ret; phys_addr_t paddr; @@ -281,6 +296,8 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, if (ret < 0) return ret; + new_desc = desc; + /* Check for valid descriptor at this point */ if (!(desc & KVM_PTE_VALID)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); @@ -325,6 +342,17 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return 1; } + if (wi->ha) + new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + + if (new_desc != desc) { + ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); out->desc = desc; @@ -363,6 +391,8 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) /* Global limit for now, should eventually be per-VM */ wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); + + wi->ha = vtcr & VTCR_EL2_HA; } int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, From d5bbb76f447420681abdcfa4ad32344d11188d00 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:56 -0800 Subject: [PATCH 252/260] KVM: arm64: nv: Expose hardware access flag management to NV guests Everything is in place to update the access flag at S1 and S2. 
Expose support for the access flag flavor of FEAT_HAFDBS to NV guests. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-15-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 6495442f400a..88d7dfb44410 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1599,11 +1599,13 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) case SYS_ID_AA64MMFR1_EL1: val &= ~(ID_AA64MMFR1_EL1_CMOW | ID_AA64MMFR1_EL1_nTLBPA | - ID_AA64MMFR1_EL1_ETS | - ID_AA64MMFR1_EL1_HAFDBS); + ID_AA64MMFR1_EL1_ETS); + /* FEAT_E2H0 implies no VHE */ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) val &= ~ID_AA64MMFR1_EL1_VH; + + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF); break; case SYS_ID_AA64MMFR2_EL1: From 66f188858385d640163fbf866d9c11b7741da91a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:57 -0800 Subject: [PATCH 253/260] KVM: arm64: selftests: Add test for AT emulation Add a basic test for AT emulation in the EL2&0 and EL1&0 translation regimes. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-16-oupton@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/arm64/at.c | 166 ++++++++++++++++++ .../testing/selftests/kvm/include/kvm_util.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 10 ++ 4 files changed, 178 insertions(+) create mode 100644 tools/testing/selftests/kvm/arm64/at.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..81b3aa54678a 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -156,6 +156,7 @@ TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test TEST_GEN_PROGS_arm64 = $(TEST_GEN_PROGS_COMMON) TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs TEST_GEN_PROGS_arm64 += arm64/arch_timer_edge_cases +TEST_GEN_PROGS_arm64 += arm64/at TEST_GEN_PROGS_arm64 += arm64/debug-exceptions TEST_GEN_PROGS_arm64 += arm64/hello_el2 TEST_GEN_PROGS_arm64 += arm64/host_sve diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c new file mode 100644 index 000000000000..acecb6ab5071 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/at.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * at - Test for KVM's AT emulation in the EL2&0 and EL1&0 translation regimes. + */ +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include "ucall.h" + +#include + +#define TEST_ADDR 0x80000000 + +enum { + CLEAR_ACCESS_FLAG, + TEST_ACCESS_FLAG, +}; + +static u64 *ptep_hva; + +#define copy_el2_to_el1(reg) \ + write_sysreg_s(read_sysreg_s(SYS_##reg##_EL1), SYS_##reg##_EL12) + +/* Yes, this is an ugly hack */ +#define __at(op, addr) write_sysreg_s(addr, op) + +#define test_at_insn(op, expect_fault) \ +do { \ + u64 par, fsc; \ + bool fault; \ + \ + GUEST_SYNC(CLEAR_ACCESS_FLAG); \ + \ + __at(OP_AT_##op, TEST_ADDR); \ + isb(); \ + par = read_sysreg(par_el1); \ + \ + fault = par & SYS_PAR_EL1_F; \ + fsc = FIELD_GET(SYS_PAR_EL1_FST, par); \ + \ + __GUEST_ASSERT((expect_fault) == fault, \ + "AT "#op": %sexpected fault (par: %lx)", \ + (expect_fault) ?
"" : "un", par); \ + if ((expect_fault)) { \ + __GUEST_ASSERT(fsc == ESR_ELx_FSC_ACCESS_L(3), \ + "AT "#op": expected access flag fault (par: %lx)", \ + par); \ + } else { \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \ + GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \ + GUEST_SYNC(TEST_ACCESS_FLAG); \ + } \ +} while (0) + +static void test_at(bool expect_fault) +{ + test_at_insn(S1E2R, expect_fault); + test_at_insn(S1E2W, expect_fault); + + /* Reuse the stage-1 MMU context from EL2 at EL1 */ + copy_el2_to_el1(SCTLR); + copy_el2_to_el1(MAIR); + copy_el2_to_el1(TCR); + copy_el2_to_el1(TTBR0); + copy_el2_to_el1(TTBR1); + + /* Disable stage-2 translation and enter a non-host context */ + write_sysreg(0, vtcr_el2); + write_sysreg(0, vttbr_el2); + sysreg_clear_set(hcr_el2, HCR_EL2_TGE | HCR_EL2_VM, 0); + isb(); + + test_at_insn(S1E1R, expect_fault); + test_at_insn(S1E1W, expect_fault); +} + +static void guest_code(void) +{ + sysreg_clear_set(tcr_el1, TCR_HA, 0); + isb(); + + test_at(true); + + if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, read_sysreg(id_aa64mmfr1_el1))) + GUEST_DONE(); + + /* + * KVM's software PTW makes the implementation choice that the AT + * instruction sets the access flag. + */ + sysreg_clear_set(tcr_el1, 0, TCR_HA); + isb(); + test_at(false); + + GUEST_DONE(); +} + +static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + switch (uc->args[1]) { + case CLEAR_ACCESS_FLAG: + /* + * Delete + reinstall the memslot to invalidate stage-2 + * mappings of the stage-1 page tables, forcing KVM to + * use the 'slow' AT emulation path. + * + * This and clearing the access flag from host userspace + * ensures that the access flag cannot be set speculatively + * and is reliably cleared at the time of the AT instruction. 
+ */ + clear_bit(__ffs(PTE_AF), ptep_hva); + vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]); + break; + case TEST_ACCESS_FLAG: + TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva), + "Expected access flag to be set (desc: %lu)", *ptep_hva); + break; + default: + TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]); + } +} + +static void run_test(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + return; + case UCALL_SYNC: + handle_sync(vcpu, &uc); + continue; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + default: + TEST_FAIL("Unexpeced ucall: %lu", uc.cmd); + } + } +} + +int main(void) +{ + struct kvm_vcpu_init init; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_EL2)); + + vm = vm_create(1); + + kvm_get_default_vcpu_target(vm, &init); + init.features[0] |= BIT(KVM_ARM_VCPU_HAS_EL2); + vcpu = aarch64_vcpu_add(vm, 0, &init, guest_code); + kvm_arch_vm_finalize_vcpus(vm); + + virt_map(vm, TEST_ADDR, TEST_ADDR, 1); + ptep_hva = virt_get_pte_hva_at_level(vm, TEST_ADDR, 3); + run_test(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index d3f3e455c031..41467dad9178 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -715,6 +715,7 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) #endif void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1a93d6361671..d6538bb17740 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1201,6 +1201,16 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) ret, errno, slot, flags); } +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot) +{ + struct userspace_mem_region *region = memslot2region(vm, slot); + struct kvm_userspace_memory_region2 tmp = region->region; + + tmp.memory_size = 0; + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &tmp); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); +} + /* * VM Memory Region Move * From 36fe022f884bb936d911a0d2e93819aba11daceb Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 15:54:09 -0800 Subject: [PATCH 254/260] KVM: arm64: Fix compilation when CONFIG_ARM64_USE_LSE_ATOMICS=n __lse_swap_desc() is compiled unconditionally, even if LSE is disabled using the config option. Align with the spirit of the config option and fix some build errors due to __LSE_PREAMBLE being undefined with the application of some ifdeffery. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511250700.kAutzJFm-lkp@intel.com/ Link: https://msgid.link/20251124235409.1731253-1-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index e39f814d247f..f774a02d9393 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1697,6 +1697,7 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) } } +#ifdef CONFIG_ARM64_LSE_ATOMICS static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) { u64 tmp = old; @@ -1721,6 +1722,12 @@ static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) return ret; } +#else +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + return -EINVAL; +} +#endif static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) { From b0fc8329ec98ea891a5e47821db6aee1d564bff6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Nov 2025 20:48:48 +0000 Subject: [PATCH 255/260] KVM: arm64: Add endian casting to kvm_swap_s[12]_desc() Keep sparse quiet by explicitly casting endianness conversion when swapping S1 and S2 descriptors. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511260246.JQDGsQKa-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202511260344.9XehvH5Q-lkp@intel.com/ Fixes: c59ca4b5b0c3f ("KVM: arm64: Implement HW access flag management in stage-1 SW PTW") Fixes: 39db933ba67f8 ("KVM: arm64: nv: Implement HW access flag management in stage-2 SW PTW") Signed-off-by: Marc Zyngier Link: https://msgid.link/20251125204848.1136383-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 8 ++++---- arch/arm64/kvm/nested.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index f774a02d9393..d25fef0f66e2 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -386,11 +386,11 @@ static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, struct s1_walk_info *wi) { if (wi->be) { - old = cpu_to_be64(old); - new = cpu_to_be64(new); + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); } else { - old = cpu_to_le64(old); - new = cpu_to_le64(new); + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); } return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 88d7dfb44410..911fc99ed99d 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -224,11 +224,11 @@ static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u6 struct s2_walk_info *wi) { if (wi->be) { - old = cpu_to_be64(old); - new = cpu_to_be64(new); + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); } else { - old = cpu_to_le64(old); - new = cpu_to_le64(new); + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); } return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); From d98a04dc190672bbd29654a159f7621e0c63adcf Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 25 Nov 2025 10:59:15 -0700 Subject: [PATCH 256/260] KVM: arm64: Add break to default case in kvm_pgtable_stage2_pte_prot() Clang warns (or errors with CONFIG_WERROR=y / W=e): arch/arm64/kvm/hyp/pgtable.c:757:2: error: label at end of compound statement is a C23 extension [-Werror,-Wc23-extensions] 757 | } | ^ With older versions of clang (15 and older) and 
GCC (at least the minimum supported, 8.1), this is an unconditional hard error: arch/arm64/kvm/hyp/pgtable.c: In function 'kvm_pgtable_stage2_pte_prot': arch/arm64/kvm/hyp/pgtable.c:756:2: error: label at end of compound statement default: ^~~~~~~ arch/arm64/kvm/hyp/pgtable.c:756:10: error: label at end of compound statement: expected statement default: ^ ; Add a break statement to this default case to clear up the error/warning. Fixes: 2608563b466b ("KVM: arm64: Add support for FEAT_XNX stage-2 permissions") Signed-off-by: Nathan Chancellor Acked-by: Marc Zyngier Link: https://msgid.link/20251125-arm64-kvm-hyp-pgtable-fix-c23-ext-warn-v1-1-98b506ddefbf@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index e1d75f965027..5a1cf42df4c5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -754,6 +754,7 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) prot |= KVM_PGTABLE_PROT_PX; break; default: + break; } return prot; From 05474b7bc75d215a147b44b339ba4e9638b74382 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 28 Nov 2025 17:51:24 +0000 Subject: [PATCH 257/260] KVM: arm64: Fix spelling mistake "Unexpeced" -> "Unexpected" There is a spelling mistake in a TEST_FAIL message. Fix it. Signed-off-by: Colin Ian King Link: https://msgid.link/20251128175124.319094-1-colin.i.king@gmail.com Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/at.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c index acecb6ab5071..c8ee6f520734 100644 --- a/tools/testing/selftests/kvm/arm64/at.c +++ b/tools/testing/selftests/kvm/arm64/at.c @@ -137,7 +137,7 @@ static void run_test(struct kvm_vcpu *vcpu) REPORT_GUEST_ASSERT(uc); return; default: - TEST_FAIL("Unexpeced ucall: %lu", uc.cmd); + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); } } } From 93e8d997812b315bbec946e874171a8b7d785eaf Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Fri, 28 Nov 2025 10:09:43 +0000 Subject: [PATCH 258/260] KVM: arm64: Document KVM_PGTABLE_PROT_{UX,PX} Commit 2608563b466b ("KVM: arm64: Add support for FEAT_XNX stage-2 permissions") added the KVM_PGTABLE_PROT_{UX,PX} permissions to stage 2 and to EL2 translation regimes, but left them undocumented. Let's fix that. Signed-off-by: Alexandru Elisei Link: https://msgid.link/20251128100946.74210-2-alexandru.elisei@arm.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index c72149a607d6..611e62331763 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -240,7 +240,9 @@ enum kvm_pgtable_stage2_flags { /** * enum kvm_pgtable_prot - Page-table permissions and attributes. - * @KVM_PGTABLE_PROT_X: Execute permission. + * @KVM_PGTABLE_PROT_UX: Unprivileged execute permission. + * @KVM_PGTABLE_PROT_PX: Privileged execute permission. + * @KVM_PGTABLE_PROT_X: Privileged and unprivileged execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes.
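As a usage sketch of the semantics documented above (assuming KVM_PGTABLE_PROT_X remains the composition of the two new flags), a read-only mapping that is executable only at the privileged level would be built as:

	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_PX;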
From e88d60c0aa0abbaade177b84f54a868c67231cd7 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Fri, 28 Nov 2025 10:09:44 +0000 Subject: [PATCH 259/260] KVM: arm64: at: Use correct HA bit in TCR_EL2 when regime is EL2 According to ARM DDI 0487L.b, the HA bit in TCR_EL2 when the translation regime is EL2 (or !ELIsInHost(EL2)) is bit 21, not 39. Fixes: c59ca4b5b0c3 ("KVM: arm64: Implement HW access flag management in stage-1 SW PTW") Signed-off-by: Alexandru Elisei Link: https://msgid.link/20251128100946.74210-3-alexandru.elisei@arm.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/kvm/at.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 1da290aeedce..e500600e4b9b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -111,6 +111,7 @@ #define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) #define TCR_EL2_HPD (1 << 24) +#define TCR_EL2_HA (1 << 21) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index d25fef0f66e2..6d41a95f6c60 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -346,7 +346,9 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); - wi->ha = tcr & TCR_HA; + wi->ha = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HA, tcr) : + FIELD_GET(TCR_HA, tcr)); return 0; From d52aca1635654805a682afef176473793a63b588 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Fri, 28 Nov 2025 10:09:46 +0000 Subject: [PATCH 260/260] KVM: arm64: at: Update AF on software walk only if VM has FEAT_HAFDBS A guest can write 1 to TCR_ELx.HA, making the KVM software walker update the access flag in a table descriptor even if FEAT_HAFDBS is not present. Avoid this by making wi->ha depend on FEAT_HAFDBS being enabled in the VM, similar to how the software walker treats FEAT_HPDS. This is not needed for VTCR_EL2.HA, since a guest will always write to the in-memory copy of the register, where the HA bit is masked (set to 0) by KVM if the VM doesn't have FEAT_HAFDBS. Fixes: c59ca4b5b0c3 ("KVM: arm64: Implement HW access flag management in stage-1 SW PTW") Signed-off-by: Alexandru Elisei Link: https://msgid.link/20251128100946.74210-5-alexandru.elisei@arm.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 6d41a95f6c60..53bf70126f81 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -346,7 +346,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); - wi->ha = (wi->regime == TR_EL2 ? + wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF); + wi->ha &= (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_HA, tcr) : FIELD_GET(TCR_HA, tcr));