Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "These are mostly Oliver's Arm changes: lock ordering fixes for the
  vGIC, and reverts for a buggy attempt to avoid RCU stalls on large
  VMs.

  Arm:

   - Invalidate nested MMUs upon freeing the PGD to avoid WARNs when
     visiting from an MMU notifier

   - Fixes to the TLB match process and TLB invalidation range for
     managing the VNCR pseudo-TLB

   - Prevent SPE from erroneously profiling guests due to UNKNOWN reset
     values in PMSCR_EL1

   - Fix save/restore of host MDCR_EL2 to account for eagerly
     programming at vcpu_load() on VHE systems

   - Correct lock ordering when dealing with VGIC LPIs, avoiding
     scenarios where an xarray's spinlock was nested with a *raw*
     spinlock

   - Permit stage-2 read permission aborts which are possible in the
     case of NV depending on the guest hypervisor's stage-2 translation

   - Call raw_spin_unlock() instead of the internal spinlock API

   - Fix parameter ordering when assigning VBAR_EL1

   - Reverted a couple of fixes for RCU stalls when destroying a stage-2
     page table.

      There appear to be some nasty refcounting / UAF issues lurking in
      those patches and the band-aid we tried to apply didn't hold.

  s390:

   - mm fixes, including userfaultfd bug fix

  x86:

   - Sync the vTPR from the local APIC to the VMCB even when AVIC is
     active.

     This fixes a bug where host updates to the vTPR, e.g. via
     KVM_SET_LAPIC or emulation of a guest access, are lost and result
     in interrupt delivery issues in the guest"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: SVM: Sync TPR from LAPIC into VMCB::V_TPR even if AVIC is active
  Revert "KVM: arm64: Split kvm_pgtable_stage2_destroy()"
  Revert "KVM: arm64: Reschedule as needed when destroying the stage-2 page-tables"
  KVM: arm64: vgic: fix incorrect spinlock API usage
  KVM: arm64: Remove stage 2 read fault check
  KVM: arm64: Fix parameter ordering for VBAR_EL1 assignment
  KVM: arm64: nv: Fix incorrect VNCR invalidation range calculation
  KVM: arm64: vgic-v3: Indicate vgic_put_irq() may take LPI xarray lock
  KVM: arm64: vgic-v3: Don't require IRQs be disabled for LPI xarray lock
  KVM: arm64: vgic-v3: Erase LPIs from xarray outside of raw spinlocks
  KVM: arm64: Spin off release helper from vgic_put_irq()
  KVM: arm64: vgic-v3: Use bare refcount for VGIC LPIs
  KVM: arm64: vgic: Drop stale comment on IRQ active state
  KVM: arm64: VHE: Save and restore host MDCR_EL2 value correctly
  KVM: arm64: Initialize PMSCR_EL1 when in VHE
  KVM: arm64: nv: fix VNCR TLB ASID match logic for non-Global entries
  KVM: s390: Fix FOLL_*/FAULT_FLAG_* confusion
  KVM: s390: Fix incorrect usage of mmu_notifier_register()
  KVM: s390: Fix access to unavailable adapter indicator pages during postcopy
  KVM: arm64: Mark freed S2 MMUs as invalid
Linus Torvalds 2025-09-18 09:42:55 -07:00
commit 86cc796e5e
23 changed files with 155 additions and 177 deletions

@@ -1369,6 +1369,7 @@ static inline bool kvm_system_needs_idmapped_vectors(void)
 }
 
 void kvm_init_host_debug_data(void);
+void kvm_debug_init_vhe(void);
 void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu);
 void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu);
 void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu);

@@ -355,11 +355,6 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke
 	return pteref;
 }
 
-static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref)
-{
-	return pteref;
-}
-
 static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
 {
 	/*
@@ -389,11 +384,6 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke
 	return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED));
 }
 
-static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref)
-{
-	return rcu_dereference_raw(pteref);
-}
-
 static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
 {
 	if (walker->flags & KVM_PGTABLE_WALK_SHARED)
@@ -561,26 +551,6 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2
  */
 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 
-/**
- * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses.
- * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init*().
- * @addr:	Intermediate physical address at which to place the mapping.
- * @size:	Size of the mapping.
- *
- * The page-table is assumed to be unreachable by any hardware walkers prior
- * to freeing and therefore no TLB invalidation is performed.
- */
-void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
-				      u64 addr, u64 size);
-
-/**
- * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table.
- * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init*().
- *
- * It is assumed that the rest of the page-table is freed before this operation.
- */
-void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
-
 /**
  * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
  * @mm_ops:	Memory management callbacks.

@@ -179,9 +179,7 @@ struct pkvm_mapping {
 
 int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			     struct kvm_pgtable_mm_ops *mm_ops);
-void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
-				       u64 addr, u64 size);
-void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
+void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
			    enum kvm_pgtable_prot prot, void *mc,
			    enum kvm_pgtable_walk_flags flags);

@@ -2113,8 +2113,10 @@ static void cpu_hyp_init_features(void)
 {
	cpu_set_hyp_vector();
 
-	if (is_kernel_in_hyp_mode())
+	if (is_kernel_in_hyp_mode()) {
		kvm_timer_init_vhe();
+		kvm_debug_init_vhe();
+	}
 
	if (vgic_present)
		kvm_vgic_init_cpu_hardware();

@@ -96,6 +96,13 @@ void kvm_init_host_debug_data(void)
	}
 }
 
+void kvm_debug_init_vhe(void)
+{
+	/* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */
+	if (SYS_FIELD_GET(ID_AA64DFR0_EL1, PMSVer, read_sysreg(id_aa64dfr0_el1)))
+		write_sysreg_el1(0, SYS_PMSCR);
+}
+
 /*
  * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host
  * has taken over MDSCR_EL1.
@@ -138,6 +145,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
	/* Must be called before kvm_vcpu_load_vhe() */
	KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm);
 
+	if (has_vhe())
+		*host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
+
	/*
	 * Determine which of the possible debug states we're in:
	 *
@@ -184,6 +194,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
 
 void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu)
 {
+	if (has_vhe())
+		write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
+
	if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
		return;

@@ -431,9 +431,6 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
		vcpu_set_flag(vcpu, PMUSERENR_ON_CPU);
	}
 
-	*host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
-	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-
	if (cpus_have_final_cap(ARM64_HAS_HCX)) {
		u64 hcrx = vcpu->arch.hcrx_el2;
 
		if (is_nested_ctxt(vcpu)) {
@@ -454,8 +451,6 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
 {
	struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
 
-	write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
-
	write_sysreg(0, hstr_el2);
	if (system_supports_pmuv3()) {
		write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0);

@@ -50,6 +50,10 @@ extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
 static void __activate_traps(struct kvm_vcpu *vcpu)
 {
	___activate_traps(vcpu, vcpu->arch.hcr_el2);
+
+	*host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
+	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+
	__activate_traps_common(vcpu);
	__activate_cptr_traps(vcpu);
@@ -93,6 +97,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
		isb();
	}
 
+	write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
+
	__deactivate_traps_common(vcpu);
 
	write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2);

@@ -253,7 +253,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
	*vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
	*vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
 
-	__vcpu_assign_sys_reg(vcpu, read_sysreg_el1(SYS_VBAR), VBAR_EL1);
+	__vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR));
 
	kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);

@@ -1551,38 +1551,21 @@ static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
	return 0;
 }
 
-void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
-				      u64 addr, u64 size)
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
 {
+	size_t pgd_sz;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};
 
-	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
-}
-
-void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
-{
-	size_t pgd_sz;
-
+	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
-
-	/*
-	 * Since the pgtable is unlinked at this point, and not shared with
-	 * other walkers, safely deference pgd with kvm_dereference_pteref_raw()
-	 */
-	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz);
+	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
	pgt->pgd = NULL;
 }
 
-void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
-{
-	kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
-	kvm_pgtable_stage2_destroy_pgd(pgt);
-}
-
 void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
 {
	kvm_pteref_t ptep = (kvm_pteref_t)pgtable;

@@ -904,38 +904,6 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
	return 0;
 }
 
-/*
- * Assume that @pgt is valid and unlinked from the KVM MMU to free the
- * page-table without taking the kvm_mmu_lock and without performing any
- * TLB invalidations.
- *
- * Also, the range of addresses can be large enough to cause need_resched
- * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
- * cond_resched() periodically to prevent hogging the CPU for a long time
- * and schedule something else, if required.
- */
-static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
-				 phys_addr_t end)
-{
-	u64 next;
-
-	do {
-		next = stage2_range_addr_end(addr, end);
-		KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
-							     next - addr);
-		if (next != end)
-			cond_resched();
-	} while (addr = next, addr != end);
-}
-
-static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
-{
-	unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
-
-	stage2_destroy_range(pgt, 0, BIT(ia_bits));
-	KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
-}
-
 /**
  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
  * @kvm:	The pointer to the KVM structure
@@ -1012,7 +980,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
	return 0;
 
 out_destroy_pgtable:
-	kvm_stage2_destroy(pgt);
+	KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
 out_free_pgtable:
	kfree(pgt);
	return err;
@@ -1106,10 +1074,14 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}
+
+	if (kvm_is_nested_s2_mmu(kvm, mmu))
+		kvm_init_nested_s2_mmu(mmu);
+
	write_unlock(&kvm->mmu_lock);
 
	if (pgt) {
-		kvm_stage2_destroy(pgt);
+		KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
		kfree(pgt);
	}
 }
@@ -1541,11 +1513,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_BUG_ON(write_fault && exec_fault);
 
-	if (fault_is_perm && !write_fault && !exec_fault) {
-		kvm_err("Unexpected L2 read permission error\n");
-		return -EFAULT;
-	}
-
	if (!is_protected_kvm_enabled())
		memcache = &vcpu->arch.mmu_page_cache;
	else

@@ -847,7 +847,7 @@ static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
		ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
							    vt->wr.level));
-		ipa_start = vt->wr.pa & (ipa_size - 1);
+		ipa_start = vt->wr.pa & ~(ipa_size - 1);
		ipa_end = ipa_start + ipa_size;
 
		if (ipa_end <= start || ipa_start >= end)
@@ -887,7 +887,7 @@ static void invalidate_vncr_va(struct kvm *kvm,
		va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
							   vt->wr.level));
-		va_start = vt->gva & (va_size - 1);
+		va_start = vt->gva & ~(va_size - 1);
		va_end = va_start + va_size;
 
		switch (scope->type) {
@@ -1276,7 +1276,7 @@ static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
		    !(tcr & TCR_ASID16))
			asid &= GENMASK(7, 0);
 
-		return asid != vt->wr.asid;
+		return asid == vt->wr.asid;
	}
 
	return true;
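The first two hunks above are the classic base-address masking fix: addr & ~(size - 1) rounds an address down to the start of its naturally aligned block, whereas addr & (size - 1) keeps only the offset inside the block, so the old code computed the wrong invalidation base. A small standalone sketch with made-up values (illustration only, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ipa  = 0x40012345ULL;	/* hypothetical guest IPA */
	uint64_t size = 0x200000ULL;	/* hypothetical 2 MiB block size */

	uint64_t base   = ipa & ~(size - 1);	/* 0x40000000: start of the block */
	uint64_t offset = ipa &  (size - 1);	/* 0x00012345: offset only, the buggy value */

	printf("base=%#llx offset=%#llx\n",
	       (unsigned long long)base, (unsigned long long)offset);
	return 0;
}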

@@ -316,16 +316,9 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
	return 0;
 }
 
-void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
-				       u64 addr, u64 size)
+void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
 {
-	__pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
-}
-
-void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
-{
-	/* Expected to be called after all pKVM mappings have been released. */
-	WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
+	__pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
 }
 
 int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,

@@ -69,7 +69,7 @@ static int iter_mark_lpis(struct kvm *kvm)
	int nr_lpis = 0;
 
	xa_for_each(&dist->lpi_xa, intid, irq) {
-		if (!vgic_try_get_irq_kref(irq))
+		if (!vgic_try_get_irq_ref(irq))
			continue;
 
		xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);

@@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm)
 {
	struct vgic_dist *dist = &kvm->arch.vgic;
 
-	xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
+	xa_init(&dist->lpi_xa);
 }
 
 /* CREATION */
@@ -208,7 +208,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
		raw_spin_lock_init(&irq->irq_lock);
		irq->vcpu = NULL;
		irq->target_vcpu = vcpu0;
-		kref_init(&irq->refcount);
+		refcount_set(&irq->refcount, 0);
		switch (dist->vgic_model) {
		case KVM_DEV_TYPE_ARM_VGIC_V2:
			irq->targets = 0;
@@ -277,7 +277,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
		irq->intid = i;
		irq->vcpu = NULL;
		irq->target_vcpu = vcpu;
-		kref_init(&irq->refcount);
+		refcount_set(&irq->refcount, 0);
		if (vgic_irq_is_sgi(i)) {
			/* SGIs */
			irq->enabled = 1;

@@ -78,7 +78,6 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 {
	struct vgic_dist *dist = &kvm->arch.vgic;
	struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
-	unsigned long flags;
	int ret;
 
	/* In this case there is no put, since we keep the reference. */
@@ -89,7 +88,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
	if (!irq)
		return ERR_PTR(-ENOMEM);
 
-	ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
+	ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
	if (ret) {
		kfree(irq);
		return ERR_PTR(ret);
@@ -99,19 +98,19 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
	raw_spin_lock_init(&irq->irq_lock);
 
	irq->config = VGIC_CONFIG_EDGE;
-	kref_init(&irq->refcount);
+	refcount_set(&irq->refcount, 1);
	irq->intid = intid;
	irq->target_vcpu = vcpu;
	irq->group = 1;
 
-	xa_lock_irqsave(&dist->lpi_xa, flags);
+	xa_lock(&dist->lpi_xa);
 
	/*
	 * There could be a race with another vgic_add_lpi(), so we need to
	 * check that we don't add a second list entry with the same LPI.
	 */
	oldirq = xa_load(&dist->lpi_xa, intid);
-	if (vgic_try_get_irq_kref(oldirq)) {
+	if (vgic_try_get_irq_ref(oldirq)) {
		/* Someone was faster with adding this LPI, lets use that. */
		kfree(irq);
		irq = oldirq;
@@ -126,7 +125,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
	}
 
 out_unlock:
-	xa_unlock_irqrestore(&dist->lpi_xa, flags);
+	xa_unlock(&dist->lpi_xa);
 
	if (ret)
		return ERR_PTR(ret);
@@ -547,7 +546,7 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
	rcu_read_lock();
 
	irq = xa_load(&its->translation_cache, cache_key);
-	if (!vgic_try_get_irq_kref(irq))
+	if (!vgic_try_get_irq_ref(irq))
		irq = NULL;
 
	rcu_read_unlock();
@@ -571,7 +570,7 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
	 * its_lock, as the ITE (and the reference it holds) cannot be freed.
	 */
	lockdep_assert_held(&its->its_lock);
-	vgic_get_irq_kref(irq);
+	vgic_get_irq_ref(irq);
 
	old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);

@@ -518,7 +518,7 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
		if (!irq->hw || irq->host_irq != host_irq)
			continue;
 
-		if (!vgic_try_get_irq_kref(irq))
+		if (!vgic_try_get_irq_ref(irq))
			return NULL;
 
		return irq;

@@ -28,8 +28,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
  *   kvm->arch.config_lock (mutex)
  *     its->cmd_lock (mutex)
  *       its->its_lock (mutex)
- *         vgic_cpu->ap_list_lock		must be taken with IRQs disabled
- *           vgic_dist->lpi_xa.xa_lock		must be taken with IRQs disabled
+ *         vgic_dist->lpi_xa.xa_lock
+ *           vgic_cpu->ap_list_lock		must be taken with IRQs disabled
  *             vgic_irq->irq_lock		must be taken with IRQs disabled
  *
  * As the ap_list_lock might be taken from the timer interrupt handler,
@@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
	rcu_read_lock();
 
	irq = xa_load(&dist->lpi_xa, intid);
-	if (!vgic_try_get_irq_kref(irq))
+	if (!vgic_try_get_irq_ref(irq))
		irq = NULL;
 
	rcu_read_unlock();
@@ -114,37 +114,66 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
	return vgic_get_irq(vcpu->kvm, intid);
 }
 
-/*
- * We can't do anything in here, because we lack the kvm pointer to
- * lock and remove the item from the lpi_list. So we keep this function
- * empty and use the return value of kref_put() to trigger the freeing.
- */
-static void vgic_irq_release(struct kref *ref)
+static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq)
 {
+	lockdep_assert_held(&dist->lpi_xa.xa_lock);
+	__xa_erase(&dist->lpi_xa, irq->intid);
+	kfree_rcu(irq, rcu);
+}
+
+static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+	if (irq->intid < VGIC_MIN_LPI)
+		return false;
+
+	return refcount_dec_and_test(&irq->refcount);
+}
+
+static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq)
+{
+	if (!__vgic_put_irq(kvm, irq))
+		return false;
+
+	irq->pending_release = true;
+	return true;
 }
 
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 {
	struct vgic_dist *dist = &kvm->arch.vgic;
-	unsigned long flags;
 
-	if (irq->intid < VGIC_MIN_LPI)
-		return;
+	if (irq->intid >= VGIC_MIN_LPI)
+		might_lock(&dist->lpi_xa.xa_lock);
 
-	if (!kref_put(&irq->refcount, vgic_irq_release))
+	if (!__vgic_put_irq(kvm, irq))
		return;
 
-	xa_lock_irqsave(&dist->lpi_xa, flags);
-	__xa_erase(&dist->lpi_xa, irq->intid);
-	xa_unlock_irqrestore(&dist->lpi_xa, flags);
-
-	kfree_rcu(irq, rcu);
+	xa_lock(&dist->lpi_xa);
+	vgic_release_lpi_locked(dist, irq);
+	xa_unlock(&dist->lpi_xa);
+}
+
+static void vgic_release_deleted_lpis(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	unsigned long intid;
+	struct vgic_irq *irq;
+
+	xa_lock(&dist->lpi_xa);
+
+	xa_for_each(&dist->lpi_xa, intid, irq) {
+		if (irq->pending_release)
+			vgic_release_lpi_locked(dist, irq);
+	}
+
+	xa_unlock(&dist->lpi_xa);
 }
 
 void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
 {
	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
	struct vgic_irq *irq, *tmp;
+	bool deleted = false;
	unsigned long flags;
 
	raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
@@ -155,11 +184,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
			list_del(&irq->ap_list);
			irq->vcpu = NULL;
			raw_spin_unlock(&irq->irq_lock);
-			vgic_put_irq(vcpu->kvm, irq);
+			deleted |= vgic_put_irq_norelease(vcpu->kvm, irq);
		}
	}
 
	raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+	if (deleted)
+		vgic_release_deleted_lpis(vcpu->kvm);
 }
 
 void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
@@ -399,7 +431,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
	 * now in the ap_list. This is safe as the caller must already hold a
	 * reference on the irq.
	 */
-	vgic_get_irq_kref(irq);
+	vgic_get_irq_ref(irq);
	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
	irq->vcpu = vcpu;
@@ -630,6 +662,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
 {
	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
	struct vgic_irq *irq, *tmp;
+	bool deleted_lpis = false;
 
	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -657,12 +690,12 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
			/*
			 * This vgic_put_irq call matches the
-			 * vgic_get_irq_kref in vgic_queue_irq_unlock,
+			 * vgic_get_irq_ref in vgic_queue_irq_unlock,
			 * where we added the LPI to the ap_list. As
			 * we remove the irq from the list, we drop
			 * also drop the refcount.
			 */
-			vgic_put_irq(vcpu->kvm, irq);
+			deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq);
			continue;
		}
@@ -725,6 +758,9 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
	}
 
	raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+	if (unlikely(deleted_lpis))
+		vgic_release_deleted_lpis(vcpu->kvm);
 }
 
 static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
@@ -818,7 +854,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
		 * the AP list has been sorted already.
		 */
		if (multi_sgi && irq->priority > prio) {
-			_raw_spin_unlock(&irq->irq_lock);
+			raw_spin_unlock(&irq->irq_lock);
			break;
		}

@@ -267,7 +267,7 @@ void vgic_v2_put(struct kvm_vcpu *vcpu);
 void vgic_v2_save_state(struct kvm_vcpu *vcpu);
 void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
 
-static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq)
+static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq)
 {
	if (!irq)
		return false;
@@ -275,12 +275,12 @@ static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq)
	if (irq->intid < VGIC_MIN_LPI)
		return true;
 
-	return kref_get_unless_zero(&irq->refcount);
+	return refcount_inc_not_zero(&irq->refcount);
 }
 
-static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+static inline void vgic_get_irq_ref(struct vgic_irq *irq)
 {
-	WARN_ON_ONCE(!vgic_try_get_irq_kref(irq));
+	WARN_ON_ONCE(!vgic_try_get_irq_ref(irq));
 }
 
 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
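The try-get helper above succeeds only while the refcount is still non-zero, which is what keeps the RCU-protected LPI lookups safe against a concurrent final put. A minimal userspace analogue of the refcount_inc_not_zero() idiom using C11 atomics (an illustrative sketch, not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if the object has not already dropped to zero,
 * i.e. it is not already on its way to being freed. */
static bool try_get_ref(atomic_uint *refcount)
{
	unsigned int old = atomic_load(refcount);

	do {
		if (old == 0)
			return false;	/* lookup raced with the final put */
	} while (!atomic_compare_exchange_weak(refcount, &old, old + 1));

	return true;
}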

@@ -2778,12 +2778,19 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
 
 static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
 {
+	struct mm_struct *mm = kvm->mm;
	struct page *page = NULL;
+	int locked = 1;
 
-	mmap_read_lock(kvm->mm);
-	get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE,
-			      &page, NULL);
-	mmap_read_unlock(kvm->mm);
+	if (mmget_not_zero(mm)) {
+		mmap_read_lock(mm);
+		get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
+				      &page, &locked);
+		if (locked)
+			mmap_read_unlock(mm);
+		mmput(mm);
+	}
+
	return page;
 }

@@ -4864,12 +4864,12 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu)
  * @vcpu: the vCPU whose gmap is to be fixed up
  * @gfn: the guest frame number used for memslots (including fake memslots)
  * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps
- * @flags: FOLL_* flags
+ * @foll: FOLL_* flags
  *
  * Return: 0 on success, < 0 in case of error.
  * Context: The mm lock must not be held before calling. May sleep.
  */
-int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags)
+int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll)
 {
	struct kvm_memory_slot *slot;
	unsigned int fault_flags;
@@ -4883,13 +4883,13 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return vcpu_post_run_addressing_exception(vcpu);
 
-	fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0;
+	fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0;
	if (vcpu->arch.gmap->pfault_enabled)
-		flags |= FOLL_NOWAIT;
+		foll |= FOLL_NOWAIT;
	vmaddr = __gfn_to_hva_memslot(slot, gfn);
 
 try_again:
-	pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page);
+	pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page);
 
	/* Access outside memory, inject addressing exception */
	if (is_noslot_pfn(pfn))
@@ -4905,7 +4905,7 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u
			return 0;
		vcpu->stat.pfault_sync++;
		/* Could not setup async pfault, try again synchronously */
-		flags &= ~FOLL_NOWAIT;
+		foll &= ~FOLL_NOWAIT;
		goto try_again;
	}
	/* Any other error */
@@ -4925,7 +4925,7 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u
	return rc;
 }
 
-static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags)
+static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll)
 {
	unsigned long gaddr_tmp;
	gfn_t gfn;
@@ -4950,18 +4950,18 @@ static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, un
		}
		gfn = gpa_to_gfn(gaddr_tmp);
	}
-	return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags);
+	return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll);
 }
 
 static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
 {
-	unsigned int flags = 0;
+	unsigned int foll = 0;
	unsigned long gaddr;
	int rc;
 
	gaddr = current->thread.gmap_teid.addr * PAGE_SIZE;
	if (kvm_s390_cur_gmap_fault_is_write())
-		flags = FAULT_FLAG_WRITE;
+		foll = FOLL_WRITE;
 
	switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) {
	case 0:
@@ -5003,7 +5003,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
		send_sig(SIGSEGV, current, 0);
		if (rc != -ENXIO)
			break;
-		flags = FAULT_FLAG_WRITE;
+		foll = FOLL_WRITE;
		fallthrough;
	case PGM_PROTECTION:
	case PGM_SEGMENT_TRANSLATION:
@@ -5013,7 +5013,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
	case PGM_REGION_SECOND_TRANS:
	case PGM_REGION_THIRD_TRANS:
		kvm_s390_assert_primary_as(vcpu);
-		return vcpu_dat_fault_handler(vcpu, gaddr, flags);
+		return vcpu_dat_fault_handler(vcpu, gaddr, foll);
	default:
		KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx",
			current->thread.gmap_int_code, current->thread.gmap_teid.val);

@@ -624,6 +624,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
	int cc, ret;
	u16 dummy;
 
+	/* Add the notifier only once. No races because we hold kvm->lock */
+	if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
+		/* The notifier will be unregistered when the VM is destroyed */
+		kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
+		ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
+		if (ret) {
+			kvm->arch.pv.mmu_notifier.ops = NULL;
+			return ret;
+		}
+	}
+
	ret = kvm_s390_pv_alloc_vm(kvm);
	if (ret)
		return ret;
@@ -659,11 +670,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
		return -EIO;
	}
	kvm->arch.gmap->guest_handle = uvcb.guest_handle;
-	/* Add the notifier only once. No races because we hold kvm->lock */
-	if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
-		kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
-		mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
-	}
	return 0;
 }

@@ -4046,8 +4046,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 cr8;
 
-	if (nested_svm_virtualize_tpr(vcpu) ||
-	    kvm_vcpu_apicv_active(vcpu))
+	if (nested_svm_virtualize_tpr(vcpu))
		return;
 
	cr8 = kvm_get_cr8(vcpu);

@@ -8,8 +8,8 @@
 #include <linux/bits.h>
 #include <linux/kvm.h>
 #include <linux/irqreturn.h>
-#include <linux/kref.h>
 #include <linux/mutex.h>
+#include <linux/refcount.h>
 #include <linux/spinlock.h>
 #include <linux/static_key.h>
 #include <linux/types.h>
@@ -139,10 +139,13 @@ struct vgic_irq {
	bool pending_latch;		/* The pending latch state used to calculate
					 * the pending state for both level
					 * and edge triggered IRQs. */
-	bool active;			/* not used for LPIs */
+	bool active;
+	bool pending_release;		/* Used for LPIs only, unreferenced IRQ
					 * pending a release */
	bool enabled;
	bool hw;			/* Tied to HW IRQ */
-	struct kref refcount;		/* Used for LPIs */
+	refcount_t refcount;		/* Used for LPIs */
	u32 hwintid;			/* HW INTID number */
	unsigned int host_irq;		/* linux irq corresponding to hwintid */
	union {