From e72753ed12670bdf599d5a07066c861c62d40ae8 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss
Date: Thu, 17 Apr 2025 14:43:59 +0200
Subject: [PATCH 01/10] KVM: s390: Use ESCA instead of BSCA at VM init

All modern IBM Z and LinuxONE machines offer support for the Extended
System Control Area (ESCA); it has been available since the z114/z196,
released in 2010.

KVM needs to allocate and manage the SCA for guest VMs. Prior to this
change the SCA was set up as a Basic SCA, supporting a maximum of 64
vCPUs, when initializing the VM. Once the 65th vCPU was added, the SCA
had to be converted to an ESCA.

Instead of allocating a BSCA and upgrading it for PV or when adding the
65th vCPU, we can always allocate the ESCA directly upon VM creation.
This simplifies the code in multiple places and completely removes the
need to convert an existing SCA.

In cases where the ESCA is not supported (z10 and earlier), the use of
the SCA entries, and with it SIGP interpretation, is disabled for VMs.
This increases the number of exits from the VM in multiprocessor
scenarios and thus decreases performance. The same is true for VSIE,
where SIGP is currently disabled and thus no SCA entries are used.

The only downside of the change is that we always allocate 4 pages for
a 248-CPU ESCA instead of a single page for the BSCA per VM. In return
we can delete a bunch of checks and special handling that depend on the
SCA type, as well as the whole BSCA-to-ESCA conversion.

With that behavior change we no longer reference a bsca_block in
kvm->arch.sca; it is always an esca_block instead. Declaring the sca as
esca_block simplifies access to it and lets us drop some helpers while
making the code clearer.

KVM_MAX_VCPUS is also moved to kvm_host_types to allow using it in
future type definitions.
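As an illustration of the userspace-visible effect (this probe is not
part of the patch; it only uses the standard KVM_CHECK_EXTENSION
ioctl): on machines with ESCA support, KVM_CAP_MAX_VCPUS now reports
KVM_S390_ESCA_CPU_SLOTS (248) right from VM creation instead of the
old BSCA limit of 64.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* 248 on machines with ESCA support; KVM_MAX_VCPUS when SIGP
	 * interpretation / SCA entries are not used. */
	printf("KVM_CAP_MAX_VCPUS = %d\n",
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS));
	return 0;
}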
Reviewed-by: Janosch Frank Signed-off-by: Christoph Schlameuss Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 5 +- arch/s390/kvm/gaccess.c | 10 +- arch/s390/kvm/interrupt.c | 78 +++++--------- arch/s390/kvm/kvm-s390.c | 170 +++++++------------------------ arch/s390/kvm/kvm-s390.h | 9 +- 5 files changed, 67 insertions(+), 205 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index c2ba3d4398c5..3cf14dd75409 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -631,9 +631,8 @@ struct kvm_s390_pv { struct mmu_notifier mmu_notifier; }; -struct kvm_arch{ - void *sca; - int use_esca; +struct kvm_arch { + struct esca_block *sca; rwlock_t sca_lock; debug_info_t *dbf; struct kvm_s390_float_interrupt float_int; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 21c2e61fece4..3651ab682fd7 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -113,7 +113,7 @@ int ipte_lock_held(struct kvm *kvm) int rc; read_lock(&kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + rc = kvm->arch.sca->ipte_control.kh != 0; read_unlock(&kvm->arch.sca_lock); return rc; } @@ -130,7 +130,7 @@ static void ipte_lock_simple(struct kvm *kvm) goto out; retry: read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.k) { @@ -155,7 +155,7 @@ static void ipte_unlock_simple(struct kvm *kvm) if (kvm->arch.ipte_lock_count) goto out; read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; @@ -173,7 +173,7 @@ static void ipte_lock_siif(struct kvm *kvm) retry: read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.kg) { @@ -193,7 +193,7 @@ static void ipte_unlock_siif(struct kvm *kvm) union ipte_control old, new, *ic; read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c62a868cf2b6..36394ba897f5 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -45,6 +45,8 @@ static struct kvm_s390_gib *gib; /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { + union esca_sigp_ctrl sigp_ctrl; + struct esca_block *sca; int c, scn; if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND)) @@ -52,21 +54,11 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl sigp_ctrl = - sca->cpu[vcpu->vcpu_id].sigp_ctrl; + sca = vcpu->kvm->arch.sca; + sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl; - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl sigp_ctrl = - sca->cpu[vcpu->vcpu_id].sigp_ctrl; - - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - } + c = sigp_ctrl.c; + scn = sigp_ctrl.scn; read_unlock(&vcpu->kvm->arch.sca_lock); if (src_id) @@ -77,37 +69,23 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { + union esca_sigp_ctrl old_val, new_val = {0}; 
+ union esca_sigp_ctrl *sigp_ctrl; + struct esca_block *sca; int expect, rc; BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl new_val = {0}, old_val; + sca = vcpu->kvm->arch.sca; + sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; - old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; - old_val.c = 0; + old_val = READ_ONCE(*sigp_ctrl); + new_val.scn = src_id; + new_val.c = 1; + old_val.c = 0; - expect = old_val.value; - rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl new_val = {0}, old_val; - - old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; - old_val.c = 0; - - expect = old_val.value; - rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - } + expect = old_val.value; + rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); read_unlock(&vcpu->kvm->arch.sca_lock); if (rc != expect) { @@ -120,23 +98,17 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { + union esca_sigp_ctrl *sigp_ctrl; + struct esca_block *sca; + if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); + sca = vcpu->kvm->arch.sca; + sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; - WRITE_ONCE(sigp_ctrl->value, 0); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - - WRITE_ONCE(sigp_ctrl->value, 0); - } + WRITE_ONCE(sigp_ctrl->value, 0); read_unlock(&vcpu->kvm->arch.sca_lock); } @@ -1224,7 +1196,7 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - if (!sclp.has_sigpif) + if (!kvm_s390_use_sca_entries()) return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); return sca_ext_call_pending(vcpu, NULL); @@ -1549,7 +1521,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) + if (kvm_s390_use_sca_entries() && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..78468b96d250 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -272,7 +272,6 @@ debug_info_t *kvm_s390_dbf_uv; /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); -static int sca_switch_to_extended(struct kvm *kvm); static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { @@ -632,11 +631,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: - r = KVM_S390_BSCA_CPU_SLOTS; + /* + * Return the same value for KVM_CAP_MAX_VCPUS and + * KVM_CAP_MAX_VCPU_ID to conform with the KVM 
API. + */ + r = KVM_S390_ESCA_CPU_SLOTS; if (!kvm_s390_use_sca_entries()) r = KVM_MAX_VCPUS; - else if (sclp.has_esca && sclp.has_64bscao) - r = KVM_S390_ESCA_CPU_SLOTS; if (ext == KVM_CAP_NR_VCPUS) r = min_t(unsigned int, num_online_cpus(), r); break; @@ -1931,13 +1932,11 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) * Updates the Multiprocessor Topology-Change-Report bit to signal * the guest with a topology change. * This is only relevant if the topology facility is present. - * - * The SCA version, bsca or esca, doesn't matter as offset is the same. */ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) { union sca_utility new, old; - struct bsca_block *sca; + struct esca_block *sca; read_lock(&kvm->arch.sca_lock); sca = kvm->arch.sca; @@ -1968,7 +1967,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm, return -ENXIO; read_lock(&kvm->arch.sca_lock); - topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + topo = kvm->arch.sca->utility.mtcr; read_unlock(&kvm->arch.sca_lock); return put_user(topo, (u8 __user *)attr->addr); @@ -2667,14 +2666,6 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (kvm_s390_pv_is_protected(kvm)) break; - /* - * FMT 4 SIE needs esca. As we never switch back to bsca from - * esca, we need no cleanup in the error cases below - */ - r = sca_switch_to_extended(kvm); - if (r) - break; - mmap_write_lock(kvm->mm); r = gmap_helper_disable_cow_sharing(); mmap_write_unlock(kvm->mm); @@ -3317,10 +3308,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm) static void sca_dispose(struct kvm *kvm) { - if (kvm->arch.use_esca) - free_pages_exact(kvm->arch.sca, sizeof(struct esca_block)); - else - free_page((unsigned long)(kvm->arch.sca)); + free_pages_exact(kvm->arch.sca, sizeof(*kvm->arch.sca)); kvm->arch.sca = NULL; } @@ -3334,10 +3322,9 @@ void kvm_arch_free_vm(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; - int i, rc; + gfp_t alloc_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO; char debug_name[16]; - static unsigned long sca_offset; + int i, rc; rc = -EINVAL; #ifdef CONFIG_KVM_S390_UCONTROL @@ -3359,17 +3346,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!sclp.has_64bscao) alloc_flags |= GFP_DMA; rwlock_init(&kvm->arch.sca_lock); - /* start with basic SCA */ - kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); + mutex_lock(&kvm_lock); + + kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags); + mutex_unlock(&kvm_lock); if (!kvm->arch.sca) goto out_err; - mutex_lock(&kvm_lock); - sca_offset += 16; - if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) - sca_offset = 0; - kvm->arch.sca = (struct bsca_block *) - ((char *) kvm->arch.sca + sca_offset); - mutex_unlock(&kvm_lock); sprintf(debug_name, "kvm-%u", current->pid); @@ -3548,27 +3530,25 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) static void sca_del_vcpu(struct kvm_vcpu *vcpu) { + struct esca_block *sca; + if (!kvm_s390_use_sca_entries()) return; read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; + sca = vcpu->kvm->arch.sca; - clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); - sca->cpu[vcpu->vcpu_id].sda = 0; - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - - clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); - sca->cpu[vcpu->vcpu_id].sda = 0; - } + 
clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); + sca->cpu[vcpu->vcpu_id].sda = 0; read_unlock(&vcpu->kvm->arch.sca_lock); } static void sca_add_vcpu(struct kvm_vcpu *vcpu) { + struct esca_block *sca; + phys_addr_t sca_phys; + if (!kvm_s390_use_sca_entries()) { - phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); + sca_phys = virt_to_phys(vcpu->kvm->arch.sca); /* we still need the basic sca for the ipte control */ vcpu->arch.sie_block->scaoh = sca_phys >> 32; @@ -3576,105 +3556,23 @@ static void sca_add_vcpu(struct kvm_vcpu *vcpu) return; } read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - phys_addr_t sca_phys = virt_to_phys(sca); + sca = vcpu->kvm->arch.sca; + sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; - vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - phys_addr_t sca_phys = virt_to_phys(sca); - - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys; - set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); - } + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; + set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); read_unlock(&vcpu->kvm->arch.sca_lock); } -/* Basic SCA to Extended SCA data copy routines */ -static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s) -{ - d->sda = s->sda; - d->sigp_ctrl.c = s->sigp_ctrl.c; - d->sigp_ctrl.scn = s->sigp_ctrl.scn; -} - -static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s) -{ - int i; - - d->ipte_control = s->ipte_control; - d->mcn[0] = s->mcn; - for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++) - sca_copy_entry(&d->cpu[i], &s->cpu[i]); -} - -static int sca_switch_to_extended(struct kvm *kvm) -{ - struct bsca_block *old_sca = kvm->arch.sca; - struct esca_block *new_sca; - struct kvm_vcpu *vcpu; - unsigned long vcpu_idx; - u32 scaol, scaoh; - phys_addr_t new_sca_phys; - - if (kvm->arch.use_esca) - return 0; - - new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!new_sca) - return -ENOMEM; - - new_sca_phys = virt_to_phys(new_sca); - scaoh = new_sca_phys >> 32; - scaol = new_sca_phys & ESCA_SCAOL_MASK; - - kvm_s390_vcpu_block_all(kvm); - write_lock(&kvm->arch.sca_lock); - - sca_copy_b_to_e(new_sca, old_sca); - - kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) { - vcpu->arch.sie_block->scaoh = scaoh; - vcpu->arch.sie_block->scaol = scaol; - vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - } - kvm->arch.sca = new_sca; - kvm->arch.use_esca = 1; - - write_unlock(&kvm->arch.sca_lock); - kvm_s390_vcpu_unblock_all(kvm); - - free_page((unsigned long)old_sca); - - VM_EVENT(kvm, 2, "Switched to ESCA (0x%p -> 0x%p)", - old_sca, kvm->arch.sca); - return 0; -} - static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) { - int rc; + if (!kvm_s390_use_sca_entries()) + return id < KVM_MAX_VCPUS; - if (!kvm_s390_use_sca_entries()) { - if (id < KVM_MAX_VCPUS) - return true; - return false; - } - if (id < KVM_S390_BSCA_CPU_SLOTS) - return true; - if (!sclp.has_esca || 
!sclp.has_64bscao) - return false; - - rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); - - return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; + return id < KVM_S390_ESCA_CPU_SLOTS; } /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ @@ -3920,7 +3818,7 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->eca |= ECA_IB; if (sclp.has_siif) vcpu->arch.sie_block->eca |= ECA_SII; - if (sclp.has_sigpif) + if (kvm_s390_use_sca_entries()) vcpu->arch.sie_block->eca |= ECA_SIGPI; if (test_kvm_facility(vcpu->kvm, 129)) { vcpu->arch.sie_block->eca |= ECA_VX; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c44fe0c3a097..65c950760993 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -570,13 +570,6 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); -/* support for Basic/Extended SCA handling */ -static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) -{ - struct bsca_block *sca = kvm->arch.sca; /* SCA version doesn't matter */ - - return &sca->ipte_control; -} static inline int kvm_s390_use_sca_entries(void) { /* @@ -584,7 +577,7 @@ static inline int kvm_s390_use_sca_entries(void) * might use the entries. By not setting the entries and keeping them * invalid, hardware will not access them but intercept. */ - return sclp.has_sigpif; + return sclp.has_sigpif && sclp.has_esca; } void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, struct mcck_volatile_info *mcck_info);

From 14542a0a54c5c84eebd9255e26cce9b1c15d9571 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss
Date: Thu, 5 Jun 2025 18:14:05 +0200
Subject: [PATCH 02/10] KVM: s390: Remove sca_lock

Since we are no longer switching from a BSCA to an ESCA, we can get
rid of the sca_lock entirely; the write lock was only taken for that
conversion. With the lock removed, some local code cleanups are
possible.
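With the rwlock gone, the ipte_control word in the SCA header is
serialized purely by the compare-and-swap retry loops that gaccess.c
already used under the lock. A minimal standalone sketch of that
pattern, using C11 atomics in place of the kernel's try_cmpxchg()
(names and the single lock bit are illustrative):

#include <stdatomic.h>
#include <sched.h>

static _Atomic unsigned long ipte_word;	/* stand-in for ipte_control */
#define K_BIT 1UL			/* stand-in for the "k" bit */

static void ipte_lock_sketch(void)
{
	unsigned long old, new;
retry:
	old = atomic_load(&ipte_word);
	do {
		if (old & K_BIT) {	/* already held by someone else */
			sched_yield();	/* stand-in for cond_resched() */
			goto retry;
		}
		new = old | K_BIT;
	} while (!atomic_compare_exchange_weak(&ipte_word, &old, new));
}

static void ipte_unlock_sketch(void)
{
	atomic_fetch_and(&ipte_word, ~K_BIT);
}

int main(void)
{
	ipte_lock_sketch();
	ipte_unlock_sketch();
	return 0;
}

Because every update goes through a single atomic compare-and-swap,
readers and writers never needed the sca_lock for consistency; the
lock only guarded the pointer swap during the BSCA-to-ESCA conversion.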
Signed-off-by: Christoph Schlameuss Suggested-by: Janosch Frank [frankja@linux.ibm.com: Added suggested-by tag as discussed on list] Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kvm/gaccess.c | 19 ++--------------- arch/s390/kvm/interrupt.c | 36 ++++++++------------------------ arch/s390/kvm/kvm-s390.c | 34 ++++++++---------------------- 4 files changed, 20 insertions(+), 70 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3cf14dd75409..22cedcaea475 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -633,7 +633,6 @@ struct kvm_s390_pv { struct kvm_arch { struct esca_block *sca; - rwlock_t sca_lock; debug_info_t *dbf; struct kvm_s390_float_interrupt float_int; struct kvm_device *flic; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 3651ab682fd7..41ca6b0ee7a9 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -109,14 +109,9 @@ struct aste { int ipte_lock_held(struct kvm *kvm) { - if (sclp.has_siif) { - int rc; + if (sclp.has_siif) + return kvm->arch.sca->ipte_control.kh != 0; - read_lock(&kvm->arch.sca_lock); - rc = kvm->arch.sca->ipte_control.kh != 0; - read_unlock(&kvm->arch.sca_lock); - return rc; - } return kvm->arch.ipte_lock_count != 0; } @@ -129,19 +124,16 @@ static void ipte_lock_simple(struct kvm *kvm) if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&kvm->arch.sca_lock); ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.k) { - read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); out: mutex_unlock(&kvm->arch.ipte_mutex); } @@ -154,14 +146,12 @@ static void ipte_unlock_simple(struct kvm *kvm) kvm->arch.ipte_lock_count--; if (kvm->arch.ipte_lock_count) goto out; - read_lock(&kvm->arch.sca_lock); ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; new.k = 0; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); wake_up(&kvm->arch.ipte_wq); out: mutex_unlock(&kvm->arch.ipte_mutex); @@ -172,12 +162,10 @@ static void ipte_lock_siif(struct kvm *kvm) union ipte_control old, new, *ic; retry: - read_lock(&kvm->arch.sca_lock); ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.kg) { - read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -185,14 +173,12 @@ static void ipte_lock_siif(struct kvm *kvm) new.k = 1; new.kh++; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); } static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&kvm->arch.sca_lock); ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { @@ -201,7 +187,6 @@ static void ipte_unlock_siif(struct kvm *kvm) if (!new.kh) new.k = 0; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); if (!new.kh) wake_up(&kvm->arch.ipte_wq); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 36394ba897f5..220d9d00c23d 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -45,48 +45,34 @@ static struct kvm_s390_gib *gib; /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { - union esca_sigp_ctrl sigp_ctrl; - struct esca_block *sca; - int c, scn; + struct esca_block *sca = vcpu->kvm->arch.sca; + union 
esca_sigp_ctrl sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl; if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND)) return 0; BUG_ON(!kvm_s390_use_sca_entries()); - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; - sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl; - - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - read_unlock(&vcpu->kvm->arch.sca_lock); if (src_id) - *src_id = scn; + *src_id = sigp_ctrl.scn; - return c; + return sigp_ctrl.c; } static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { - union esca_sigp_ctrl old_val, new_val = {0}; - union esca_sigp_ctrl *sigp_ctrl; - struct esca_block *sca; + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; + union esca_sigp_ctrl old_val, new_val = {.scn = src_id, .c = 1}; int expect, rc; BUG_ON(!kvm_s390_use_sca_entries()); - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; - sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; old_val.c = 0; expect = old_val.value; rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - read_unlock(&vcpu->kvm->arch.sca_lock); if (rc != expect) { /* another external call is pending */ @@ -98,18 +84,14 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { - union esca_sigp_ctrl *sigp_ctrl; - struct esca_block *sca; + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; - sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; WRITE_ONCE(sigp_ctrl->value, 0); - read_unlock(&vcpu->kvm->arch.sca_lock); } int psw_extint_disabled(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 78468b96d250..769820e3a243 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1938,14 +1938,12 @@ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) union sca_utility new, old; struct esca_block *sca; - read_lock(&kvm->arch.sca_lock); sca = kvm->arch.sca; old = READ_ONCE(sca->utility); do { new = old; new.mtcr = val; } while (!try_cmpxchg(&sca->utility.val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); } static int kvm_s390_set_topo_change_indication(struct kvm *kvm, @@ -1966,9 +1964,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm, if (!test_kvm_facility(kvm, 11)) return -ENXIO; - read_lock(&kvm->arch.sca_lock); topo = kvm->arch.sca->utility.mtcr; - read_unlock(&kvm->arch.sca_lock); return put_user(topo, (u8 __user *)attr->addr); } @@ -3345,7 +3341,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!sclp.has_64bscao) alloc_flags |= GFP_DMA; - rwlock_init(&kvm->arch.sca_lock); mutex_lock(&kvm_lock); kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags); @@ -3530,41 +3525,30 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) static void sca_del_vcpu(struct kvm_vcpu *vcpu) { - struct esca_block *sca; + struct esca_block *sca = vcpu->kvm->arch.sca; if (!kvm_s390_use_sca_entries()) return; - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); sca->cpu[vcpu->vcpu_id].sda = 0; - read_unlock(&vcpu->kvm->arch.sca_lock); } static void 
sca_add_vcpu(struct kvm_vcpu *vcpu) { - struct esca_block *sca; - phys_addr_t sca_phys; + struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - if (!kvm_s390_use_sca_entries()) { - sca_phys = virt_to_phys(vcpu->kvm->arch.sca); - - /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys; - return; - } - read_lock(&vcpu->kvm->arch.sca_lock); - sca = vcpu->kvm->arch.sca; - sca_phys = virt_to_phys(sca); - - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + /* we still need the sca header for the ipte control */ vcpu->arch.sie_block->scaoh = sca_phys >> 32; vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; + + if (!kvm_s390_use_sca_entries()) + return; + set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); - read_unlock(&vcpu->kvm->arch.sca_lock); + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); } static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) From 7d5136ed1b218f7d68e15ff7a6d000a7ff3cce0f Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 16 Sep 2025 15:12:40 +0200 Subject: [PATCH 03/10] KVM: s390: Remove unused return variable in kvm_arch_vcpu_ioctl_set_fpu kvm_arch_vcpu_ioctl_set_fpu() always returns 0 and the local return variable 'ret' is not used anymore. Remove it. Signed-off-by: Thorsten Blum Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 769820e3a243..677aa5c7d226 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4249,8 +4249,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - int ret = 0; - vcpu_load(vcpu); vcpu->run->s.regs.fpc = fpu->fpc; @@ -4261,7 +4259,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); vcpu_put(vcpu); - return ret; + return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) From f5a6fa189ad2e73d25e31de3cc7f0bf81907c986 Mon Sep 17 00:00:00 2001 From: Josephine Pfeiffer Date: Wed, 1 Oct 2025 19:40:46 +0200 Subject: [PATCH 04/10] KVM: s390: Replace sprintf with snprintf for buffer safety Replace sprintf() with snprintf() when formatting debug names to prevent potential buffer overflow. The debug_name buffer is 16 bytes, and while unlikely to overflow with current PIDs, using snprintf() provides proper bounds checking. 
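A small userspace demonstration of the difference (standalone, with an
illustrative PID value; the kernel's PID_MAX_LIMIT is 2^22):

#include <stdio.h>

int main(void)
{
	char debug_name[16];
	/* snprintf() never writes past the buffer and always
	 * NUL-terminates; the return value is the length that would
	 * have been needed, so truncation is detectable. */
	int need = snprintf(debug_name, sizeof(debug_name),
			    "kvm-%u", 4194303u);
	printf("%s (needed %d bytes)\n", debug_name, need + 1);
	return 0;
}

With sprintf() an oversized formatted string would silently overrun
debug_name; with snprintf() the worst case is a truncated name.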
Signed-off-by: Josephine Pfeiffer [frankja@linux.ibm.com: Fixed subject prefix] Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 677aa5c7d226..70ebc54b1bb1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3348,7 +3348,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.sca) goto out_err; - sprintf(debug_name, "kvm-%u", current->pid); + snprintf(debug_name, sizeof(debug_name), "kvm-%u", current->pid); kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long)); if (!kvm->arch.dbf) From 182a258b5ec4a45170e776d3a0c0bccfc4fab998 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Wed, 13 Aug 2025 11:04:31 +0000 Subject: [PATCH 05/10] Documentation: kvm: Fix ordering 7.43 has been assigned twice, make KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 7.44. Fixes: f55ce5a6cd33 ("KVM: arm64: Expose new KVM cap for cacheable PFNMAP") Reviewed-by: Ankit Agrawal Signed-off-by: Janosch Frank --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 57061fa29e6a..72b2fae99a83 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8692,7 +8692,7 @@ given VM. When this capability is enabled, KVM resets the VCPU when setting MP_STATE_INIT_RECEIVED through IOCTL. The original MP_STATE is preserved. -7.43 KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED +7.44 KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED ------------------------------------------- :Architectures: arm64 From 8e8678e740ecde2ae4a0404fd9b4ed2b726e236d Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 8 Jul 2025 12:57:57 +0000 Subject: [PATCH 06/10] KVM: s390: Add capability that forwards operation exceptions Setting KVM_CAP_S390_USER_OPEREXEC will forward all operation exceptions to user space. This also includes the 0x0000 instructions managed by KVM_CAP_S390_USER_INSTR0. It's helpful if user space wants to emulate instructions which do not (yet) have an opcode. While we're at it refine the documentation for KVM_CAP_S390_USER_INSTR0. Signed-off-by: Janosch Frank Reviewed-by: Claudio Imbrenda Acked-by: Christian Borntraeger Signed-off-by: Janosch Frank --- Documentation/virt/kvm/api.rst | 17 ++- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/intercept.c | 3 + arch/s390/kvm/kvm-s390.c | 7 + include/uapi/linux/kvm.h | 1 + tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/s390/user_operexec.c | 140 ++++++++++++++++++ 7 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/kvm/s390/user_operexec.c diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 72b2fae99a83..1bc2a84c59ee 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7820,7 +7820,7 @@ where 0xff represents CPUs 0-7 in cluster 0. :Architectures: s390 :Parameters: none -With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +With this capability enabled, the illegal instruction 0x0000 (2 bytes) will be intercepted and forwarded to user space. User space can use this mechanism e.g. to realize 2-byte software breakpoints. The kernel will not inject an operating exception for these instructions, user space has @@ -8703,6 +8703,21 @@ This capability indicate to the userspace whether a PFNMAP memory region can be safely mapped as cacheable. 
This relies on the presence of force write back (FWB) feature support on the hardware. +7.45 KVM_CAP_S390_USER_OPEREXEC +------------------------------- + +:Architectures: s390 +:Parameters: none + +When this capability is enabled KVM forwards all operation exceptions +that it doesn't handle itself to user space. This also includes the +0x0000 instructions managed by KVM_CAP_S390_USER_INSTR0. This is +helpful if user space wants to emulate instructions which are not +(yet) implemented in hardware. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. + 8. Other capabilities. ====================== diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 22cedcaea475..1e4829c70216 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -648,6 +648,7 @@ struct kvm_arch { int user_sigp; int user_stsi; int user_instr0; + int user_operexec; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index c7908950c1f4..420ae62977e2 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -471,6 +471,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->ipa == 0xb256) return handle_sthyi(vcpu); + if (vcpu->kvm->arch.user_operexec) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) return -EOPNOTSUPP; rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t)); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 70ebc54b1bb1..56d4730b7c41 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -606,6 +606,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_S390_USER_OPEREXEC: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -921,6 +922,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", r ? 
"(not available)" : "(success)"); break; + case KVM_CAP_S390_USER_OPEREXEC: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_OPEREXEC"); + kvm->arch.user_operexec = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52f6000ab020..8ab07396ce3b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -963,6 +963,7 @@ struct kvm_enable_cap { #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 #define KVM_CAP_GUEST_MEMFD_FLAGS 244 +#define KVM_CAP_S390_USER_OPEREXEC 245 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..87e429206bb8 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -194,6 +194,7 @@ TEST_GEN_PROGS_s390 += s390/debug_test TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test TEST_GEN_PROGS_s390 += s390/shared_zeropage_test TEST_GEN_PROGS_s390 += s390/ucontrol_test +TEST_GEN_PROGS_s390 += s390/user_operexec TEST_GEN_PROGS_s390 += rseq_test TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON) diff --git a/tools/testing/selftests/kvm/s390/user_operexec.c b/tools/testing/selftests/kvm/s390/user_operexec.c new file mode 100644 index 000000000000..714906c1d12a --- /dev/null +++ b/tools/testing/selftests/kvm/s390/user_operexec.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test operation exception forwarding. + * + * Copyright IBM Corp. 2025 + * + * Authors: + * Janosch Frank + */ +#include "kselftest.h" +#include "kvm_util.h" +#include "test_util.h" +#include "sie.h" + +#include + +static void guest_code_instr0(void) +{ + asm(".word 0x0000"); +} + +static void test_user_instr0(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0); + + kvm_vm_free(vm); +} + +static void guest_code_user_operexec(void) +{ + asm(".word 0x0807"); +} + +static void test_user_operexec(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); + + /* + * Since user_operexec is the superset it can be used for the + * 0 instruction. 
+ */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0); + + kvm_vm_free(vm); +} + +/* combine user_instr0 and user_operexec */ +static void test_user_operexec_combined(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); + + /* Reverse enablement order */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); +} + +/* + * Run all tests above. + * + * Enablement after VCPU has been added is automatically tested since + * we enable the capability after VCPU creation. + */ +static struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "instr0", test_user_instr0 }, + { "operexec", test_user_operexec }, + { "operexec_combined", test_user_operexec_combined}, +}; + +int main(int argc, char *argv[]) +{ + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_USER_INSTR0)); + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + ksft_finished(); +} From 44acac00be5dbda58f337acda41148d39743075c Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 7 Nov 2025 03:49:27 +0100 Subject: [PATCH 07/10] KVM: s390: vsie: Check alignment of BSCA header The VSIE code currently checks that the BSCA struct fits within a page, and returns a validity exception 0x003b if it doesn't. The BSCA is pinned in memory rather than shadowed (see block comment at end of kvm_s390_cpu_feat_init()), so enforcing the CPU entries to be on the same pinned page makes sense. Except those entries aren't going to be used below the guest, and according to the definition of that validity exception only the header of the BSCA (everything but the CPU entries) needs to be within a page. Adjust the alignment check to account for that. 
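The relaxed check can be tried out in a standalone sketch (4 KiB
pages; the 2112-byte block and 64-byte header sizes are illustrative
stand-ins for sizeof(struct bsca_block) and offsetof(struct
bsca_block, cpu[0])):

#include <stdio.h>

#define PAGE_MASK (~0xfffUL)	/* 4 KiB pages */

static int crosses_page(unsigned long gpa, unsigned long len)
{
	return (gpa & PAGE_MASK) != ((gpa + len - 1) & PAGE_MASK);
}

int main(void)
{
	unsigned long gpa = 0x10f80;	/* 128 bytes before a page end */

	/* old check: the whole bsca_block must stay on one page */
	printf("whole block crosses: %d\n", crosses_page(gpa, 2112));
	/* new check: only the header before cpu[0] matters */
	printf("header crosses:      %d\n", crosses_page(gpa, 64));
	return 0;
}

A BSCA placed 128 bytes before a page boundary fails the old check but
passes the new one, since only its header has to sit on the single
pinned page.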
Signed-off-by: Eric Farman
Reviewed-by: Christian Borntraeger
Reviewed-by: Christoph Schlameuss
Signed-off-by: Janosch Frank
---
 arch/s390/kvm/vsie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 347268f89f2f..d23ab5120888 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -782,7 +782,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) rc = set_validity_icpt(scb_s, 0x0011U); else if ((gpa & PAGE_MASK) != - ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + ((gpa + offsetof(struct bsca_block, cpu[0]) - 1) & PAGE_MASK)) rc = set_validity_icpt(scb_s, 0x003bU); if (!rc) { rc = pin_guest_page(vcpu->kvm, gpa, &hpa);

From c067847c52e26eceed9f8a938c04456880c486fa Mon Sep 17 00:00:00 2001
From: Andrew Donnellan
Date: Wed, 26 Nov 2025 16:33:10 +1100
Subject: [PATCH 08/10] KVM: s390: Add signal_exits counter

Add a signal_exits counter for s390, as exists on arm64, loongarch,
mips, powerpc, riscv and x86. This is used by kvm_handle_signal_exit(),
which we will use when we later enable CONFIG_VIRT_XFER_TO_GUEST_WORK.

Signed-off-by: Andrew Donnellan
Reviewed-by: Janosch Frank
Signed-off-by: Janosch Frank
---
 arch/s390/include/asm/kvm_host.h | 1 +
 arch/s390/kvm/kvm-s390.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 1e4829c70216..ae1223264d3c 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -146,6 +146,7 @@ struct kvm_vcpu_stat { u64 instruction_diagnose_500; u64 instruction_diagnose_other; u64 pfault_sync; + u64 signal_exits; }; #define PGM_OPERATION 0x01 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 56d4730b7c41..8db37e508a71 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -185,7 +185,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_diagnose_308), STATS_DESC_COUNTER(VCPU, instruction_diagnose_500), STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), - STATS_DESC_COUNTER(VCPU, pfault_sync) + STATS_DESC_COUNTER(VCPU, pfault_sync), + STATS_DESC_COUNTER(VCPU, signal_exits) }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -5251,6 +5252,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (signal_pending(current) && !rc) { kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->stat.signal_exits++; rc = -EINTR; }

From d0139059e31acd5fea49737558297d801c406638 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Wed, 26 Nov 2025 16:33:11 +1100
Subject: [PATCH 09/10] KVM: s390: Enable and disable interrupts in entry code

Move enabling and disabling of interrupts around the SIE instruction
into the entry code. Enabling interrupts only after the __TI_sie flag
has been set guarantees that the SIE instruction is not executed if an
interrupt happens between enabling interrupts and the execution of the
SIE instruction: interrupt handlers and the machine check handler
forward the PSW to the sie_exit label in such cases.

This is a prerequisite for VIRT_XFER_TO_GUEST_WORK, to prevent guest
context from being entered when e.g. a scheduler IPI, indicating that a
reschedule is required, arrives right before the SIE instruction, which
could lead to long delays.
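The ordering argument can be mimicked in userspace with POSIX signals
standing in for interrupts (an analogy only; all names are
illustrative, and the real mechanism is the stosm/stnsm pair in
entry.S):

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t about_to_enter_sie;

static void ipi_handler(int sig)
{
	(void)sig;
	if (about_to_enter_sie)		/* kernel: forward PSW to sie_exit */
		about_to_enter_sie = 0;
}

int main(void)
{
	sigset_t set;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* "interrupts disabled" */
	signal(SIGUSR1, ipi_handler);
	raise(SIGUSR1);				/* a pending "IPI" */

	about_to_enter_sie = 1;			/* set the flag first... */
	sigprocmask(SIG_UNBLOCK, &set, NULL);	/* ...then open the window;
						   the pending signal is
						   delivered right here */
	if (about_to_enter_sie)
		puts("entering SIE");
	else
		puts("guest entry diverted");	/* always taken here */
	return 0;
}

Because the flag is published before the window opens, there is no
instant at which a pending event can slip through unnoticed between
"interrupts enabled" and "SIE executed".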
Signed-off-by: Heiko Carstens Tested-by: Andrew Donnellan Signed-off-by: Andrew Donnellan Reviewed-by: Janosch Frank Signed-off-by: Janosch Frank --- arch/s390/include/asm/stacktrace.h | 1 + arch/s390/kernel/asm-offsets.c | 1 + arch/s390/kernel/entry.S | 2 ++ arch/s390/kvm/kvm-s390.c | 5 ----- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index 810a6b9d9628..c9ae680a28af 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -66,6 +66,7 @@ struct stack_frame { unsigned long sie_flags; unsigned long sie_control_block_phys; unsigned long sie_guest_asce; + unsigned long sie_irq; }; }; unsigned long gprs[10]; diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index a8915663e917..730449f464af 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -64,6 +64,7 @@ int main(void) OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce); + OFFSET(__SF_SIE_IRQ, stack_frame, sie_irq); DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 75b0fbb236d0..e906f4ab6cf3 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -189,6 +189,7 @@ SYM_FUNC_START(__sie64a) mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r14) # copy thread flags lmg %r0,%r13,0(%r4) # load guest gprs 0-13 mvi __TI_sie(%r14),1 + stosm __SF_SIE_IRQ(%r15),0x03 # enable interrupts lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now @@ -212,6 +213,7 @@ SYM_FUNC_START(__sie64a) lg %r14,__LC_CURRENT(%r14) mvi __TI_sie(%r14),0 SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) + stnsm __SF_SIE_IRQ(%r15),0xfc # disable interrupts lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 xgr %r0,%r0 # clear guest registers to diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8db37e508a71..4d13601ec217 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4962,13 +4962,8 @@ int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb, * The guest_state_{enter,exit}_irqoff() functions inform lockdep and * tracing that entry to the guest will enable host IRQs, and exit from * the guest will disable host IRQs. - * - * We must not use lockdep/tracing/RCU in this critical section, so we - * use the low-level arch_local_irq_*() helpers to enable/disable IRQs. */ - arch_local_irq_enable(); ret = sie64a(scb, gprs, gasce); - arch_local_irq_disable(); guest_state_exit_irqoff(); From 2bd1337a1295e012e60008ee21a64375e5234e12 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Wed, 26 Nov 2025 16:33:12 +1100 Subject: [PATCH 10/10] KVM: s390: Use generic VIRT_XFER_TO_GUEST_WORK functions Switch to using the generic infrastructure to check for and handle pending work before transitioning into guest mode. xfer_to_guest_mode_handle_work() does a few more things than the current code does when deciding whether or not to exit the __vcpu_run() loop. The exittime tests from kvm-unit-tests, in my tests, were within a few percent compared to before this series, which is within noise tolerance. 
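The shape of the change is easier to see when the goto-based hunks
below are restated as a loop (kernel-context sketch, not a standalone
program; the functions shown are exactly the ones the diff uses):

	for (;;) {
		local_irq_disable();
		xfer_to_guest_mode_prepare();
		if (!xfer_to_guest_mode_work_pending())
			break;	/* no work left: stay IRQ-off, enter SIE */
		local_irq_enable();
		rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
		if (rc)
			return rc;	/* e.g. a pending signal */
	}
	/* guest_timing_enter_irqoff() and SIE entry follow */

This replaces the open-coded need_resched(), cond_resched() and
signal_pending() checks: pending work is handled with interrupts
enabled, and guest entry only proceeds once no work remains while
interrupts are off.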
Co-developed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Andrew Donnellan Acked-by: Janosch Frank [frankja@linux.ibm.com: Removed semicolon] Signed-off-by: Janosch Frank --- arch/s390/kvm/Kconfig | 1 + arch/s390/kvm/kvm-s390.c | 25 ++++++++++++++++++------- arch/s390/kvm/vsie.c | 18 +++++++++++++----- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..0ca9d6587243 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -30,6 +30,7 @@ config KVM select HAVE_KVM_NO_POLL select KVM_VFIO select MMU_NOTIFIER + select VIRT_XFER_TO_GUEST_WORK help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4d13601ec217..d31155e371df 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include +#include #include #include #include @@ -4675,9 +4676,6 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14]; vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15]; - if (need_resched()) - schedule(); - if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); if (rc || guestdbg_exit_pending(vcpu)) @@ -4982,12 +4980,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) */ kvm_vcpu_srcu_read_lock(vcpu); - do { + while (true) { rc = vcpu_pre_run(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); if (rc || guestdbg_exit_pending(vcpu)) break; - kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between * guest_timing_enter_irqoff and guest_timing_exit_irqoff @@ -4999,7 +4997,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) sizeof(sie_page->pv_grregs)); } +xfer_to_guest_mode_check: local_irq_disable(); + xfer_to_guest_mode_prepare(); + if (xfer_to_guest_mode_work_pending()) { + local_irq_enable(); + rc = kvm_xfer_to_guest_mode_handle_work(vcpu); + if (rc) + break; + goto xfer_to_guest_mode_check; + } + guest_timing_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); @@ -5029,9 +5037,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_lock(vcpu); rc = vcpu_post_run(vcpu, exit_reason); - } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); + if (rc || guestdbg_exit_pending(vcpu)) { + kvm_vcpu_srcu_read_unlock(vcpu); + break; + } + } - kvm_vcpu_srcu_read_unlock(vcpu); return rc; } diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d23ab5120888..b526621d2a1b 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1180,12 +1180,23 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) current->thread.gmap_int_code = 0; barrier(); if (!kvm_s390_vcpu_sie_inhibited(vcpu)) { +xfer_to_guest_mode_check: local_irq_disable(); + xfer_to_guest_mode_prepare(); + if (xfer_to_guest_mode_work_pending()) { + local_irq_enable(); + rc = kvm_xfer_to_guest_mode_handle_work(vcpu); + if (rc) + goto skip_sie; + goto xfer_to_guest_mode_check; + } guest_timing_enter_irqoff(); rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce); guest_timing_exit_irqoff(); local_irq_enable(); } + +skip_sie: barrier(); vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE; @@ -1345,13 +1356,11 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * but rewind the PSW to re-enter SIE once that's completed * instead of 
passing a "no action" intercept to the guest. */ - if (signal_pending(current) || - kvm_s390_vcpu_has_irq(vcpu, 0) || + if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) { kvm_s390_rewind_psw(vcpu, 4); break; } - cond_resched(); } if (rc == -EFAULT) { @@ -1483,8 +1492,7 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (unlikely(scb_addr & 0x1ffUL)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || - kvm_s390_vcpu_sie_inhibited(vcpu)) { + if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) { kvm_s390_rewind_psw(vcpu, 4); return 0; }