Merge branch 'guest-memfd-mmap' into HEAD

Add support for host userspace mapping of guest_memfd-backed memory for VM
types that do NOT support KVM_MEMORY_ATTRIBUTE_PRIVATE (which isn't
precisely the same thing as CoCo VMs, since x86's SEV and SEV-ES have
no way to detect private vs. shared).

mmap() support paves the way for several evolving KVM use cases:

* Allows VMMs like Firecracker to run guests entirely backed by
  guest_memfd [1]. This provides a unified memory management model for
  both confidential and non-confidential guests, simplifying VMM design.

* Enhanced Security via direct map removal: When combined with Patrick's
  series for direct map removal [2], this provides additional hardening
  against Spectre-like transient execution attacks by eliminating the
  need for host kernel direct maps of guest memory.

* Lays the groundwork for *restricted* mmap() support for guest_memfd-backed
  memory on CoCo platforms [3] that permit in-place sharing of guest memory
  with the host.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
commit a6ad54137a
Author: Paolo Bonzini <pbonzini@redhat.com>
Date:   2025-08-27 04:37:40 -04:00

26 changed files with 648 additions and 214 deletions


@@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single
 guest_memfd range is not allowed (any number of memory regions can be bound to
 a single guest_memfd file, but the bound ranges must not overlap).
 
+When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field
+supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation
+enables mmap() and faulting of guest_memfd memory to host userspace.
+
+When the KVM MMU performs a PFN lookup to service a guest fault and the backing
+guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be
+consumed from guest_memfd, regardless of whether it is a shared or a private
+fault.
+
 See KVM_SET_USER_MEMORY_REGION2 for additional details.
 
 4.143 KVM_PRE_FAULT_MEMORY
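
To make the new flag concrete, here is a minimal userspace sketch (not part of
this diff) that checks KVM_CAP_GUEST_MEMFD_MMAP, creates a guest_memfd with
GUEST_MEMFD_FLAG_MMAP and maps it. The helper name is hypothetical, vm_fd is
assumed to come from the usual KVM_CREATE_VM boilerplate, and error handling is
trimmed.

#include <linux/kvm.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Hypothetical helper for illustration; names are not from this series. */
static void *map_guest_memfd(int vm_fd, size_t size, int *out_gmem_fd)
{
	struct kvm_create_guest_memfd gmem = {
		.size  = size,
		.flags = GUEST_MEMFD_FLAG_MMAP,
	};
	void *mem;

	/* KVM_CHECK_EXTENSION on the VM fd reports per-VM support. */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_MEMFD_MMAP) <= 0)
		return NULL;

	*out_gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
	if (*out_gmem_fd < 0)
		return NULL;

	/* Only MAP_SHARED is allowed; MAP_PRIVATE (copy-on-write) is rejected. */
	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   *out_gmem_fd, 0);
	return mem == MAP_FAILED ? NULL : mem;
}

The same fd is then handed to KVM via KVM_SET_USER_MEMORY_REGION2, as the
selftest changes further down illustrate.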


@@ -37,6 +37,7 @@ menuconfig KVM
 	select HAVE_KVM_VCPU_RUN_PID_CHANGE
 	select SCHED_INFO
 	select GUEST_PERF_EVENTS if PERF_EVENTS
+	select KVM_GUEST_MEMFD
 	help
 	  Support hosting virtualized guest machines.


@ -1477,13 +1477,132 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
} }
} }
static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
void **memcache)
{
int min_pages;
if (!is_protected_kvm_enabled())
*memcache = &vcpu->arch.mmu_page_cache;
else
*memcache = &vcpu->arch.pkvm_memcache;
if (!topup_memcache)
return 0;
min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
if (!is_protected_kvm_enabled())
return kvm_mmu_topup_memory_cache(*memcache, min_pages);
return topup_hyp_memcache(*memcache, min_pages);
}
/*
* Potentially reduce shadow S2 permissions to match the guest's own S2. For
* exec faults, we'd only reach this point if the guest actually allowed it (see
* kvm_s2_handle_perm_fault).
*
* Also encode the level of the original translation in the SW bits of the leaf
* entry as a proxy for the span of that translation. This will be retrieved on
* TLB invalidation from the guest and used to limit the invalidation scope if a
* TTL hint or a range isn't provided.
*/
static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
enum kvm_pgtable_prot *prot,
bool *writable)
{
*writable &= kvm_s2_trans_writable(nested);
if (!kvm_s2_trans_readable(nested))
*prot &= ~KVM_PGTABLE_PROT_R;
*prot |= kvm_encode_nested_level(nested);
}
#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, bool is_perm)
{
bool write_fault, exec_fault, writable;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
unsigned long mmu_seq;
struct page *page;
struct kvm *kvm = vcpu->kvm;
void *memcache;
kvm_pfn_t pfn;
gfn_t gfn;
int ret;
ret = prepare_mmu_memcache(vcpu, true, &memcache);
if (ret)
return ret;
if (nested)
gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
else
gfn = fault_ipa >> PAGE_SHIFT;
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_WARN_ON_ONCE(write_fault && exec_fault);
mmu_seq = kvm->mmu_invalidate_seq;
/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
smp_rmb();
ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
if (ret) {
kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
write_fault, exec_fault, false);
return ret;
}
writable = !(memslot->flags & KVM_MEM_READONLY);
if (nested)
adjust_nested_fault_perms(nested, &prot, &writable);
if (writable)
prot |= KVM_PGTABLE_PROT_W;
if (exec_fault ||
(cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
(!nested || kvm_s2_trans_executable(nested))))
prot |= KVM_PGTABLE_PROT_X;
kvm_fault_lock(kvm);
if (mmu_invalidate_retry(kvm, mmu_seq)) {
ret = -EAGAIN;
goto out_unlock;
}
ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
__pfn_to_phys(pfn), prot,
memcache, flags);
out_unlock:
kvm_release_faultin_page(kvm, page, !!ret, writable);
kvm_fault_unlock(kvm);
if (writable && !ret)
mark_page_dirty_in_slot(kvm, memslot, gfn);
return ret != -EAGAIN ? ret : 0;
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_s2_trans *nested, struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva, struct kvm_memory_slot *memslot, unsigned long hva,
bool fault_is_perm) bool fault_is_perm)
{ {
int ret = 0; int ret = 0;
bool write_fault, writable, force_pte = false; bool topup_memcache;
bool write_fault, writable;
bool exec_fault, mte_allowed, is_vma_cacheable; bool exec_fault, mte_allowed, is_vma_cacheable;
bool s2_force_noncacheable = false, vfio_allow_any_uc = false; bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
unsigned long mmu_seq; unsigned long mmu_seq;
@ -1495,28 +1614,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn_t gfn; gfn_t gfn;
kvm_pfn_t pfn; kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot); bool logging_active = memslot_is_logging(memslot);
bool force_pte = logging_active;
long vma_pagesize, fault_granule; long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt; struct kvm_pgtable *pgt;
struct page *page; struct page *page;
vm_flags_t vm_flags; vm_flags_t vm_flags;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
if (fault_is_perm) if (fault_is_perm)
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
write_fault = kvm_is_write_fault(vcpu); write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault); VM_WARN_ON_ONCE(write_fault && exec_fault);
if (fault_is_perm && !write_fault && !exec_fault) {
kvm_err("Unexpected L2 read permission error\n");
return -EFAULT;
}
if (!is_protected_kvm_enabled())
memcache = &vcpu->arch.mmu_page_cache;
else
memcache = &vcpu->arch.pkvm_memcache;
/* /*
* Permission faults just need to update the existing leaf entry, * Permission faults just need to update the existing leaf entry,
@ -1524,17 +1634,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* only exception to this is when dirty logging is enabled at runtime * only exception to this is when dirty logging is enabled at runtime
* and a write fault needs to collapse a block entry into a table. * and a write fault needs to collapse a block entry into a table.
*/ */
if (!fault_is_perm || (logging_active && write_fault)) { topup_memcache = !fault_is_perm || (logging_active && write_fault);
int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
if (ret)
if (!is_protected_kvm_enabled()) return ret;
ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
else
ret = topup_hyp_memcache(memcache, min_pages);
if (ret)
return ret;
}
/* /*
* Let's check if we will get back a huge page backed by hugetlbfs, or * Let's check if we will get back a huge page backed by hugetlbfs, or
@ -1548,16 +1651,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT; return -EFAULT;
} }
/* if (force_pte)
* logging_active is guaranteed to never be true for VM_PFNMAP
* memslots.
*/
if (logging_active) {
force_pte = true;
vma_shift = PAGE_SHIFT; vma_shift = PAGE_SHIFT;
} else { else
vma_shift = get_vma_page_shift(vma, hva); vma_shift = get_vma_page_shift(vma, hva);
}
switch (vma_shift) { switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED #ifndef __PAGETABLE_PMD_FOLDED
@ -1609,7 +1706,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
max_map_size = PAGE_SIZE; max_map_size = PAGE_SIZE;
force_pte = (max_map_size == PAGE_SIZE); force_pte = (max_map_size == PAGE_SIZE);
vma_pagesize = min(vma_pagesize, (long)max_map_size); vma_pagesize = min_t(long, vma_pagesize, max_map_size);
} }
/* /*
@ -1642,7 +1739,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
* with the smp_wmb() in kvm_mmu_invalidate_end(). * with the smp_wmb() in kvm_mmu_invalidate_end().
*/ */
mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmu_seq = kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm); mmap_read_unlock(current->mm);
pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
@ -1698,24 +1795,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault && s2_force_noncacheable) if (exec_fault && s2_force_noncacheable)
return -ENOEXEC; return -ENOEXEC;
/* if (nested)
* Potentially reduce shadow S2 permissions to match the guest's own adjust_nested_fault_perms(nested, &prot, &writable);
* S2. For exec faults, we'd only reach this point if the guest
* actually allowed it (see kvm_s2_handle_perm_fault).
*
* Also encode the level of the original translation in the SW bits
* of the leaf entry as a proxy for the span of that translation.
* This will be retrieved on TLB invalidation from the guest and
* used to limit the invalidation scope if a TTL hint or a range
* isn't provided.
*/
if (nested) {
writable &= kvm_s2_trans_writable(nested);
if (!kvm_s2_trans_readable(nested))
prot &= ~KVM_PGTABLE_PROT_R;
prot |= kvm_encode_nested_level(nested);
}
kvm_fault_lock(kvm); kvm_fault_lock(kvm);
pgt = vcpu->arch.hw_mmu->pgt; pgt = vcpu->arch.hw_mmu->pgt;
@ -1981,8 +2062,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock; goto out_unlock;
} }
ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
esr_fsc_is_permission_fault(esr)); !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
if (kvm_slot_has_gmem(memslot))
ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
esr_fsc_is_permission_fault(esr));
else
ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
esr_fsc_is_permission_fault(esr));
if (ret == 0) if (ret == 0)
ret = 1; ret = 1;
out: out:
@ -2214,6 +2302,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
return -EFAULT; return -EFAULT;
/*
* Only support guest_memfd backed memslots with mappable memory, since
* there aren't any CoCo VMs that support only private memory on arm64.
*/
if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
return -EINVAL;
hva = new->userspace_addr; hva = new->userspace_addr;
reg_end = hva + (new->npages << PAGE_SHIFT); reg_end = hva + (new->npages << PAGE_SHIFT);


@ -1172,8 +1172,9 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48); return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
} }
static int kvm_translate_vncr(struct kvm_vcpu *vcpu) static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
{ {
struct kvm_memory_slot *memslot;
bool write_fault, writable; bool write_fault, writable;
unsigned long mmu_seq; unsigned long mmu_seq;
struct vncr_tlb *vt; struct vncr_tlb *vt;
@ -1216,10 +1217,25 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
smp_rmb(); smp_rmb();
gfn = vt->wr.pa >> PAGE_SHIFT; gfn = vt->wr.pa >> PAGE_SHIFT;
pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page); memslot = gfn_to_memslot(vcpu->kvm, gfn);
if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) if (!memslot)
return -EFAULT; return -EFAULT;
*is_gmem = kvm_slot_has_gmem(memslot);
if (!*is_gmem) {
pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
&writable, &page);
if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
return -EFAULT;
} else {
ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
if (ret) {
kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
write_fault, false, false);
return ret;
}
}
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
return -EAGAIN; return -EAGAIN;
@ -1292,23 +1308,36 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
if (esr_fsc_is_permission_fault(esr)) { if (esr_fsc_is_permission_fault(esr)) {
inject_vncr_perm(vcpu); inject_vncr_perm(vcpu);
} else if (esr_fsc_is_translation_fault(esr)) { } else if (esr_fsc_is_translation_fault(esr)) {
bool valid; bool valid, is_gmem = false;
int ret; int ret;
scoped_guard(read_lock, &vcpu->kvm->mmu_lock) scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
valid = kvm_vncr_tlb_lookup(vcpu); valid = kvm_vncr_tlb_lookup(vcpu);
if (!valid) if (!valid)
ret = kvm_translate_vncr(vcpu); ret = kvm_translate_vncr(vcpu, &is_gmem);
else else
ret = -EPERM; ret = -EPERM;
switch (ret) { switch (ret) {
case -EAGAIN: case -EAGAIN:
case -ENOMEM:
/* Let's try again... */ /* Let's try again... */
break; break;
case -ENOMEM:
/*
* For guest_memfd, this indicates that it failed to
* create a folio to back the memory. Inform userspace.
*/
if (is_gmem)
return 0;
/* Otherwise, let's try again... */
break;
case -EFAULT: case -EFAULT:
case -EIO:
case -EHWPOISON:
if (is_gmem)
return 0;
fallthrough;
case -EINVAL: case -EINVAL:
case -ENOENT: case -ENOENT:
case -EACCES: case -EACCES:


@@ -145,7 +145,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
 KVM_X86_OP_OPTIONAL(get_untagged_addr)
 KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
 KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
-KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
+KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level)
 KVM_X86_OP_OPTIONAL(gmem_invalidate)
 
 #undef KVM_X86_OP


@@ -1922,7 +1922,7 @@ struct kvm_x86_ops {
 	void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
 	int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
 	void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
-	int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
+	int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
 };
 
 struct kvm_x86_nested_ops {
@@ -2276,10 +2276,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
 		       int tdp_max_root_level, int tdp_huge_page_level);
 
-#ifdef CONFIG_KVM_PRIVATE_MEM
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
 #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
-#else
-#define kvm_arch_has_private_mem(kvm) false
 #endif
 
 #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)


@ -46,8 +46,8 @@ config KVM_X86
select HAVE_KVM_PM_NOTIFIER if PM select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_HARDWARE_ENABLING
select KVM_GENERIC_PRE_FAULT_MEMORY select KVM_GENERIC_PRE_FAULT_MEMORY
select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
select KVM_WERROR if WERROR select KVM_WERROR if WERROR
select KVM_GUEST_MEMFD if X86_64
config KVM config KVM
tristate "Kernel-based Virtual Machine (KVM) support" tristate "Kernel-based Virtual Machine (KVM) support"
@ -74,7 +74,7 @@ config KVM_WERROR
# FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning.
# Building KVM with -Werror and KASAN is still doable via enabling # Building KVM with -Werror and KASAN is still doable via enabling
# the kernel-wide WERROR=y. # the kernel-wide WERROR=y.
depends on KVM && ((EXPERT && !KASAN) || WERROR) depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR)
help help
Add -Werror to the build flags for KVM. Add -Werror to the build flags for KVM.
@ -83,7 +83,8 @@ config KVM_WERROR
config KVM_SW_PROTECTED_VM config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs" bool "Enable support for KVM software-protected VMs"
depends on EXPERT depends on EXPERT
depends on KVM && X86_64 depends on KVM_X86 && X86_64
select KVM_GENERIC_MEMORY_ATTRIBUTES
help help
Enable support for KVM software-protected VMs. Currently, software- Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for protected VMs are purely a development and testing vehicle for
@ -95,8 +96,6 @@ config KVM_SW_PROTECTED_VM
config KVM_INTEL config KVM_INTEL
tristate "KVM for Intel (and compatible) processors support" tristate "KVM for Intel (and compatible) processors support"
depends on KVM && IA32_FEAT_CTL depends on KVM && IA32_FEAT_CTL
select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
help help
Provides support for KVM on processors equipped with Intel's VT Provides support for KVM on processors equipped with Intel's VT
extensions, a.k.a. Virtual Machine Extensions (VMX). extensions, a.k.a. Virtual Machine Extensions (VMX).
@ -135,6 +134,8 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support" bool "Intel Trust Domain Extensions (TDX) support"
default y default y
depends on INTEL_TDX_HOST depends on INTEL_TDX_HOST
select KVM_GENERIC_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_POPULATE
help help
Provides support for launching Intel Trust Domain Extensions (TDX) Provides support for launching Intel Trust Domain Extensions (TDX)
confidential VMs on Intel processors. confidential VMs on Intel processors.
@ -157,9 +158,10 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64 depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM select ARCH_HAS_CC_PLATFORM
select KVM_GENERIC_PRIVATE_MEM select KVM_GENERIC_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE select HAVE_KVM_ARCH_GMEM_INVALIDATE
select HAVE_KVM_ARCH_GMEM_POPULATE
help help
Provides support for launching encrypted VMs which use Secure Provides support for launching encrypted VMs which use Secure
Encrypted Virtualization (SEV), Secure Encrypted Virtualization with Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
@ -169,7 +171,7 @@ config KVM_AMD_SEV
config KVM_IOAPIC config KVM_IOAPIC
bool "I/O APIC, PIC, and PIT emulation" bool "I/O APIC, PIC, and PIT emulation"
default y default y
depends on KVM depends on KVM_X86
help help
Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e. Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
for full in-kernel APIC emulation. for full in-kernel APIC emulation.
@ -179,7 +181,7 @@ config KVM_IOAPIC
config KVM_SMM config KVM_SMM
bool "System Management Mode emulation" bool "System Management Mode emulation"
default y default y
depends on KVM depends on KVM_X86
help help
Provides support for KVM to emulate System Management Mode (SMM) Provides support for KVM to emulate System Management Mode (SMM)
in virtual machines. This can be used by the virtual machine in virtual machines. This can be used by the virtual machine
@ -189,7 +191,7 @@ config KVM_SMM
config KVM_HYPERV config KVM_HYPERV
bool "Support for Microsoft Hyper-V emulation" bool "Support for Microsoft Hyper-V emulation"
depends on KVM depends on KVM_X86
default y default y
help help
Provides KVM support for emulating Microsoft Hyper-V. This allows KVM Provides KVM support for emulating Microsoft Hyper-V. This allows KVM
@ -203,7 +205,7 @@ config KVM_HYPERV
config KVM_XEN config KVM_XEN
bool "Support for Xen hypercall interface" bool "Support for Xen hypercall interface"
depends on KVM depends on KVM_X86
help help
Provides KVM support for the hosting Xen HVM guests and Provides KVM support for the hosting Xen HVM guests and
passing Xen hypercalls to userspace. passing Xen hypercalls to userspace.
@ -213,7 +215,7 @@ config KVM_XEN
config KVM_PROVE_MMU config KVM_PROVE_MMU
bool "Prove KVM MMU correctness" bool "Prove KVM MMU correctness"
depends on DEBUG_KERNEL depends on DEBUG_KERNEL
depends on KVM depends on KVM_X86
depends on EXPERT depends on EXPERT
help help
Enables runtime assertions in KVM's MMU that are too costly to enable Enables runtime assertions in KVM's MMU that are too costly to enable
@ -228,7 +230,7 @@ config KVM_EXTERNAL_WRITE_TRACKING
config KVM_MAX_NR_VCPUS config KVM_MAX_NR_VCPUS
int "Maximum number of vCPUs per KVM guest" int "Maximum number of vCPUs per KVM guest"
depends on KVM depends on KVM_X86
range 1024 4096 range 1024 4096
default 4096 if MAXSMP default 4096 if MAXSMP
default 1024 default 1024


@ -3285,12 +3285,72 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
return level; return level;
} }
static int __kvm_mmu_max_mapping_level(struct kvm *kvm, static u8 kvm_max_level_for_order(int order)
const struct kvm_memory_slot *slot, {
gfn_t gfn, int max_level, bool is_private) BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
return PG_LEVEL_1G;
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
return PG_LEVEL_2M;
return PG_LEVEL_4K;
}
static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
const struct kvm_memory_slot *slot, gfn_t gfn,
bool is_private)
{
u8 max_level, coco_level;
kvm_pfn_t pfn;
/* For faults, use the gmem information that was resolved earlier. */
if (fault) {
pfn = fault->pfn;
max_level = fault->max_level;
} else {
/* TODO: Call into guest_memfd once hugepages are supported. */
WARN_ONCE(1, "Get pfn+order from guest_memfd");
pfn = KVM_PFN_ERR_FAULT;
max_level = PG_LEVEL_4K;
}
if (max_level == PG_LEVEL_4K)
return max_level;
/*
* CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
* restrictions. A return of '0' means "no additional restrictions", to
* allow for using an optional "ret0" static call.
*/
coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private);
if (coco_level)
max_level = min(max_level, coco_level);
return max_level;
}
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
const struct kvm_memory_slot *slot, gfn_t gfn)
{ {
struct kvm_lpage_info *linfo; struct kvm_lpage_info *linfo;
int host_level; int host_level, max_level;
bool is_private;
lockdep_assert_held(&kvm->mmu_lock);
if (fault) {
max_level = fault->max_level;
is_private = fault->is_private;
} else {
max_level = PG_LEVEL_NUM;
is_private = kvm_mem_is_private(kvm, gfn);
}
max_level = min(max_level, max_huge_page_level); max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) { for ( ; max_level > PG_LEVEL_4K; max_level--) {
@ -3299,25 +3359,17 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
break; break;
} }
if (is_private)
return max_level;
if (max_level == PG_LEVEL_4K) if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K; return PG_LEVEL_4K;
host_level = host_pfn_mapping_level(kvm, gfn, slot); if (is_private || kvm_memslot_is_gmem_only(slot))
host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn,
is_private);
else
host_level = host_pfn_mapping_level(kvm, gfn, slot);
return min(host_level, max_level); return min(host_level, max_level);
} }
int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn)
{
bool is_private = kvm_slot_can_be_private(slot) &&
kvm_mem_is_private(kvm, gfn);
return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
}
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{ {
struct kvm_memory_slot *slot = fault->slot; struct kvm_memory_slot *slot = fault->slot;
@ -3338,9 +3390,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* Enforce the iTLB multihit workaround after capturing the requested * Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting. * level, which will be used to do precise, accurate accounting.
*/ */
fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
fault->gfn, fault->max_level, fault->slot, fault->gfn);
fault->is_private);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return; return;
@ -4503,42 +4554,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
vcpu->stat.pf_fixed++; vcpu->stat.pf_fixed++;
} }
static inline u8 kvm_max_level_for_order(int order)
{
BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
return PG_LEVEL_1G;
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
return PG_LEVEL_2M;
return PG_LEVEL_4K;
}
static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
u8 max_level, int gmem_order)
{
u8 req_max_level;
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
max_level = min(kvm_max_level_for_order(gmem_order), max_level);
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
if (req_max_level)
max_level = min(max_level, req_max_level);
return max_level;
}
static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, int r) struct kvm_page_fault *fault, int r)
{ {
@ -4546,12 +4561,12 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
r == RET_PF_RETRY, fault->map_writable); r == RET_PF_RETRY, fault->map_writable);
} }
static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault) struct kvm_page_fault *fault)
{ {
int max_order, r; int max_order, r;
if (!kvm_slot_can_be_private(fault->slot)) { if (!kvm_slot_has_gmem(fault->slot)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault); kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT; return -EFAULT;
} }
@ -4564,8 +4579,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
} }
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, fault->max_level = kvm_max_level_for_order(max_order);
fault->max_level, max_order);
return RET_PF_CONTINUE; return RET_PF_CONTINUE;
} }
@ -4575,8 +4589,8 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
{ {
unsigned int foll = fault->write ? FOLL_WRITE : 0; unsigned int foll = fault->write ? FOLL_WRITE : 0;
if (fault->is_private) if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot))
return kvm_mmu_faultin_pfn_private(vcpu, fault); return kvm_mmu_faultin_pfn_gmem(vcpu, fault);
foll |= FOLL_NOWAIT; foll |= FOLL_NOWAIT;
fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
@ -7165,7 +7179,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
* mapping if the indirect sp has level = 1. * mapping if the indirect sp has level = 1.
*/ */
if (sp->role.direct && if (sp->role.direct &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_remote_tlbs_range()) if (kvm_available_flush_remote_tlbs_range())


@@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	return r;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
 			      const struct kvm_memory_slot *slot, gfn_t gfn);
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);


@@ -1813,7 +1813,7 @@ static void recover_huge_pages_range(struct kvm *kvm,
 		if (iter.gfn < start || iter.gfn >= end)
 			continue;
 
-		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
+		max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
 		if (max_mapping_level < iter.level)
 			continue;


@@ -2361,7 +2361,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	mutex_lock(&kvm->slots_lock);
 
 	memslot = gfn_to_memslot(kvm, params.gfn_start);
-	if (!kvm_slot_can_be_private(memslot)) {
+	if (!kvm_slot_has_gmem(memslot)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -4715,7 +4715,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
 	}
 
 	slot = gfn_to_memslot(kvm, gfn);
-	if (!kvm_slot_can_be_private(slot)) {
+	if (!kvm_slot_has_gmem(slot)) {
 		pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
 				    gpa);
 		return;
@@ -4943,7 +4943,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
 	}
 }
 
-int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
 {
 	int level, rc;
 	bool assigned;


@@ -5180,7 +5180,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.gmem_prepare = sev_gmem_prepare,
 	.gmem_invalidate = sev_gmem_invalidate,
-	.private_max_mapping_level = sev_private_max_mapping_level,
+	.gmem_max_mapping_level = sev_gmem_max_mapping_level,
 };
 
 /*


@@ -866,7 +866,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
 void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
 int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
 void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
-int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
 struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
 void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
 #else
@@ -895,7 +895,7 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in
 	return 0;
 }
 static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
-static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
 {
 	return 0;
 }


@@ -831,10 +831,11 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 	return tdx_vcpu_ioctl(vcpu, argp);
 }
 
-static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
+				     bool is_private)
 {
 	if (is_td(kvm))
-		return tdx_gmem_private_max_mapping_level(kvm, pfn);
+		return tdx_gmem_max_mapping_level(kvm, pfn, is_private);
 
 	return 0;
 }
@@ -1005,7 +1006,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
 	.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
 
-	.private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
+	.gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level)
 };
 
 struct kvm_x86_init_ops vt_init_ops __initdata = {


@@ -3318,8 +3318,11 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
 	return ret;
 }
 
-int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
 {
+	if (!is_private)
+		return 0;
+
 	return PG_LEVEL_4K;
 }


@@ -153,7 +153,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
-int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
 #endif
 
 #endif /* __KVM_X86_VMX_X86_OPS_H */


@@ -13521,6 +13521,16 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
 
+#ifdef CONFIG_KVM_GUEST_MEMFD
+/*
+ * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory
+ * (the private vs. shared tracking needs to be moved into guest_memfd).
+ */
+bool kvm_arch_supports_gmem_mmap(struct kvm *kvm)
+{
+	return !kvm_arch_has_private_mem(kvm);
+}
+
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
 int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
 {
@@ -13534,6 +13544,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
 	kvm_x86_call(gmem_invalidate)(start, end);
 }
 #endif
+#endif
 
 int kvm_spec_ctrl_test_value(u64 value)
 {


@ -52,9 +52,10 @@
/* /*
* The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally
* used in kvm, other bits are visible for userspace which are defined in * used in kvm, other bits are visible for userspace which are defined in
* include/linux/kvm_h. * include/uapi/linux/kvm.h.
*/ */
#define KVM_MEMSLOT_INVALID (1UL << 16) #define KVM_MEMSLOT_INVALID (1UL << 16)
#define KVM_MEMSLOT_GMEM_ONLY (1UL << 17)
/* /*
* Bit 63 of the memslot generation number is an "update in-progress flag", * Bit 63 of the memslot generation number is an "update in-progress flag",
@ -602,7 +603,7 @@ struct kvm_memory_slot {
short id; short id;
u16 as_id; u16 as_id;
#ifdef CONFIG_KVM_PRIVATE_MEM #ifdef CONFIG_KVM_GUEST_MEMFD
struct { struct {
/* /*
* Writes protected by kvm->slots_lock. Acquiring a * Writes protected by kvm->slots_lock. Acquiring a
@ -615,7 +616,7 @@ struct kvm_memory_slot {
#endif #endif
}; };
static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) static inline bool kvm_slot_has_gmem(const struct kvm_memory_slot *slot)
{ {
return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); return slot && (slot->flags & KVM_MEM_GUEST_MEMFD);
} }
@ -719,17 +720,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
} }
#endif #endif
/* #ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
* Arch code must define kvm_arch_has_private_mem if support for private memory
* is enabled.
*/
#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
static inline bool kvm_arch_has_private_mem(struct kvm *kvm) static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{ {
return false; return false;
} }
#endif #endif
#ifdef CONFIG_KVM_GUEST_MEMFD
bool kvm_arch_supports_gmem_mmap(struct kvm *kvm);
#endif
#ifndef kvm_arch_has_readonly_mem #ifndef kvm_arch_has_readonly_mem
static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
{ {
@ -860,7 +861,7 @@ struct kvm {
struct notifier_block pm_notifier; struct notifier_block pm_notifier;
#endif #endif
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
/* Protected by slots_locks (for writes) and RCU (for reads) */ /* Protected by slots_lock (for writes) and RCU (for reads) */
struct xarray mem_attr_array; struct xarray mem_attr_array;
#endif #endif
char stats_id[KVM_STATS_NAME_SIZE]; char stats_id[KVM_STATS_NAME_SIZE];
@ -2490,6 +2491,14 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
} }
static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
{
if (!IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
return false;
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{ {
@ -2505,8 +2514,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{ {
return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
} }
#else #else
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
@ -2515,7 +2523,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
} }
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_PRIVATE_MEM #ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, struct page **page, gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
int *max_order); int *max_order);
@ -2528,13 +2536,13 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
KVM_BUG_ON(1, kvm); KVM_BUG_ON(1, kvm);
return -EIO; return -EIO;
} }
#endif /* CONFIG_KVM_PRIVATE_MEM */ #endif /* CONFIG_KVM_GUEST_MEMFD */
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
#endif #endif
#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
/** /**
* kvm_gmem_populate() - Populate/prepare a GPA range with guest data * kvm_gmem_populate() - Populate/prepare a GPA range with guest data
* *

View File

@@ -962,6 +962,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_ARM_EL2_E2H0 241
 #define KVM_CAP_RISCV_MP_STATE_RESET 242
 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
+#define KVM_CAP_GUEST_MEMFD_MMAP 244
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
@@ -1598,6 +1599,7 @@ struct kvm_memory_attributes {
 #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
 
 #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
+#define GUEST_MEMFD_FLAG_MMAP	(1ULL << 0)
 
 struct kvm_create_guest_memfd {
 	__u64 size;
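
As a usage sketch (again not part of this diff), the new flag composes with the
existing KVM_SET_USER_MEMORY_REGION2 binding: the same guest_memfd fd that
userspace mmap()s is handed to KVM as the slot's backing store. The helper name
and the slot/gpa values below are placeholders; leaving userspace_addr at 0
mirrors the selftest further down, since guest faults are served from
guest_memfd itself.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper for illustration only; error handling omitted. */
static int bind_gmem_slot(int vm_fd, int gmem_fd, __u64 gpa, __u64 size)
{
	struct kvm_userspace_memory_region2 region = {
		.slot               = 1,	/* placeholder slot id */
		.flags              = KVM_MEM_GUEST_MEMFD,
		.guest_phys_addr    = gpa,
		.memory_size        = size,
		.userspace_addr     = 0,	/* faults come from guest_memfd */
		.guest_memfd        = gmem_fd,
		.guest_memfd_offset = 0,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}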


@@ -174,6 +174,7 @@ TEST_GEN_PROGS_arm64 += arch_timer
 TEST_GEN_PROGS_arm64 += coalesced_io_test
 TEST_GEN_PROGS_arm64 += dirty_log_perf_test
 TEST_GEN_PROGS_arm64 += get-reg-list
+TEST_GEN_PROGS_arm64 += guest_memfd_test
 TEST_GEN_PROGS_arm64 += memslot_modification_stress_test
 TEST_GEN_PROGS_arm64 += memslot_perf_test
 TEST_GEN_PROGS_arm64 += mmu_stress_test


@ -13,12 +13,16 @@
#include <linux/bitmap.h> #include <linux/bitmap.h>
#include <linux/falloc.h> #include <linux/falloc.h>
#include <linux/sizes.h>
#include <setjmp.h>
#include <signal.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include "kvm_util.h" #include "kvm_util.h"
#include "test_util.h" #include "test_util.h"
#include "ucall_common.h"
static void test_file_read_write(int fd) static void test_file_read_write(int fd)
{ {
@ -34,12 +38,83 @@ static void test_file_read_write(int fd)
"pwrite on a guest_mem fd should fail"); "pwrite on a guest_mem fd should fail");
} }
static void test_mmap(int fd, size_t page_size) static void test_mmap_supported(int fd, size_t page_size, size_t total_size)
{
const char val = 0xaa;
char *mem;
size_t i;
int ret;
mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
TEST_ASSERT(mem == MAP_FAILED, "Copy-on-write not allowed by guest_memfd.");
mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed.");
memset(mem, val, total_size);
for (i = 0; i < total_size; i++)
TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);
ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0,
page_size);
TEST_ASSERT(!ret, "fallocate the first page should succeed.");
for (i = 0; i < page_size; i++)
TEST_ASSERT_EQ(READ_ONCE(mem[i]), 0x00);
for (; i < total_size; i++)
TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);
memset(mem, val, page_size);
for (i = 0; i < total_size; i++)
TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);
ret = munmap(mem, total_size);
TEST_ASSERT(!ret, "munmap() should succeed.");
}
static sigjmp_buf jmpbuf;
void fault_sigbus_handler(int signum)
{
siglongjmp(jmpbuf, 1);
}
static void test_fault_overflow(int fd, size_t page_size, size_t total_size)
{
struct sigaction sa_old, sa_new = {
.sa_handler = fault_sigbus_handler,
};
size_t map_size = total_size * 4;
const char val = 0xaa;
char *mem;
size_t i;
int ret;
mem = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed.");
sigaction(SIGBUS, &sa_new, &sa_old);
if (sigsetjmp(jmpbuf, 1) == 0) {
memset(mem, 0xaa, map_size);
TEST_ASSERT(false, "memset() should have triggered SIGBUS.");
}
sigaction(SIGBUS, &sa_old, NULL);
for (i = 0; i < total_size; i++)
TEST_ASSERT_EQ(READ_ONCE(mem[i]), val);
ret = munmap(mem, map_size);
TEST_ASSERT(!ret, "munmap() should succeed.");
}
static void test_mmap_not_supported(int fd, size_t page_size, size_t total_size)
{ {
char *mem; char *mem;
mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
TEST_ASSERT_EQ(mem, MAP_FAILED); TEST_ASSERT_EQ(mem, MAP_FAILED);
mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
TEST_ASSERT_EQ(mem, MAP_FAILED);
} }
static void test_file_size(int fd, size_t page_size, size_t total_size) static void test_file_size(int fd, size_t page_size, size_t total_size)
@ -120,80 +195,187 @@ static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size)
} }
} }
static void test_create_guest_memfd_invalid(struct kvm_vm *vm) static void test_create_guest_memfd_invalid_sizes(struct kvm_vm *vm,
uint64_t guest_memfd_flags,
size_t page_size)
{ {
size_t page_size = getpagesize();
uint64_t flag;
size_t size; size_t size;
int fd; int fd;
for (size = 1; size < page_size; size++) { for (size = 1; size < page_size; size++) {
fd = __vm_create_guest_memfd(vm, size, 0); fd = __vm_create_guest_memfd(vm, size, guest_memfd_flags);
TEST_ASSERT(fd == -1 && errno == EINVAL, TEST_ASSERT(fd < 0 && errno == EINVAL,
"guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL", "guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL",
size); size);
} }
for (flag = BIT(0); flag; flag <<= 1) {
fd = __vm_create_guest_memfd(vm, page_size, flag);
TEST_ASSERT(fd == -1 && errno == EINVAL,
"guest_memfd() with flag '0x%lx' should fail with EINVAL",
flag);
}
} }
static void test_create_guest_memfd_multiple(struct kvm_vm *vm) static void test_create_guest_memfd_multiple(struct kvm_vm *vm)
{ {
int fd1, fd2, ret; int fd1, fd2, ret;
struct stat st1, st2; struct stat st1, st2;
size_t page_size = getpagesize();
fd1 = __vm_create_guest_memfd(vm, 4096, 0); fd1 = __vm_create_guest_memfd(vm, page_size, 0);
TEST_ASSERT(fd1 != -1, "memfd creation should succeed"); TEST_ASSERT(fd1 != -1, "memfd creation should succeed");
ret = fstat(fd1, &st1); ret = fstat(fd1, &st1);
TEST_ASSERT(ret != -1, "memfd fstat should succeed"); TEST_ASSERT(ret != -1, "memfd fstat should succeed");
TEST_ASSERT(st1.st_size == 4096, "memfd st_size should match requested size"); TEST_ASSERT(st1.st_size == page_size, "memfd st_size should match requested size");
fd2 = __vm_create_guest_memfd(vm, 8192, 0); fd2 = __vm_create_guest_memfd(vm, page_size * 2, 0);
TEST_ASSERT(fd2 != -1, "memfd creation should succeed"); TEST_ASSERT(fd2 != -1, "memfd creation should succeed");
ret = fstat(fd2, &st2); ret = fstat(fd2, &st2);
TEST_ASSERT(ret != -1, "memfd fstat should succeed"); TEST_ASSERT(ret != -1, "memfd fstat should succeed");
TEST_ASSERT(st2.st_size == 8192, "second memfd st_size should match requested size"); TEST_ASSERT(st2.st_size == page_size * 2, "second memfd st_size should match requested size");
ret = fstat(fd1, &st1); ret = fstat(fd1, &st1);
TEST_ASSERT(ret != -1, "memfd fstat should succeed"); TEST_ASSERT(ret != -1, "memfd fstat should succeed");
TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size"); TEST_ASSERT(st1.st_size == page_size, "first memfd st_size should still match requested size");
TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers"); TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers");
close(fd2); close(fd2);
close(fd1); close(fd1);
} }
int main(int argc, char *argv[]) static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags)
{ {
size_t page_size; size_t page_size = getpagesize();
size_t total_size; uint64_t flag;
int fd; int fd;
struct kvm_vm *vm;
TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); for (flag = BIT(0); flag; flag <<= 1) {
fd = __vm_create_guest_memfd(vm, page_size, flag);
if (flag & valid_flags) {
TEST_ASSERT(fd >= 0,
"guest_memfd() with flag '0x%lx' should succeed",
flag);
close(fd);
} else {
TEST_ASSERT(fd < 0 && errno == EINVAL,
"guest_memfd() with flag '0x%lx' should fail with EINVAL",
flag);
}
}
}
static void test_guest_memfd(unsigned long vm_type)
{
uint64_t flags = 0;
struct kvm_vm *vm;
size_t total_size;
size_t page_size;
int fd;
page_size = getpagesize(); page_size = getpagesize();
total_size = page_size * 4; total_size = page_size * 4;
vm = vm_create_barebones(); vm = vm_create_barebones_type(vm_type);
if (vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP))
flags |= GUEST_MEMFD_FLAG_MMAP;
test_create_guest_memfd_invalid(vm);
test_create_guest_memfd_multiple(vm); test_create_guest_memfd_multiple(vm);
test_create_guest_memfd_invalid_sizes(vm, flags, page_size);
fd = vm_create_guest_memfd(vm, total_size, 0); fd = vm_create_guest_memfd(vm, total_size, flags);
test_file_read_write(fd); test_file_read_write(fd);
test_mmap(fd, page_size);
if (flags & GUEST_MEMFD_FLAG_MMAP) {
test_mmap_supported(fd, page_size, total_size);
test_fault_overflow(fd, page_size, total_size);
} else {
test_mmap_not_supported(fd, page_size, total_size);
}
test_file_size(fd, page_size, total_size); test_file_size(fd, page_size, total_size);
test_fallocate(fd, page_size, total_size); test_fallocate(fd, page_size, total_size);
test_invalid_punch_hole(fd, page_size, total_size); test_invalid_punch_hole(fd, page_size, total_size);
test_guest_memfd_flags(vm, flags);
close(fd); close(fd);
kvm_vm_free(vm);
}
static void guest_code(uint8_t *mem, uint64_t size)
{
size_t i;
for (i = 0; i < size; i++)
__GUEST_ASSERT(mem[i] == 0xaa,
"Guest expected 0xaa at offset %lu, got 0x%x", i, mem[i]);
memset(mem, 0xff, size);
GUEST_DONE();
}
static void test_guest_memfd_guest(void)
{
/*
* Skip the first 4gb and slot0. slot0 maps <1gb and is used to back
* the guest's code, stack, and page tables, and low memory contains
* the PCI hole and other MMIO regions that need to be avoided.
*/
const uint64_t gpa = SZ_4G;
const int slot = 1;
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
uint8_t *mem;
size_t size;
int fd, i;
if (!kvm_has_cap(KVM_CAP_GUEST_MEMFD_MMAP))
return;
vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1, guest_code);
TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP),
"Default VM type should always support guest_memfd mmap()");
size = vm->page_size;
fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP);
vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0);
mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed");
memset(mem, 0xaa, size);
munmap(mem, size);
virt_pg_map(vm, gpa, gpa);
vcpu_args_set(vcpu, 2, gpa, size);
vcpu_run(vcpu);
TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed");
for (i = 0; i < size; i++)
TEST_ASSERT_EQ(mem[i], 0xff);
close(fd);
kvm_vm_free(vm);
}
int main(int argc, char *argv[])
{
unsigned long vm_types, vm_type;
TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
/*
* Not all architectures support KVM_CAP_VM_TYPES. However, those that
* support guest_memfd have that support for the default VM type.
*/
vm_types = kvm_check_cap(KVM_CAP_VM_TYPES);
if (!vm_types)
vm_types = BIT(VM_TYPE_DEFAULT);
for_each_set_bit(vm_type, &vm_types, BITS_PER_TYPE(vm_types))
test_guest_memfd(vm_type);
test_guest_memfd_guest();
} }


@@ -112,19 +112,18 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES
 	depends on KVM_GENERIC_MMU_NOTIFIER
 	bool
 
-config KVM_PRIVATE_MEM
+config KVM_GUEST_MEMFD
 	select XARRAY_MULTI
 	bool
 
-config KVM_GENERIC_PRIVATE_MEM
-	select KVM_GENERIC_MEMORY_ATTRIBUTES
-	select KVM_PRIVATE_MEM
-	bool
-
 config HAVE_KVM_ARCH_GMEM_PREPARE
 	bool
-	depends on KVM_PRIVATE_MEM
+	depends on KVM_GUEST_MEMFD
 
 config HAVE_KVM_ARCH_GMEM_INVALIDATE
 	bool
-	depends on KVM_PRIVATE_MEM
+	depends on KVM_GUEST_MEMFD
+
+config HAVE_KVM_ARCH_GMEM_POPULATE
+	bool
+	depends on KVM_GUEST_MEMFD


@@ -12,4 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
 kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
-kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o
+kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o


@ -312,7 +312,74 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
return gfn - slot->base_gfn + slot->gmem.pgoff; return gfn - slot->base_gfn + slot->gmem.pgoff;
} }
static bool kvm_gmem_supports_mmap(struct inode *inode)
{
const u64 flags = (u64)inode->i_private;
return flags & GUEST_MEMFD_FLAG_MMAP;
}
static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct folio *folio;
vm_fault_t ret = VM_FAULT_LOCKED;
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
int err = PTR_ERR(folio);
if (err == -EAGAIN)
return VM_FAULT_RETRY;
return vmf_error(err);
}
if (WARN_ON_ONCE(folio_test_large(folio))) {
ret = VM_FAULT_SIGBUS;
goto out_folio;
}
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
kvm_gmem_mark_prepared(folio);
}
vmf->page = folio_file_page(folio, vmf->pgoff);
out_folio:
if (ret != VM_FAULT_LOCKED) {
folio_unlock(folio);
folio_put(folio);
}
return ret;
}
static const struct vm_operations_struct kvm_gmem_vm_ops = {
.fault = kvm_gmem_fault_user_mapping,
};
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
if (!kvm_gmem_supports_mmap(file_inode(file)))
return -ENODEV;
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
(VM_SHARED | VM_MAYSHARE)) {
return -EINVAL;
}
vma->vm_ops = &kvm_gmem_vm_ops;
return 0;
}
static struct file_operations kvm_gmem_fops = { static struct file_operations kvm_gmem_fops = {
.mmap = kvm_gmem_mmap,
.open = generic_file_open, .open = generic_file_open,
.release = kvm_gmem_release, .release = kvm_gmem_release,
.fallocate = kvm_gmem_fallocate, .fallocate = kvm_gmem_fallocate,
@ -391,6 +458,11 @@ static const struct inode_operations kvm_gmem_iops = {
.setattr = kvm_gmem_setattr, .setattr = kvm_gmem_setattr,
}; };
bool __weak kvm_arch_supports_gmem_mmap(struct kvm *kvm)
{
return true;
}
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{ {
const char *anon_name = "[kvm-gmem]"; const char *anon_name = "[kvm-gmem]";
@ -452,6 +524,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
u64 flags = args->flags; u64 flags = args->flags;
u64 valid_flags = 0; u64 valid_flags = 0;
if (kvm_arch_supports_gmem_mmap(kvm))
valid_flags |= GUEST_MEMFD_FLAG_MMAP;
if (flags & ~valid_flags) if (flags & ~valid_flags)
return -EINVAL; return -EINVAL;
@ -508,6 +583,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
*/ */
WRITE_ONCE(slot->gmem.file, file); WRITE_ONCE(slot->gmem.file, file);
slot->gmem.pgoff = start; slot->gmem.pgoff = start;
if (kvm_gmem_supports_mmap(inode))
slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
filemap_invalidate_unlock(inode->i_mapping); filemap_invalidate_unlock(inode->i_mapping);
@ -627,7 +704,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
} }
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque) kvm_gmem_populate_cb post_populate, void *opaque)
{ {
@ -643,7 +720,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
return -EINVAL; return -EINVAL;
slot = gfn_to_memslot(kvm, start_gfn); slot = gfn_to_memslot(kvm, start_gfn);
if (!kvm_slot_can_be_private(slot)) if (!kvm_slot_has_gmem(slot))
return -EINVAL; return -EINVAL;
file = kvm_gmem_get_file(slot); file = kvm_gmem_get_file(slot);


@@ -331,7 +331,7 @@ void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
 	 * All current use cases for flushing the TLBs for a specific memslot
 	 * are related to dirty logging, and many do the TLB flush out of
 	 * mmu_lock. The interaction between the various operations on memslot
-	 * must be serialized by slots_locks to ensure the TLB flush from one
+	 * must be serialized by slots_lock to ensure the TLB flush from one
 	 * operation is observed by any other operation on the same memslot.
 	 */
 	lockdep_assert_held(&kvm->slots_lock);
@@ -1588,7 +1588,7 @@ static int check_memory_region_flags(struct kvm *kvm,
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
-	if (kvm_arch_has_private_mem(kvm))
+	if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
 		valid_flags |= KVM_MEM_GUEST_MEMFD;
 
 	/* Dirty logging private memory is not currently supported. */
@@ -4915,9 +4915,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_MEMORY_ATTRIBUTES:
 		return kvm_supported_mem_attributes(kvm);
 #endif
-#ifdef CONFIG_KVM_PRIVATE_MEM
+#ifdef CONFIG_KVM_GUEST_MEMFD
 	case KVM_CAP_GUEST_MEMFD:
-		return !kvm || kvm_arch_has_private_mem(kvm);
+		return 1;
+	case KVM_CAP_GUEST_MEMFD_MMAP:
+		return !kvm || kvm_arch_supports_gmem_mmap(kvm);
 #endif
 	default:
 		break;
@@ -5352,7 +5354,7 @@ static long kvm_vm_ioctl(struct file *filp,
 	case KVM_GET_STATS_FD:
 		r = kvm_vm_ioctl_get_stats_fd(kvm);
 		break;
-#ifdef CONFIG_KVM_PRIVATE_MEM
+#ifdef CONFIG_KVM_GUEST_MEMFD
 	case KVM_CREATE_GUEST_MEMFD: {
 		struct kvm_create_guest_memfd guest_memfd;


@@ -67,7 +67,7 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 }
 #endif /* HAVE_KVM_PFNCACHE */
 
-#ifdef CONFIG_KVM_PRIVATE_MEM
+#ifdef CONFIG_KVM_GUEST_MEMFD
 void kvm_gmem_init(struct module *module);
 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -91,6 +91,6 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
 {
 	WARN_ON_ONCE(1);
 }
-#endif /* CONFIG_KVM_PRIVATE_MEM */
+#endif /* CONFIG_KVM_GUEST_MEMFD */
 
 #endif /* __KVM_MM_H__ */