summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2026-02-09 19:08:17 +0100
committerPaolo Bonzini <pbonzini@redhat.com>2026-02-11 12:45:12 -0500
commit9123c5f956b1fbedd63821eb528ece55ddd0e49c (patch)
treeefdc67b3e97c7df743b18d5d80182ebd8d101378
parent54f15ebfc61ee8499a97f2dbfc18b1b13fdcb524 (diff)
parent2a62345b30529e488beb6a1220577b3495933724 (diff)
Merge tag 'kvm-x86-gmem-6.20' of https://github.com/kvm-x86/linux into HEAD
KVM guest_memfd changes for 6.20 - Remove kvm_gmem_populate()'s preparation tracking and half-baked hugepage handling, and instead rely on SNP (the only user of the tracking) to do its own tracking via the RMP. - Retroactively document and enforce (for SNP) that KVM_SEV_SNP_LAUNCH_UPDATE and KVM_TDX_INIT_MEM_REGION require the source page to be 4KiB aligned, to avoid non-trivial complexity for a non-existent usecase (and because in-place conversion simply can't support unaligned sources). - When populating guest_memfd memory, GUP the source page in common code and pass the refcounted page to the vendor callback, instead of letting vendor code do the heavy lifting. Doing so avoids a looming deadlock bug with in-place due an AB-BA conflict betwee mmap_lock and guest_memfd's filemap invalidate lock.
-rw-r--r--Documentation/virt/kvm/x86/amd-memory-encryption.rst2
-rw-r--r--Documentation/virt/kvm/x86/intel-tdx.rst2
-rw-r--r--arch/x86/kvm/svm/sev.c108
-rw-r--r--arch/x86/kvm/vmx/tdx.c16
-rw-r--r--include/linux/kvm_host.h4
-rw-r--r--virt/kvm/guest_memfd.c139
6 files changed, 130 insertions, 141 deletions
diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
index 543b5e5dd8d4..b2395dd4769d 100644
--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
@@ -523,7 +523,7 @@ Returns: 0 on success, < 0 on error, -EAGAIN if caller should retry
struct kvm_sev_snp_launch_update {
__u64 gfn_start; /* Guest page number to load/encrypt data into. */
- __u64 uaddr; /* Userspace address of data to be loaded/encrypted. */
+ __u64 uaddr; /* 4k-aligned address of data to be loaded/encrypted. */
__u64 len; /* 4k-aligned length in bytes to copy into guest memory.*/
__u8 type; /* The type of the guest pages being initialized. */
__u8 pad0;
diff --git a/Documentation/virt/kvm/x86/intel-tdx.rst b/Documentation/virt/kvm/x86/intel-tdx.rst
index 5efac62c92c7..6a222e9d0954 100644
--- a/Documentation/virt/kvm/x86/intel-tdx.rst
+++ b/Documentation/virt/kvm/x86/intel-tdx.rst
@@ -156,7 +156,7 @@ KVM_TDX_INIT_MEM_REGION
:Returns: 0 on success, <0 on error
Initialize @nr_pages TDX guest private memory starting from @gpa with userspace
-provided data from @source_addr.
+provided data from @source_addr. @source_addr must be PAGE_SIZE-aligned.
Note, before calling this sub command, memory attribute of the range
[gpa, gpa + nr_pages] needs to be private. Userspace can use
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f9aad5c1447e..ea515cf41168 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2277,66 +2277,52 @@ struct sev_gmem_populate_args {
int fw_error;
};
-static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
- void __user *src, int order, void *opaque)
+static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+ struct page *src_page, void *opaque)
{
struct sev_gmem_populate_args *sev_populate_args = opaque;
+ struct sev_data_snp_launch_update fw_args = {0};
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
- int n_private = 0, ret, i;
- int npages = (1 << order);
- gfn_t gfn;
+ bool assigned = false;
+ int level;
+ int ret;
- if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+ if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page))
return -EINVAL;
- for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
- struct sev_data_snp_launch_update fw_args = {0};
- bool assigned = false;
- int level;
-
- ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
- if (ret || assigned) {
- pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
- __func__, gfn, ret, assigned);
- ret = ret ? -EINVAL : -EEXIST;
- goto err;
- }
-
- if (src) {
- void *vaddr = kmap_local_pfn(pfn + i);
-
- if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
- ret = -EFAULT;
- goto err;
- }
- kunmap_local(vaddr);
- }
-
- ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
- sev_get_asid(kvm), true);
- if (ret)
- goto err;
+ ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level);
+ if (ret || assigned) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+ __func__, gfn, ret, assigned);
+ ret = ret ? -EINVAL : -EEXIST;
+ goto out;
+ }
- n_private++;
+ if (src_page) {
+ void *src_vaddr = kmap_local_page(src_page);
+ void *dst_vaddr = kmap_local_pfn(pfn);
- fw_args.gctx_paddr = __psp_pa(sev->snp_context);
- fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
- fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
- fw_args.page_type = sev_populate_args->type;
+ memcpy(dst_vaddr, src_vaddr, PAGE_SIZE);
- ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
- &fw_args, &sev_populate_args->fw_error);
- if (ret)
- goto fw_err;
+ kunmap_local(src_vaddr);
+ kunmap_local(dst_vaddr);
}
- return 0;
+ ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K,
+ sev_get_asid(kvm), true);
+ if (ret)
+ goto out;
-fw_err:
+ fw_args.gctx_paddr = __psp_pa(sev->snp_context);
+ fw_args.address = __sme_set(pfn_to_hpa(pfn));
+ fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+ fw_args.page_type = sev_populate_args->type;
+
+ ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &fw_args, &sev_populate_args->fw_error);
/*
* If the firmware command failed handle the reclaim and cleanup of that
- * PFN specially vs. prior pages which can be cleaned up below without
- * needing to reclaim in advance.
+ * PFN before reporting an error.
*
* Additionally, when invalid CPUID function entries are detected,
* firmware writes the expected values into the page and leaves it
@@ -2346,26 +2332,22 @@ fw_err:
* information to provide information on which CPUID leaves/fields
* failed CPUID validation.
*/
- if (!snp_page_reclaim(kvm, pfn + i) &&
+ if (ret && !snp_page_reclaim(kvm, pfn) &&
sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
- void *vaddr = kmap_local_pfn(pfn + i);
+ void *src_vaddr = kmap_local_page(src_page);
+ void *dst_vaddr = kmap_local_pfn(pfn);
- if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
- pr_debug("Failed to write CPUID page back to userspace\n");
+ memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);
- kunmap_local(vaddr);
+ kunmap_local(src_vaddr);
+ kunmap_local(dst_vaddr);
}
- /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
- n_private--;
-
-err:
- pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
- __func__, ret, sev_populate_args->fw_error, n_private);
- for (i = 0; i < n_private; i++)
- kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
-
+out:
+ if (ret)
+ pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n",
+ __func__, gfn, ret, sev_populate_args->fw_error);
return ret;
}
@@ -2396,6 +2378,11 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
return -EINVAL;
+ src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
+
+ if (!PAGE_ALIGNED(src))
+ return -EINVAL;
+
npages = params.len / PAGE_SIZE;
/*
@@ -2427,7 +2414,6 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
sev_populate_args.sev_fd = argp->sev_fd;
sev_populate_args.type = params.type;
- src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
sev_gmem_post_populate, &sev_populate_args);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 2d7a4d52ccfb..5df9d32d2058 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3118,34 +3118,24 @@ struct tdx_gmem_post_populate_arg {
};
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- void __user *src, int order, void *_arg)
+ struct page *src_page, void *_arg)
{
struct tdx_gmem_post_populate_arg *arg = _arg;
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
u64 err, entry, level_state;
gpa_t gpa = gfn_to_gpa(gfn);
- struct page *src_page;
int ret, i;
if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
return -EIO;
- /*
- * Get the source page if it has been faulted in. Return failure if the
- * source page has been swapped out or unmapped in primary memory.
- */
- ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
- if (ret < 0)
- return ret;
- if (ret != 1)
- return -ENOMEM;
+ if (!src_page)
+ return -EOPNOTSUPP;
kvm_tdx->page_add_src = src_page;
ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
kvm_tdx->page_add_src = NULL;
- put_page(src_page);
-
if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
return ret;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d93f75b05ae2..49c0cfe24fd8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2566,7 +2566,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
* @gfn: starting GFN to be populated
* @src: userspace-provided buffer containing data to copy into GFN range
* (passed to @post_populate, and incremented on each iteration
- * if not NULL)
+ * if not NULL). Must be page-aligned.
* @npages: number of pages to copy from userspace-buffer
* @post_populate: callback to issue for each gmem page that backs the GPA
* range
@@ -2581,7 +2581,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
* Returns the number of pages that were populated.
*/
typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- void __user *src, int order, void *opaque);
+ struct page *page, void *opaque);
long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque);
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fdaea3422c30..923c51a3a525 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -76,11 +76,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}
-static inline void kvm_gmem_mark_prepared(struct folio *folio)
-{
- folio_mark_uptodate(folio);
-}
-
/*
* Process @folio, which contains @gfn, so that the guest can use it.
* The folio must be locked and the gfn must be contained in @slot.
@@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, struct folio *folio)
{
- unsigned long nr_pages, i;
pgoff_t index;
- int r;
-
- nr_pages = folio_nr_pages(folio);
- for (i = 0; i < nr_pages; i++)
- clear_highpage(folio_page(folio, i));
/*
* Preparing huge folios should always be safe, since it should
@@ -114,11 +103,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
index = kvm_gmem_get_index(slot, gfn);
index = ALIGN_DOWN(index, folio_nr_pages(folio));
- r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
- if (!r)
- kvm_gmem_mark_prepared(folio);
- return r;
+ return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}
/*
@@ -151,6 +137,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
mapping_gfp_mask(inode->i_mapping), policy);
mpol_cond_put(policy);
+ /*
+ * External interfaces like kvm_gmem_get_pfn() support dealing
+ * with hugepages to a degree, but internally, guest_memfd currently
+ * assumes that all folios are order-0 and handling would need
+ * to be updated for anything otherwise (e.g. page-clearing
+ * operations).
+ */
+ WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));
+
return folio;
}
@@ -420,7 +415,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
- kvm_gmem_mark_prepared(folio);
+ folio_mark_uptodate(folio);
}
vmf->page = folio_file_page(folio, vmf->pgoff);
@@ -757,7 +752,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
static struct folio *__kvm_gmem_get_pfn(struct file *file,
struct kvm_memory_slot *slot,
pgoff_t index, kvm_pfn_t *pfn,
- bool *is_prepared, int *max_order)
+ int *max_order)
{
struct file *slot_file = READ_ONCE(slot->gmem.file);
struct gmem_file *f = file->private_data;
@@ -787,7 +782,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
if (max_order)
*max_order = 0;
- *is_prepared = folio_test_uptodate(folio);
return folio;
}
@@ -797,19 +791,22 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
{
pgoff_t index = kvm_gmem_get_index(slot, gfn);
struct folio *folio;
- bool is_prepared = false;
int r = 0;
CLASS(gmem_get_file, file)(slot);
if (!file)
return -EFAULT;
- folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
+ folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (!is_prepared)
- r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+ if (!folio_test_uptodate(folio)) {
+ clear_highpage(folio_page(folio, 0));
+ folio_mark_uptodate(folio);
+ }
+
+ r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
folio_unlock(folio);
@@ -823,13 +820,49 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
+
+static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
+ struct file *file, gfn_t gfn, struct page *src_page,
+ kvm_gmem_populate_cb post_populate, void *opaque)
+{
+ pgoff_t index = kvm_gmem_get_index(slot, gfn);
+ struct folio *folio;
+ kvm_pfn_t pfn;
+ int ret;
+
+ filemap_invalidate_lock(file->f_mapping);
+
+ folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ goto out_unlock;
+ }
+
+ folio_unlock(folio);
+
+ if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+ ret = -EINVAL;
+ goto out_put_folio;
+ }
+
+ ret = post_populate(kvm, gfn, pfn, src_page, opaque);
+ if (!ret)
+ folio_mark_uptodate(folio);
+
+out_put_folio:
+ folio_put(folio);
+out_unlock:
+ filemap_invalidate_unlock(file->f_mapping);
+ return ret;
+}
+
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque)
{
struct kvm_memory_slot *slot;
- void __user *p;
-
- int ret = 0, max_order;
+ int ret = 0;
long i;
lockdep_assert_held(&kvm->slots_lock);
@@ -837,6 +870,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
if (WARN_ON_ONCE(npages <= 0))
return -EINVAL;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
+ return -EINVAL;
+
slot = gfn_to_memslot(kvm, start_gfn);
if (!kvm_slot_has_gmem(slot))
return -EINVAL;
@@ -845,60 +881,37 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
if (!file)
return -EFAULT;
- filemap_invalidate_lock(file->f_mapping);
-
npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
- for (i = 0; i < npages; i += (1 << max_order)) {
- struct folio *folio;
- gfn_t gfn = start_gfn + i;
- pgoff_t index = kvm_gmem_get_index(slot, gfn);
- bool is_prepared = false;
- kvm_pfn_t pfn;
+ for (i = 0; i < npages; i++) {
+ struct page *src_page = NULL;
if (signal_pending(current)) {
ret = -EINTR;
break;
}
- folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
- if (IS_ERR(folio)) {
- ret = PTR_ERR(folio);
- break;
- }
+ if (src) {
+ unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
- if (is_prepared) {
- folio_unlock(folio);
- folio_put(folio);
- ret = -EEXIST;
- break;
+ ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
+ if (ret < 0)
+ break;
+ if (ret != 1) {
+ ret = -ENOMEM;
+ break;
+ }
}
- folio_unlock(folio);
- WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
- (npages - i) < (1 << max_order));
+ ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
+ post_populate, opaque);
- ret = -EINVAL;
- while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
- KVM_MEMORY_ATTRIBUTE_PRIVATE,
- KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
- if (!max_order)
- goto put_folio_and_exit;
- max_order--;
- }
-
- p = src ? src + i * PAGE_SIZE : NULL;
- ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
- if (!ret)
- kvm_gmem_mark_prepared(folio);
+ if (src_page)
+ put_page(src_page);
-put_folio_and_exit:
- folio_put(folio);
if (ret)
break;
}
- filemap_invalidate_unlock(file->f_mapping);
-
return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);