[ based on kvm/next ]
Implement guest_memfd population via the write syscall. This is useful in non-CoCo use cases where the host can access guest memory. Even though the same can also be achieved via userspace mapping and memcpying from userspace, write provides a more performant option because it does not need to set page tables and it does not cause a page fault for every page like memcpy would. Note that memcpy cannot be accelerated via MADV_POPULATE_WRITE as it is not supported by guest_memfd and relies on GUP.
Populating 512MiB of guest_memfd on a x86 machine: - via memcpy: 436 ms - via write: 202 ms (-54%)
The write syscall support is conditional on kvm_gmem_supports_mmap. When in-place shared/private conversion is supported, write should only be allowed on shared pages.
v6: - Make write support conditional on mmap support instead of relying on the up-to-date flag to decide whether writing to a page is allowed - James: Remove depenendencies on folio_test_large - James: Remove page alignment restriction - James: Formatting fixes
v5: - https://lore.kernel.org/kvm/20250902111951.58315-1-kalyazin@amazon.com/ - Replace the call to the unexported filemap_remove_folio with zeroing the bytes that could not be copied - Fix checkpatch findings
v4: - https://lore.kernel.org/kvm/20250828153049.3922-1-kalyazin@amazon.com - Switch from implementing the write callback to write_iter - Remove conditional compilation
v3: - https://lore.kernel.org/kvm/20250303130838.28812-1-kalyazin@amazon.com - David/Mike D: Only compile support for the write syscall if CONFIG_KVM_GMEM_SHARED_MEM (now gone) is enabled. v2: - https://lore.kernel.org/kvm/20241129123929.64790-1-kalyazin@amazon.com - Switch from an ioctl to the write syscall to implement population
v1: - https://lore.kernel.org/kvm/20241024095429.54052-1-kalyazin@amazon.com
Nikita Kalyazin (2): KVM: guest_memfd: add generic population via write KVM: selftests: update guest_memfd write tests
.../testing/selftests/kvm/guest_memfd_test.c | 51 ++++++++++++++++--- virt/kvm/guest_memfd.c | 49 ++++++++++++++++++ 2 files changed, 94 insertions(+), 6 deletions(-)
base-commit: 6b36119b94d0b2bb8cea9d512017efafd461d6ac
From: Nikita Kalyazin kalyazin@amazon.com
write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. If the request is not page-aligned, the remaining bytes are initialised to 0.
write is only supported for non-CoCo setups where guest memory is not hardware-encrypted.
Signed-off-by: Nikita Kalyazin kalyazin@amazon.com --- virt/kvm/guest_memfd.c | 48 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 94bafd6c558c..f4e218049afa 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -380,6 +380,8 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
static struct file_operations kvm_gmem_fops = { .mmap = kvm_gmem_mmap, + .llseek = default_llseek, + .write_iter = generic_perform_write, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -390,6 +392,49 @@ void kvm_gmem_init(struct module *module) kvm_gmem_fops.owner = module; }
+static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb, + struct address_space *mapping, + loff_t pos, unsigned int len, + struct folio **foliop, + void **fsdata) +{ + struct file *file = kiocb->ki_filp; + struct inode *inode = file_inode(file); + pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; + + if (!kvm_gmem_supports_mmap(inode)) + return -ENODEV; + + if (pos + len > i_size_read(inode)) + return -EINVAL; + + folio = kvm_gmem_get_folio(inode, index); + if (IS_ERR(folio)) + return -EFAULT; + + *foliop = folio; + return 0; +} + +static int kvm_kmem_gmem_write_end(const struct kiocb *kiocb, + struct address_space *mapping, + loff_t pos, unsigned int len, + unsigned int copied, + struct folio *folio, void *fsdata) +{ + if (copied && copied < len) { + unsigned int from = pos & ((1UL << folio_order(folio)) - 1); + + folio_zero_range(folio, from + copied, len - copied); + } + + folio_unlock(folio); + folio_put(folio); + + return copied; +} + static int kvm_gmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -442,6 +487,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, + .write_begin = kvm_kmem_gmem_write_begin, + .write_end = kvm_kmem_gmem_write_end, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE @@ -489,6 +536,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) }
file->f_flags |= O_LARGEFILE; + file->f_mode |= FMODE_LSEEK | FMODE_PWRITE;
inode = file->f_inode; WARN_ON(file->f_mapping != inode->i_mapping);
On Mon, Oct 20, 2025, Nikita Kalyazin wrote:
From: Nikita Kalyazin kalyazin@amazon.com
write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. If the request is not page-aligned, the remaining bytes are initialised to 0.
write is only supported for non-CoCo setups where guest memory is not hardware-encrypted.
Please include all of the "why". The code mostly communicates the "what", but it doesn't capture why write() support is at all interesting, nor does it explain why read() isn't supported.
Signed-off-by: Nikita Kalyazin kalyazin@amazon.com
virt/kvm/guest_memfd.c | 48 ++++++++++++++++++++++++++++++++++++++++++
There's a notable lack of uAPI and Documentation chanegs. I.e. this needs a GUEST_MEMFD_FLAG_xxx along with proper documentation.
And while it's definitely it's a-ok to land .write() in advance of the direct map changes, we do need to at least map out how we want the two to interact, e.g. so that we don't end up with constraints that are impossible to satisfy.
1 file changed, 48 insertions(+)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 94bafd6c558c..f4e218049afa 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -380,6 +380,8 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) static struct file_operations kvm_gmem_fops = { .mmap = kvm_gmem_mmap,
- .llseek = default_llseek,
- .write_iter = generic_perform_write, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate,
@@ -390,6 +392,49 @@ void kvm_gmem_init(struct module *module) kvm_gmem_fops.owner = module; } +static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb,
struct address_space *mapping,loff_t pos, unsigned int len,struct folio **foliop,void **fsdata)
Over-aggressive wrapping, this can be
static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **folio, void **fsdata)
or
static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **folio, void **fsdata)
if we want to bundle pos+len.
+{
- struct file *file = kiocb->ki_filp;
ki_filp is already a file, and even if it were a "void *", there's no need for a local variable.
- struct inode *inode = file_inode(file);
- pgoff_t index = pos >> PAGE_SHIFT;
- struct folio *folio;
- if (!kvm_gmem_supports_mmap(inode))
Checking for MMAP is neither sufficient nor strictly necessary. MMAP doesn't imply SHARED, and it's not clear to me that mmap() support should be in any way tied to WRITE support.
return -ENODEV;- if (pos + len > i_size_read(inode))
return -EINVAL;- folio = kvm_gmem_get_folio(inode, index);
Eh, since "index" is only used once, my vote is to use "pos" and do the shift here, so that it's obvious that the input to kvm_gmem_get_folio() is being checked.
- if (IS_ERR(folio))
return -EFAULT;
Why EFAULT?
- *foliop = folio;
There shouldn't be any need for a local "folio". What about having the "out" param be just "folio"?
E.g.
static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **folio, void **fsdata) { struct inode *inode = file_inode(kiocb->ki_filp);
if (!kvm_gmem_supports_write(inode)) return -ENODEV;
if (pos + len > i_size_read(inode)) return -EINVAL;
*folio = kvm_gmem_get_folio(inode, pos >> PAGE_SHIFT); if (IS_ERR(*folio)) return PTR_ERR(*folio);
return 0; }
- return 0;
+}
+static int kvm_kmem_gmem_write_end(const struct kiocb *kiocb,
struct address_space *mapping,loff_t pos, unsigned int len,unsigned int copied,struct folio *folio, void *fsdata)+{
- if (copied && copied < len) {
Why check if "copied" is non-zero? I don't see why KVM should behave differently with respect to unwritten bytes if copy_folio_from_iter_atomic() fails on the first byte or the Nth byte.
unsigned int from = pos & ((1UL << folio_order(folio)) - 1);
Uh, isn't this just offset_in_folio()?
folio_zero_range(folio, from + copied, len - copied);
I'd probably be in favor of omitting "from" entirely, e.g.
if (copied < len) folio_zero_range(folio, offset_in_folio(pos) + copied, len - copied);
- }
- folio_unlock(folio);
- folio_put(folio);
- return copied;
+}
On 23/10/2025 17:07, Sean Christopherson wrote:
On Mon, Oct 20, 2025, Nikita Kalyazin wrote:
From: Nikita Kalyazin kalyazin@amazon.com
+ Vishal and Ackerley
write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. If the request is not page-aligned, the remaining bytes are initialised to 0.
write is only supported for non-CoCo setups where guest memory is not hardware-encrypted.
Please include all of the "why". The code mostly communicates the "what", but it doesn't capture why write() support is at all interesting, nor does it explain why read() isn't supported.
Hi Sean,
Thanks for the review.
Do you think including the explanation from the cover letter would be sufficient? Shall I additionally say that read() isn't supported because there is no use case for it as of now or would it be obvious?
Signed-off-by: Nikita Kalyazin kalyazin@amazon.com
virt/kvm/guest_memfd.c | 48 ++++++++++++++++++++++++++++++++++++++++++
There's a notable lack of uAPI and Documentation chanegs. I.e. this needs a GUEST_MEMFD_FLAG_xxx along with proper documentation.
Would the following be ok in the doc?
When the capability KVM_CAP_GUEST_MEMFD_WRITE is supported, the 'flags' field supports GUEST_MEMFD_FLAG_WRITE. Setting this flag on guest_memfd creation enables write() syscall operations to populate guest_memfd memory from host userspace.
When a write() operation is performed on a guest_memfd file descriptor with the GUEST_MEMFD_FLAG_WRITE set, the syscall will populate the guest memory with user-supplied data in a generic way, without any vendor-specific preparation. The write operation is only supported for non-CoCo (Confidential Computing) setups where guest memory is not hardware-encrypted. If the write request is not page-aligned, any remaining bytes within the page are initialized to zero.
And while it's definitely it's a-ok to land .write() in advance of the direct map changes, we do need to at least map out how we want the two to interact, e.g. so that we don't end up with constraints that are impossible to satisfy.
write() shall not attempt to access a page that is not in the direct map, which I believe can be achieved via kvm_kmem_gmem_write_begin() consulting the KVM_GMEM_FOLIO_NO_DIRECT_MAP in folio->private (introduced in [1]).
Do you think we should mention it in the commit message in some way? What particular constraint are you cautious about?
[1] https://lore.kernel.org/kvm/20250924152214.7292-2-roypat@amazon.co.uk/
1 file changed, 48 insertions(+)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 94bafd6c558c..f4e218049afa 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -380,6 +380,8 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
static struct file_operations kvm_gmem_fops = { .mmap = kvm_gmem_mmap,
.llseek = default_llseek,.write_iter = generic_perform_write, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate,@@ -390,6 +392,49 @@ void kvm_gmem_init(struct module *module) kvm_gmem_fops.owner = module; }
+static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb,
struct address_space *mapping,loff_t pos, unsigned int len,struct folio **foliop,void **fsdata)Over-aggressive wrapping, this can be
static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **folio, void **fsdata)
or
static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **folio, void **fsdata)
if we want to bundle pos+len.
Ack.
+{
struct file *file = kiocb->ki_filp;ki_filp is already a file, and even if it were a "void *", there's no need for a local variable.
Ack.
struct inode *inode = file_inode(file);pgoff_t index = pos >> PAGE_SHIFT;struct folio *folio;if (!kvm_gmem_supports_mmap(inode))Checking for MMAP is neither sufficient nor strictly necessary. MMAP doesn't imply SHARED, and it's not clear to me that mmap() support should be in any way tied to WRITE support.
As in my reply to the comment about doc, I plan to introduce KVM_CAP_GUEST_MEMFD_WRITE and GUEST_MEMFD_FLAG_WRITE. The kvm_arch_supports_gmem_write() will be a weak symbol and relying on !kvm_arch_has_private_mem() on x86, similar to kvm_arch_supports_gmem_mmap(). Does it look right?
return -ENODEV;if (pos + len > i_size_read(inode))return -EINVAL;folio = kvm_gmem_get_folio(inode, index);Eh, since "index" is only used once, my vote is to use "pos" and do the shift here, so that it's obvious that the input to kvm_gmem_get_folio() is being checked.
Ack.
if (IS_ERR(folio))return -EFAULT;Why EFAULT?
Will propagate the error like you suggest below.
*foliop = folio;There shouldn't be any need for a local "folio". What about having the "out" param be just "folio"?
E.g.
static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **folio, void **fsdata) { struct inode *inode = file_inode(kiocb->ki_filp);
if (!kvm_gmem_supports_write(inode)) return -ENODEV; if (pos + len > i_size_read(inode)) return -EINVAL; *folio = kvm_gmem_get_folio(inode, pos >> PAGE_SHIFT); if (IS_ERR(*folio)) return PTR_ERR(*folio); return 0;}
Ack.
return 0;+}
+static int kvm_kmem_gmem_write_end(const struct kiocb *kiocb,
struct address_space *mapping,loff_t pos, unsigned int len,unsigned int copied,struct folio *folio, void *fsdata)+{
if (copied && copied < len) {Why check if "copied" is non-zero? I don't see why KVM should behave differently with respect to unwritten bytes if copy_folio_from_iter_atomic() fails on the first byte or the Nth byte.
No, I don't think there is a need for this check indeed. It looks like a leftover from my previous changes.
unsigned int from = pos & ((1UL << folio_order(folio)) - 1);Uh, isn't this just offset_in_folio()?
folio_zero_range(folio, from + copied, len - copied);I'd probably be in favor of omitting "from" entirely, e.g.
if (copied < len) folio_zero_range(folio, offset_in_folio(pos) + copied, len - copied);
Ack.
}folio_unlock(folio);folio_put(folio);return copied;+}
On Fri, Oct 24, 2025, Nikita Kalyazin wrote:
On 23/10/2025 17:07, Sean Christopherson wrote:
On Mon, Oct 20, 2025, Nikita Kalyazin wrote:
From: Nikita Kalyazin kalyazin@amazon.com
- Vishal and Ackerley
write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. If the request is not page-aligned, the remaining bytes are initialised to 0.
write is only supported for non-CoCo setups where guest memory is not hardware-encrypted.
Please include all of the "why". The code mostly communicates the "what", but it doesn't capture why write() support is at all interesting, nor does it explain why read() isn't supported.
Hi Sean,
Thanks for the review.
Do you think including the explanation from the cover letter would be sufficient?
It's pretty close. A few more details would be helpful, e.g. to explain that VMMs may use write() to populate the initial image
Shall I additionally say that read() isn't supported because there is no use case for it as of now or would it be obvious?
Hmm, I think if you want to exclude read() support, the changelog should explicitly state why. E.g. "there's no use case" is quite different from "deliberately don't support read() for security reasons".
Signed-off-by: Nikitia Kalyazin kalyazin@amazon.com
virt/kvm/guest_memfd.c | 48 ++++++++++++++++++++++++++++++++++++++++++
There's a notable lack of uAPI and Documentation chanegs. I.e. this needs a GUEST_MEMFD_FLAG_xxx along with proper documentation.
Would the following be ok in the doc?
When the capability KVM_CAP_GUEST_MEMFD_WRITE is supported, the 'flags'
No capability is necessary, see d2042d8f96dd ("KVM: Rework KVM_CAP_GUEST_MEMFD_MMAP into KVM_CAP_GUEST_MEMFD_FLAGS").
field supports GUEST_MEMFD_FLAG_WRITE. Setting this flag on guest_memfd creation enables write() syscall operations to populate guest_memfd memory from host userspace.
When a write() operation is performed on a guest_memfd file descriptor with the GUEST_MEMFD_FLAG_WRITE set, the syscall will populate the guest memory with user-supplied data in a generic way, without any vendor-specific preparation. The write operation is only supported for non-CoCo (Confidential Computing) setups where guest memory is not hardware-encrypted.
The restriction should be that guest memory must be SHARED, i.e. not PRIVATE. Strictly speaking, guest memory can be encrypted, e.g. with SME and TME (I think TME is still a thing?), but with a shared key and thus accessible from the host.
Even if that weren't the case, we want to support this for CoCo VMs.
If the write request is not page-aligned, any remaining bytes within the page are initialized to zero.
Why? (Honest question, e.g. is that standard file semantics?)
And while it's definitely it's a-ok to land .write() in advance of the direct map changes, we do need to at least map out how we want the two to interact, e.g. so that we don't end up with constraints that are impossible to satisfy.
write() shall not attempt to access a page that is not in the direct map, which I believe can be achieved via kvm_kmem_gmem_write_begin() consulting the KVM_GMEM_FOLIO_NO_DIRECT_MAP in folio->private (introduced in [1]).
Do you think we should mention it in the commit message in some way? What particular constraint are you cautious about?
I want to be cautious with respect to the ABI/uAPI. Patrick's series also adds a flag, and guest_memfd doesn't currently provide a way to toggle flags after the file is created. That begs the question of how GUEST_MEMFD_FLAG_NO_DIRECT_MAP will co-exist with GUEST_MEMFD_FLAG_WRITE. Presumably the goal is to use write() to initialize memory, and _then_ nuke the direct map.
I want line of sight to understanding the exact semantics/flows. E.g. will KVM require userspace to clear GUEST_MEMFD_FLAG_WRITE before allowing NO_DIRECT_MAP? Or will the write() simply fail? How will the sequencing be achieved?
struct inode *inode = file_inode(file);pgoff_t index = pos >> PAGE_SHIFT;struct folio *folio;if (!kvm_gmem_supports_mmap(inode))Checking for MMAP is neither sufficient nor strictly necessary. MMAP doesn't imply SHARED, and it's not clear to me that mmap() support should be in any way tied to WRITE support.
As in my reply to the comment about doc, I plan to introduce KVM_CAP_GUEST_MEMFD_WRITE and GUEST_MEMFD_FLAG_WRITE. The kvm_arch_supports_gmem_write() will be a weak symbol and relying on !kvm_arch_has_private_mem() on x86, similar to kvm_arch_supports_gmem_mmap(). Does it look right?
No. As above, write() should be allowed iff memory is SHARED. Relevant commits that are now in Linus' tree:
44c6cb9fe9888b371e31165b2854bd0f4e2787d4 KVM: guest_memfd: Allow mmap() on guest_memfd for x86 VMs with private memory 9aef71c892a55e004419923ba7129abe3e58d9f1 KVM: Explicitly mark KVM_GUEST_MEMFD as depending on KVM_GENERIC_MMU_NOTIFIER 5d3341d684be80892d8f6f9812f90f9274b81177 KVM: guest_memfd: Invalidate SHARED GPAs if gmem supports INIT_SHARED
On 30/10/2025 21:37, Sean Christopherson wrote:
On Fri, Oct 24, 2025, Nikita Kalyazin wrote:
On 23/10/2025 17:07, Sean Christopherson wrote:
On Mon, Oct 20, 2025, Nikita Kalyazin wrote:
From: Nikita Kalyazin kalyazin@amazon.com
- Vishal and Ackerley
write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. If the request is not page-aligned, the remaining bytes are initialised to 0.
write is only supported for non-CoCo setups where guest memory is not hardware-encrypted.
Please include all of the "why". The code mostly communicates the "what", but it doesn't capture why write() support is at all interesting, nor does it explain why read() isn't supported.
Hi Sean,
Thanks for the review.
Do you think including the explanation from the cover letter would be sufficient?
It's pretty close. A few more details would be helpful, e.g. to explain that VMMs may use write() to populate the initial image
Ack.
Shall I additionally say that read() isn't supported because there is no use case for it as of now or would it be obvious?
Hmm, I think if you want to exclude read() support, the changelog should explicitly state why. E.g. "there's no use case" is quite different from "deliberately don't support read() for security reasons".
Ack.
Signed-off-by: Nikitia Kalyazin kalyazin@amazon.com
virt/kvm/guest_memfd.c | 48 ++++++++++++++++++++++++++++++++++++++++++
There's a notable lack of uAPI and Documentation chanegs. I.e. this needs a GUEST_MEMFD_FLAG_xxx along with proper documentation.
Would the following be ok in the doc?
When the capability KVM_CAP_GUEST_MEMFD_WRITE is supported, the 'flags'
No capability is necessary, see d2042d8f96dd ("KVM: Rework KVM_CAP_GUEST_MEMFD_MMAP into KVM_CAP_GUEST_MEMFD_FLAGS").
Thanks, I didn't realise that kvm/next was behind kvm/master.
field supports GUEST_MEMFD_FLAG_WRITE. Setting this flag on guest_memfd creation enables write() syscall operations to populate guest_memfd memory from host userspace.
When a write() operation is performed on a guest_memfd file descriptor with the GUEST_MEMFD_FLAG_WRITE set, the syscall will populate the guest memory with user-supplied data in a generic way, without any vendor-specific preparation. The write operation is only supported for non-CoCo (Confidential Computing) setups where guest memory is not hardware-encrypted.
The restriction should be that guest memory must be SHARED, i.e. not PRIVATE. Strictly speaking, guest memory can be encrypted, e.g. with SME and TME (I think TME is still a thing?), but with a shared key and thus accessible from the host.
Even if that weren't the case, we want to support this for CoCo VMs.
To clarify, should it depend on GUEST_MEMFD_FLAG_INIT_SHARED for now?
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..5fbf65f49586 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -736,7 +736,7 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm) u64 flags = GUEST_MEMFD_FLAG_MMAP;
if (!kvm || kvm_arch_supports_gmem_init_shared(kvm)) - flags |= GUEST_MEMFD_FLAG_INIT_SHARED; + flags |= GUEST_MEMFD_FLAG_INIT_SHARED | GUEST_MEMFD_FLAG_WRITE;
return flags; }
If the write request is not page-aligned, any remaining bytes within the page are initialized to zero.
Why? (Honest question, e.g. is that standard file semantics?)
The clause was originally suggested by James in v5 [1]. The behaviour shouldn't be deviating from the standard semantics though, so I will omit it. Moreover, when looking at the shmem implementation, I realised that I hadn't handled the case of clearing bytes _before_ written bytes properly. I will fix it in the next version.
[1] https://lore.kernel.org/kvm/CADrL8HUObfEd80sr783dB3dPWGSX7H5=0HCp9OjiL6D_Sp+...
And while it's definitely it's a-ok to land .write() in advance of the direct map changes, we do need to at least map out how we want the two to interact, e.g. so that we don't end up with constraints that are impossible to satisfy.
write() shall not attempt to access a page that is not in the direct map, which I believe can be achieved via kvm_kmem_gmem_write_begin() consulting the KVM_GMEM_FOLIO_NO_DIRECT_MAP in folio->private (introduced in [1]).
Do you think we should mention it in the commit message in some way? What particular constraint are you cautious about?
I want to be cautious with respect to the ABI/uAPI. Patrick's series also adds a flag, and guest_memfd doesn't currently provide a way to toggle flags after the file is created. That begs the question of how GUEST_MEMFD_FLAG_NO_DIRECT_MAP will co-exist with GUEST_MEMFD_FLAG_WRITE. Presumably the goal is to use write() to initialize memory, and _then_ nuke the direct map.
I want line of sight to understanding the exact semantics/flows. E.g. will KVM require userspace to clear GUEST_MEMFD_FLAG_WRITE before allowing NO_DIRECT_MAP? Or will the write() simply fail? How will the sequencing be achieved?
No, I don't think we can clear the GUEST_MEMFD_FLAG_WRITE as we expect faults and writes to different pages to be arriving interspersed: some pages will be populated by write() proactively, some will be allocated by faults in the user mapping on demand. Both write() and the fault handler, if they need to allocate a page, will be writing content to it and "sealing" by removing it from the direct map. If write() faces an already "sealed" page, it will fail (with EEXIST [1]).
struct inode *inode = file_inode(file);pgoff_t index = pos >> PAGE_SHIFT;struct folio *folio;if (!kvm_gmem_supports_mmap(inode))Checking for MMAP is neither sufficient nor strictly necessary. MMAP doesn't imply SHARED, and it's not clear to me that mmap() support should be in any way tied to WRITE support.
As in my reply to the comment about doc, I plan to introduce KVM_CAP_GUEST_MEMFD_WRITE and GUEST_MEMFD_FLAG_WRITE. The kvm_arch_supports_gmem_write() will be a weak symbol and relying on !kvm_arch_has_private_mem() on x86, similar to kvm_arch_supports_gmem_mmap(). Does it look right?
No. As above, write() should be allowed iff memory is SHARED. Relevant commits that are now in Linus' tree:
44c6cb9fe9888b371e31165b2854bd0f4e2787d4 KVM: guest_memfd: Allow mmap() on guest_memfd for x86 VMs with private memory 9aef71c892a55e004419923ba7129abe3e58d9f1 KVM: Explicitly mark KVM_GUEST_MEMFD as depending on KVM_GENERIC_MMU_NOTIFIER 5d3341d684be80892d8f6f9812f90f9274b81177 KVM: guest_memfd: Invalidate SHARED GPAs if gmem supports INIT_SHARED
Ack.
Nikita Kalyazin kalyazin@amazon.com writes:
On 30/10/2025 21:37, Sean Christopherson wrote:
On Fri, Oct 24, 2025, Nikita Kalyazin wrote:
On 23/10/2025 17:07, Sean Christopherson wrote:
On Mon, Oct 20, 2025, Nikita Kalyazin wrote:
From: Nikita Kalyazin kalyazin@amazon.com
- Vishal and Ackerley
write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. If the request is not page-aligned, the remaining bytes are initialised to 0.
write is only supported for non-CoCo setups where guest memory is not hardware-encrypted.
Please include all of the "why". The code mostly communicates the "what", but it doesn't capture why write() support is at all interesting, nor does it explain why read() isn't supported.
Hi Sean,
Thanks for the review.
Do you think including the explanation from the cover letter would be sufficient?
It's pretty close. A few more details would be helpful, e.g. to explain that VMMs may use write() to populate the initial image
Ack.
Shall I additionally say that read() isn't supported because there is no use case for it as of now or would it be obvious?
Hmm, I think if you want to exclude read() support, the changelog should explicitly state why. E.g. "there's no use case" is quite different from "deliberately don't support read() for security reasons".
Ack.
Signed-off-by: Nikitia Kalyazin kalyazin@amazon.com
virt/kvm/guest_memfd.c | 48 ++++++++++++++++++++++++++++++++++++++++++
There's a notable lack of uAPI and Documentation chanegs. I.e. this needs a GUEST_MEMFD_FLAG_xxx along with proper documentation.
Would the following be ok in the doc?
When the capability KVM_CAP_GUEST_MEMFD_WRITE is supported, the 'flags'
No capability is necessary, see d2042d8f96dd ("KVM: Rework KVM_CAP_GUEST_MEMFD_MMAP into KVM_CAP_GUEST_MEMFD_FLAGS").
Thanks, I didn't realise that kvm/next was behind kvm/master.
field supports GUEST_MEMFD_FLAG_WRITE. Setting this flag on guest_memfd creation enables write() syscall operations to populate guest_memfd memory from host userspace.
When a write() operation is performed on a guest_memfd file descriptor with the GUEST_MEMFD_FLAG_WRITE set, the syscall will populate the guest memory with user-supplied data in a generic way, without any vendor-specific preparation. The write operation is only supported for non-CoCo (Confidential Computing) setups where guest memory is not hardware-encrypted.
The restriction should be that guest memory must be SHARED, i.e. not PRIVATE. Strictly speaking, guest memory can be encrypted, e.g. with SME and TME (I think TME is still a thing?), but with a shared key and thus accessible from the host.
Even if that weren't the case, we want to support this for CoCo VMs.
To clarify, should it depend on GUEST_MEMFD_FLAG_INIT_SHARED for now?
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..5fbf65f49586 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -736,7 +736,7 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm) u64 flags = GUEST_MEMFD_FLAG_MMAP;
if (!kvm || kvm_arch_supports_gmem_init_shared(kvm))
flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
flags |= GUEST_MEMFD_FLAG_INIT_SHARED |GUEST_MEMFD_FLAG_WRITE;
return flags;}
Yup! It should depend on GUEST_MEMFD_FLAG_INIT_SHARED for now.
When conversion is supported then GUEST_MEMFD_FLAG_WRITE can always be a supported flag, and the shared/private check can then be shifted to .write_begin().
If the write request is not page-aligned, any remaining bytes within the page are initialized to zero.
Why? (Honest question, e.g. is that standard file semantics?)
The clause was originally suggested by James in v5 [1]. The behaviour shouldn't be deviating from the standard semantics though, so I will omit it. Moreover, when looking at the shmem implementation, I realised that I hadn't handled the case of clearing bytes _before_ written bytes properly. I will fix it in the next version.
Was thinking about this a bit more. One way to think about this is that we could have more flexibility: Do we need to zero the parts of the page that were not written to? Maybe the user wanted to write only to byte offsets 10 to 20 within the page, and keep the rest intact? That should still be okay I think, there's no data leak since we're not allowing read().
Looking ahead to conversions on huge pages, I think I prefer being more restrictive though. For 4K pages we don't allow allocations or truncations that are not PAGE_ALIGNED. For huge pages the current stance is to return -EINVAL for allocations/truncations that are not huge page size aligned.
If we allow non-page-aligned writes, handling huge pages could be complicated. I don't see any way for guest_memfd to tell the caller generic_perform_write() function that it can only write to certain parts of a page. This means that if a huge page has mixed shared/private status, guest_memfd would have to split the folio just for generic_perform_write() to not write elsewhere.
Do you have any ideas here? Or maybe we will then use some other .write_iter() function.
My proposal is to impose a restriction that write()s offset/len must be PAGE_ALIGNED, we can check that in .write_begin(). Being more restrictive could be a good starting point that allows us to relax the constraint later. (Unless you already have a use case that requires writing less than a page)
That should also solve the zeroing issue, if the user must always provide full pages worth of data to write. :P
[1] https://lore.kernel.org/kvm/CADrL8HUObfEd80sr783dB3dPWGSX7H5=0HCp9OjiL6D_Sp+...
And while it's definitely it's a-ok to land .write() in advance of the direct map changes, we do need to at least map out how we want the two to interact, e.g. so that we don't end up with constraints that are impossible to satisfy.
write() shall not attempt to access a page that is not in the direct map, which I believe can be achieved via kvm_kmem_gmem_write_begin() consulting
Btw why not just kvm_gmem_write_begin() instead of having the additional kmem part in the name?
the KVM_GMEM_FOLIO_NO_DIRECT_MAP in folio->private (introduced in [1]).
Do you think we should mention it in the commit message in some way? What particular constraint are you cautious about?
I want to be cautious with respect to the ABI/uAPI. Patrick's series also adds a flag, and guest_memfd doesn't currently provide a way to toggle flags after the file is created. That begs the question of how GUEST_MEMFD_FLAG_NO_DIRECT_MAP will co-exist with GUEST_MEMFD_FLAG_WRITE. Presumably the goal is to use write() to initialize memory, and _then_ nuke the direct map.
I want line of sight to understanding the exact semantics/flows. E.g. will KVM require userspace to clear GUEST_MEMFD_FLAG_WRITE before allowing NO_DIRECT_MAP? Or will the write() simply fail? How will the sequencing be achieved?
No, I don't think we can clear the GUEST_MEMFD_FLAG_WRITE as we expect faults and writes to different pages to be arriving interspersed: some pages will be populated by write() proactively, some will be allocated by faults in the user mapping on demand. Both write() and the fault handler, if they need to allocate a page, will be writing content to it and "sealing" by removing it from the direct map. If write() faces an already "sealed" page, it will fail (with EEXIST [1]).
IIUC this means a write() after fallocate(), or any unintended access to the memory before write, for a GUEST_MEMFD_FLAG_NO_DIRECT_MAP guest_memfd will necessarily fail.
The required ordering is kind of awkward, but I don't really have any good suggestions.
struct inode *inode = file_inode(file);pgoff_t index = pos >> PAGE_SHIFT;struct folio *folio;if (!kvm_gmem_supports_mmap(inode))Checking for MMAP is neither sufficient nor strictly necessary. MMAP doesn't imply SHARED, and it's not clear to me that mmap() support should be in any way tied to WRITE support.
As in my reply to the comment about doc, I plan to introduce KVM_CAP_GUEST_MEMFD_WRITE and GUEST_MEMFD_FLAG_WRITE. The kvm_arch_supports_gmem_write() will be a weak symbol and relying on !kvm_arch_has_private_mem() on x86, similar to kvm_arch_supports_gmem_mmap(). Does it look right?
No. As above, write() should be allowed iff memory is SHARED. Relevant commits that are now in Linus' tree:
44c6cb9fe9888b371e31165b2854bd0f4e2787d4 KVM: guest_memfd: Allow mmap() on guest_memfd for x86 VMs with private memory 9aef71c892a55e004419923ba7129abe3e58d9f1 KVM: Explicitly mark KVM_GUEST_MEMFD as depending on KVM_GENERIC_MMU_NOTIFIER 5d3341d684be80892d8f6f9812f90f9274b81177 KVM: guest_memfd: Invalidate SHARED GPAs if gmem supports INIT_SHARED
Ack.
On 07/11/2025 15:42, Ackerley Tng wrote:
Nikita Kalyazin kalyazin@amazon.com writes:
On 30/10/2025 21:37, Sean Christopherson wrote:
On Fri, Oct 24, 2025, Nikita Kalyazin wrote:
On 23/10/2025 17:07, Sean Christopherson wrote:
On Mon, Oct 20, 2025, Nikita Kalyazin wrote:
From: Nikita Kalyazin kalyazin@amazon.com
- Vishal and Ackerley
write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. If the request is not page-aligned, the remaining bytes are initialised to 0.
write is only supported for non-CoCo setups where guest memory is not hardware-encrypted.
Please include all of the "why". The code mostly communicates the "what", but it doesn't capture why write() support is at all interesting, nor does it explain why read() isn't supported.
Hi Sean,
Thanks for the review.
Do you think including the explanation from the cover letter would be sufficient?
It's pretty close. A few more details would be helpful, e.g. to explain that VMMs may use write() to populate the initial image
Ack.
Shall I additionally say that read() isn't supported because there is no use case for it as of now or would it be obvious?
Hmm, I think if you want to exclude read() support, the changelog should explicitly state why. E.g. "there's no use case" is quite different from "deliberately don't support read() for security reasons".
Ack.
Signed-off-by: Nikitia Kalyazin kalyazin@amazon.com
virt/kvm/guest_memfd.c | 48 ++++++++++++++++++++++++++++++++++++++++++There's a notable lack of uAPI and Documentation chanegs. I.e. this needs a GUEST_MEMFD_FLAG_xxx along with proper documentation.
Would the following be ok in the doc?
When the capability KVM_CAP_GUEST_MEMFD_WRITE is supported, the 'flags'
No capability is necessary, see d2042d8f96dd ("KVM: Rework KVM_CAP_GUEST_MEMFD_MMAP into KVM_CAP_GUEST_MEMFD_FLAGS").
Thanks, I didn't realise that kvm/next was behind kvm/master.
field supports GUEST_MEMFD_FLAG_WRITE. Setting this flag on guest_memfd creation enables write() syscall operations to populate guest_memfd memory from host userspace.
When a write() operation is performed on a guest_memfd file descriptor with the GUEST_MEMFD_FLAG_WRITE set, the syscall will populate the guest memory with user-supplied data in a generic way, without any vendor-specific preparation. The write operation is only supported for non-CoCo (Confidential Computing) setups where guest memory is not hardware-encrypted.
The restriction should be that guest memory must be SHARED, i.e. not PRIVATE. Strictly speaking, guest memory can be encrypted, e.g. with SME and TME (I think TME is still a thing?), but with a shared key and thus accessible from the host.
Even if that weren't the case, we want to support this for CoCo VMs.
To clarify, should it depend on GUEST_MEMFD_FLAG_INIT_SHARED for now?
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..5fbf65f49586 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -736,7 +736,7 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm) u64 flags = GUEST_MEMFD_FLAG_MMAP;
if (!kvm || kvm_arch_supports_gmem_init_shared(kvm))
flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
flags |= GUEST_MEMFD_FLAG_INIT_SHARED |GUEST_MEMFD_FLAG_WRITE;
return flags;}
Yup! It should depend on GUEST_MEMFD_FLAG_INIT_SHARED for now.
When conversion is supported then GUEST_MEMFD_FLAG_WRITE can always be a supported flag, and the shared/private check can then be shifted to .write_begin().
Thanks for the confirmation.
If the write request is not page-aligned, any remaining bytes within the page are initialized to zero.
Why? (Honest question, e.g. is that standard file semantics?)
The clause was originally suggested by James in v5 [1]. The behaviour shouldn't be deviating from the standard semantics though, so I will omit it. Moreover, when looking at the shmem implementation, I realised that I hadn't handled the case of clearing bytes _before_ written bytes properly. I will fix it in the next version.
Was thinking about this a bit more. One way to think about this is that we could have more flexibility: Do we need to zero the parts of the page that were not written to? Maybe the user wanted to write only to byte offsets 10 to 20 within the page, and keep the rest intact? That should still be okay I think, there's no data leak since we're not allowing read().
Looking ahead to conversions on huge pages, I think I prefer being more restrictive though. For 4K pages we don't allow allocations or truncations that are not PAGE_ALIGNED. For huge pages the current stance is to return -EINVAL for allocations/truncations that are not huge page size aligned.
If we allow non-page-aligned writes, handling huge pages could be complicated. I don't see any way for guest_memfd to tell the caller generic_perform_write() function that it can only write to certain parts of a page. This means that if a huge page has mixed shared/private status, guest_memfd would have to split the folio just for generic_perform_write() to not write elsewhere.
Do you have any ideas here? Or maybe we will then use some other .write_iter() function.
My proposal is to impose a restriction that write()s offset/len must be PAGE_ALIGNED, we can check that in .write_begin(). Being more restrictive could be a good starting point that allows us to relax the constraint later. (Unless you already have a use case that requires writing less than a page)
That should also solve the zeroing issue, if the user must always provide full pages worth of data to write. :P
We do not have a use case for partial writes as we always write full pages so I am totally open to applying the restriction, especially if it helps huge page conversion.
[1] https://lore.kernel.org/kvm/CADrL8HUObfEd80sr783dB3dPWGSX7H5=0HCp9OjiL6D_Sp+...
And while it's definitely it's a-ok to land .write() in advance of the direct map changes, we do need to at least map out how we want the two to interact, e.g. so that we don't end up with constraints that are impossible to satisfy.
write() shall not attempt to access a page that is not in the direct map, which I believe can be achieved via kvm_kmem_gmem_write_begin() consulting
Btw why not just kvm_gmem_write_begin() instead of having the additional kmem part in the name?
You are right, no reason for an extra kmem in the name. Thanks!
the KVM_GMEM_FOLIO_NO_DIRECT_MAP in folio->private (introduced in [1]).
Do you think we should mention it in the commit message in some way? What particular constraint are you cautious about?
I want to be cautious with respect to the ABI/uAPI. Patrick's series also adds a flag, and guest_memfd doesn't currently provide a way to toggle flags after the file is created. That begs the question of how GUEST_MEMFD_FLAG_NO_DIRECT_MAP will co-exist with GUEST_MEMFD_FLAG_WRITE. Presumably the goal is to use write() to initialize memory, and _then_ nuke the direct map.
I want line of sight to understanding the exact semantics/flows. E.g. will KVM require userspace to clear GUEST_MEMFD_FLAG_WRITE before allowing NO_DIRECT_MAP? Or will the write() simply fail? How will the sequencing be achieved?
No, I don't think we can clear the GUEST_MEMFD_FLAG_WRITE as we expect faults and writes to different pages to be arriving interspersed: some pages will be populated by write() proactively, some will be allocated by faults in the user mapping on demand. Both write() and the fault handler, if they need to allocate a page, will be writing content to it and "sealing" by removing it from the direct map. If write() faces an already "sealed" page, it will fail (with EEXIST [1]).
IIUC this means a write() after fallocate(), or any unintended access to the memory before write, for a GUEST_MEMFD_FLAG_NO_DIRECT_MAP guest_memfd will necessarily fail.
The required ordering is kind of awkward, but I don't really have any good suggestions.
I don't think fallocate() needs to "seal" the page, since it doesn't initialise the page content. So it should be allowed to write to a page that has previously been fallocated, unless there is something I'm missing that makes it impossible.
struct inode *inode = file_inode(file);pgoff_t index = pos >> PAGE_SHIFT;struct folio *folio;if (!kvm_gmem_supports_mmap(inode))Checking for MMAP is neither sufficient nor strictly necessary. MMAP doesn't imply SHARED, and it's not clear to me that mmap() support should be in any way tied to WRITE support.
As in my reply to the comment about doc, I plan to introduce KVM_CAP_GUEST_MEMFD_WRITE and GUEST_MEMFD_FLAG_WRITE. The kvm_arch_supports_gmem_write() will be a weak symbol and relying on !kvm_arch_has_private_mem() on x86, similar to kvm_arch_supports_gmem_mmap(). Does it look right?
No. As above, write() should be allowed iff memory is SHARED. Relevant commits that are now in Linus' tree:
44c6cb9fe9888b371e31165b2854bd0f4e2787d4 KVM: guest_memfd: Allow mmap() on guest_memfd for x86 VMs with private memory 9aef71c892a55e004419923ba7129abe3e58d9f1 KVM: Explicitly mark KVM_GUEST_MEMFD as depending on KVM_GENERIC_MMU_NOTIFIER 5d3341d684be80892d8f6f9812f90f9274b81177 KVM: guest_memfd: Invalidate SHARED GPAs if gmem supports INIT_SHAREDAck.
From: Nikita Kalyazin kalyazin@amazon.com
This is to reflect that the write syscall is now implemented for guest_memfd.
Signed-off-by: Nikita Kalyazin kalyazin@amazon.com --- .../testing/selftests/kvm/guest_memfd_test.c | 51 ++++++++++++++++--- 1 file changed, 45 insertions(+), 6 deletions(-)
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index b3ca6737f304..be1f78542d64 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -24,18 +24,55 @@ #include "test_util.h" #include "ucall_common.h"
-static void test_file_read_write(int fd) +static void test_file_read(int fd) { char buf[64];
TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0, "read on a guest_mem fd should fail"); - TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0, - "write on a guest_mem fd should fail"); TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0, "pread on a guest_mem fd should fail"); - TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0, - "pwrite on a guest_mem fd should fail"); +} + +static void test_write_supported(int fd, size_t total_size) +{ + size_t page_size = getpagesize(); + void *buf = NULL; + int ret; + + ret = posix_memalign(&buf, page_size, total_size); + TEST_ASSERT_EQ(ret, 0); + + ret = pwrite(fd, buf, page_size, total_size); + TEST_ASSERT(ret == -1, "writing past the file size on a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EINVAL); + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == page_size, "write on a guest_mem fd should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + free(buf); +} + +static void test_write_not_supported(int fd, size_t total_size) +{ + size_t page_size = getpagesize(); + void *buf = NULL; + int ret; + + ret = posix_memalign(&buf, page_size, total_size); + TEST_ASSERT_EQ(ret, 0); + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == -1, "write on guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, ENODEV); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + free(buf); }
static void test_mmap_supported(int fd, size_t page_size, size_t total_size) @@ -281,12 +318,14 @@ static void test_guest_memfd(unsigned long vm_type)
fd = vm_create_guest_memfd(vm, total_size, flags);
- test_file_read_write(fd); + test_file_read(fd);
if (flags & GUEST_MEMFD_FLAG_MMAP) { + test_write_supported(fd, total_size); test_mmap_supported(fd, page_size, total_size); test_fault_overflow(fd, page_size, total_size); } else { + test_write_not_supported(fd, total_size); test_mmap_not_supported(fd, page_size, total_size); }
linux-kselftest-mirror@lists.linaro.org