From: Nikita Kalyazin <kalyazin@amazon.com>
On systems that support shared guest memory, write() is useful, for example, for populating the initial image. Even though the same can be achieved by mapping the memory into userspace and memcpying into it, write() is the more performant option because it does not need to set up user page tables and does not take a page fault for every page like memcpy would. Note that memcpy cannot be accelerated via MADV_POPULATE_WRITE, as that relies on GUP and is not supported by guest_memfd.
Populating 512MiB of guest_memfd on an x86 machine:
 - via memcpy: 436 ms
 - via write:  202 ms (-54%)
Only page-aligned offset and len are allowed. Non-aligned writes are technically possible, but once in-place conversion support is implemented [1], the restriction will make handling of mixed shared/private huge pages simpler. write() will only be allowed to populate shared pages.
When direct map removal is implemented [2]:
 - write() will not be allowed to access pages that have already been removed from the direct map
 - on completion, write() will remove the populated pages from the direct map
While it is technically possible to implement read() syscall on systems with shared guest memory, it is not supported as there is currently no use case for it.
[1] https://lore.kernel.org/kvm/cover.1760731772.git.ackerleytng@google.com [2] https://lore.kernel.org/kvm/20250924151101.2225820-1-patrick.roy@campus.lmu....
Signed-off-by: Nikita Kalyazin kalyazin@amazon.com --- Documentation/virt/kvm/api.rst | 2 ++ include/linux/kvm_host.h | 2 +- include/uapi/linux/kvm.h | 1 + virt/kvm/guest_memfd.c | 52 ++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 57061fa29e6a..9541e95fc2ed 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6448,6 +6448,8 @@ specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags: without INIT_SHARED will be marked private). Shared memory can be faulted into host userspace page tables. Private memory cannot. + GUEST_MEMFD_FLAG_WRITE Enable using write() on the guest_memfd file + descriptor. ============================ ================================================
When the KVM MMU performs a PFN lookup to service a guest fault and the backing diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..5fbf65f49586 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -736,7 +736,7 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm) u64 flags = GUEST_MEMFD_FLAG_MMAP;
if (!kvm || kvm_arch_supports_gmem_init_shared(kvm)) - flags |= GUEST_MEMFD_FLAG_INIT_SHARED; + flags |= GUEST_MEMFD_FLAG_INIT_SHARED | GUEST_MEMFD_FLAG_WRITE;
return flags; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52f6000ab020..5b73d6528f1c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1601,6 +1601,7 @@ struct kvm_memory_attributes { #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) #define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) #define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1) +#define GUEST_MEMFD_FLAG_WRITE (1ULL << 2)
struct kvm_create_guest_memfd { __u64 size; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index ffadc5ee8e04..2c71c21b9189 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -411,6 +411,8 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
static struct file_operations kvm_gmem_fops = { .mmap = kvm_gmem_mmap, + .llseek = default_llseek, + .write_iter = generic_perform_write, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -421,6 +423,53 @@ void kvm_gmem_init(struct module *module) kvm_gmem_fops.owner = module; }
+static bool kvm_gmem_supports_write(struct inode *inode) +{ + const u64 flags = (u64)inode->i_private; + + return flags & GUEST_MEMFD_FLAG_WRITE; +} + +static int kvm_gmem_write_begin(const struct kiocb *kiocb, + struct address_space *mapping, + loff_t pos, unsigned int len, + struct folio **folio, void **fsdata) +{ + struct inode *inode = file_inode(kiocb->ki_filp); + + if (!kvm_gmem_supports_write(inode)) + return -ENODEV; + + if (pos + len > i_size_read(inode)) + return -EINVAL; + + if (!IS_ALIGNED(pos, PAGE_SIZE) || !IS_ALIGNED(len, PAGE_SIZE)) + return -EINVAL; + + *folio = kvm_gmem_get_folio(inode, pos >> PAGE_SHIFT); + if (IS_ERR(*folio)) + return PTR_ERR(*folio); + + return 0; +} + +static int kvm_gmem_write_end(const struct kiocb *kiocb, + struct address_space *mapping, + loff_t pos, unsigned int len, + unsigned int copied, + struct folio *folio, void *fsdata) +{ + if (!folio_test_uptodate(folio)) { + folio_zero_range(folio, copied, len - copied); + folio_mark_uptodate(folio); + } + + folio_unlock(folio); + folio_put(folio); + + return copied; +} + static int kvm_gmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -469,6 +518,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, + .write_begin = kvm_gmem_write_begin, + .write_end = kvm_gmem_write_end, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE @@ -516,6 +567,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) }
file->f_flags |= O_LARGEFILE; + file->f_mode |= FMODE_LSEEK | FMODE_PWRITE;
inode = file->f_inode; WARN_ON(file->f_mapping != inode->i_mapping);