First stage of hugetlb support: add initialization and cleanup routines.
After guest_memfd was massaged to use its own inodes instead of anonymous inodes in an earlier patch, the .evict_inode handler can now be overridden to perform hugetlb metadata cleanup.
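For illustration, userspace would opt in to hugetlb backing roughly as
follows. This is a hedged sketch, not part of this patch: the helper name
create_hugetlb_gmem is made up, vm_fd is assumed to be an open KVM VM fd,
and error handling is elided.

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Sketch: create a guest_memfd backed by 1 GiB hugetlb pages. */
	static int create_hugetlb_gmem(int vm_fd, __u64 size)
	{
		struct kvm_create_guest_memfd args = {
			/*
			 * size must be page-aligned; the subpool reservation
			 * rounds it up to a whole number of huge pages.
			 */
			.size = size,
			.flags = KVM_GUEST_MEMFD_HUGETLB |
				 KVM_GUEST_MEMFD_HUGE_1GB,
		};

		/* Returns a new guest_memfd file descriptor on success. */
		return ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
	}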
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 include/uapi/linux/kvm.h |  26 ++++++
 virt/kvm/guest_memfd.c   | 177 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 197 insertions(+), 6 deletions(-)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 637efc055145..77de7c4432f6 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -13,6 +13,7 @@
 #include <linux/compiler.h>
 #include <linux/ioctl.h>
 #include <asm/kvm.h>
+#include <asm-generic/hugetlb_encode.h>
#define KVM_API_VERSION 12
@@ -1558,6 +1559,31 @@ struct kvm_memory_attributes {
#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
+#define KVM_GUEST_MEMFD_HUGETLB		(1ULL << 1)
+
+/*
+ * Huge page size encoding when KVM_GUEST_MEMFD_HUGETLB is specified, and a
+ * huge page size other than the default is desired.  See hugetlb_encode.h.
+ * All known huge page size encodings are provided here.  It is the
+ * responsibility of the application to know which sizes are supported on the
+ * running system.  See mmap(2) man page for details.
+ */
+#define KVM_GUEST_MEMFD_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
+#define KVM_GUEST_MEMFD_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
+
+#define KVM_GUEST_MEMFD_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
+#define KVM_GUEST_MEMFD_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
+#define KVM_GUEST_MEMFD_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
+#define KVM_GUEST_MEMFD_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
+#define KVM_GUEST_MEMFD_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
+#define KVM_GUEST_MEMFD_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define KVM_GUEST_MEMFD_HUGE_32MB	HUGETLB_FLAG_ENCODE_32MB
+#define KVM_GUEST_MEMFD_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define KVM_GUEST_MEMFD_HUGE_512MB	HUGETLB_FLAG_ENCODE_512MB
+#define KVM_GUEST_MEMFD_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
+#define KVM_GUEST_MEMFD_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
+#define KVM_GUEST_MEMFD_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
+
 struct kvm_create_guest_memfd {
 	__u64 size;
 	__u64 flags;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 5d7fd1f708a6..31e1115273e1 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -3,6 +3,7 @@
 #include <linux/mount.h>
 #include <linux/backing-dev.h>
 #include <linux/falloc.h>
+#include <linux/hugetlb.h>
 #include <linux/kvm_host.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
@@ -18,6 +19,16 @@ struct kvm_gmem {
 	struct list_head entry;
 };
+struct kvm_gmem_hugetlb {
+	struct hstate *h;
+	struct hugepage_subpool *spool;
+};
+
+static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+{
+	return inode->i_mapping->i_private_data;
+}
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
@@ -154,6 +165,82 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
 	}
 }
+static inline void kvm_gmem_hugetlb_filemap_remove_folio(struct folio *folio)
+{
+	folio_lock(folio);
+
+	folio_clear_dirty(folio);
+	folio_clear_uptodate(folio);
+	filemap_remove_folio(folio);
+
+	folio_unlock(folio);
+}
+
+/**
+ * Removes folios in range [@lstart, @lend) from page cache/filemap (@mapping),
+ * returning the number of pages freed.
+ */
+static int kvm_gmem_hugetlb_filemap_remove_folios(struct address_space *mapping,
+						  struct hstate *h,
+						  loff_t lstart, loff_t lend)
+{
+	const pgoff_t end = lend >> PAGE_SHIFT;
+	pgoff_t next = lstart >> PAGE_SHIFT;
+	struct folio_batch fbatch;
+	int num_freed = 0;
+
+	folio_batch_init(&fbatch);
+	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+		int i;
+		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+			struct folio *folio;
+			pgoff_t hindex;
+			u32 hash;
+
+			folio = fbatch.folios[i];
+			hindex = folio->index >> huge_page_order(h);
+			hash = hugetlb_fault_mutex_hash(mapping, hindex);
+
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+			kvm_gmem_hugetlb_filemap_remove_folio(folio);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+			num_freed++;
+		}
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+	return num_freed;
+}
+
+/**
+ * Removes folios in range [@lstart, @lend) from page cache of inode, updates
+ * inode metadata and hugetlb reservations.
+ */
+static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode,
+						   loff_t lstart, loff_t lend)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+	struct hstate *h;
+	int gbl_reserve;
+	int num_freed;
+
+	hgmem = kvm_gmem_hgmem(inode);
+	h = hgmem->h;
+
+	num_freed = kvm_gmem_hugetlb_filemap_remove_folios(inode->i_mapping,
+							   h, lstart, lend);
+
+	gbl_reserve = hugepage_subpool_put_pages(hgmem->spool, num_freed);
+	hugetlb_acct_memory(h, -gbl_reserve);
+
+	spin_lock(&inode->i_lock);
+	inode->i_blocks -= blocks_per_huge_page(h) * num_freed;
+	spin_unlock(&inode->i_lock);
+}
+
+
 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
@@ -307,8 +394,33 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
 	return get_file_active(&slot->gmem.file);
 }
+static void kvm_gmem_hugetlb_teardown(struct inode *inode)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+
+	truncate_inode_pages_final_prepare(inode->i_mapping);
+	kvm_gmem_hugetlb_truncate_folios_range(inode, 0, LLONG_MAX);
+
+	hgmem = kvm_gmem_hgmem(inode);
+	hugepage_put_subpool(hgmem->spool);
+	kfree(hgmem);
+}
+
+static void kvm_gmem_evict_inode(struct inode *inode)
+{
+	u64 flags = (u64)inode->i_private;
+
+	if (flags & KVM_GUEST_MEMFD_HUGETLB)
+		kvm_gmem_hugetlb_teardown(inode);
+	else
+		truncate_inode_pages_final(inode->i_mapping);
+
+	clear_inode(inode);
+}
+
 static const struct super_operations kvm_gmem_super_operations = {
 	.statfs		= simple_statfs,
+	.evict_inode	= kvm_gmem_evict_inode,
 };
 static int kvm_gmem_init_fs_context(struct fs_context *fc)
@@ -431,6 +543,42 @@ static const struct inode_operations kvm_gmem_iops = {
 	.setattr	= kvm_gmem_setattr,
 };
+static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+	struct hugepage_subpool *spool;
+	int page_size_log;
+	struct hstate *h;
+	long hpages;
+
+	page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) & KVM_GUEST_MEMFD_HUGE_MASK;
+	h = hstate_sizelog(page_size_log);
+
+	/* Round up to accommodate size requests that don't align with huge pages */
+	hpages = round_up(size, huge_page_size(h)) >> huge_page_shift(h);
+
+	spool = hugepage_new_subpool(h, hpages, hpages, false);
+	if (!spool)
+		goto err;
+
+	hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
+	if (!hgmem)
+		goto err_subpool;
+
+	inode->i_blkbits = huge_page_shift(h);
+
+	hgmem->h = h;
+	hgmem->spool = spool;
+	inode->i_mapping->i_private_data = hgmem;
+
+	return 0;
+
+err_subpool:
+	kfree(spool);
+err:
+	return -ENOMEM;
+}
+
 static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
						       loff_t size, u64 flags)
 {
@@ -443,9 +591,13 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 		return inode;
 
 	err = security_inode_init_security_anon(inode, &qname, NULL);
-	if (err) {
-		iput(inode);
-		return ERR_PTR(err);
+	if (err)
+		goto out;
+
+	if (flags & KVM_GUEST_MEMFD_HUGETLB) {
+		err = kvm_gmem_hugetlb_setup(inode, size, flags);
+		if (err)
+			goto out;
 	}
 
 	inode->i_private = (void *)(unsigned long)flags;
@@ -459,6 +611,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
 
 	return inode;
+
+out:
+	iput(inode);
+
+	return ERR_PTR(err);
 }
 static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
@@ -526,14 +683,22 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	return err;
 }
+#define KVM_GUEST_MEMFD_ALL_FLAGS KVM_GUEST_MEMFD_HUGETLB
+
 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
 {
 	loff_t size = args->size;
 	u64 flags = args->flags;
-	u64 valid_flags = 0;
 
-	if (flags & ~valid_flags)
-		return -EINVAL;
+	if (flags & KVM_GUEST_MEMFD_HUGETLB) {
+		/* Allow huge page size encoding in flags */
+		if (flags & ~(KVM_GUEST_MEMFD_ALL_FLAGS |
+			      (KVM_GUEST_MEMFD_HUGE_MASK << KVM_GUEST_MEMFD_HUGE_SHIFT)))
+			return -EINVAL;
+	} else {
+		if (flags & ~KVM_GUEST_MEMFD_ALL_FLAGS)
+			return -EINVAL;
+	}
 
 	if (size <= 0 || !PAGE_ALIGNED(size))
 		return -EINVAL;
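
A note on the size encoding, for readers unfamiliar with the
memfd_create(2)-style convention reused here: the flags word carries
log2(huge page size) in the KVM_GUEST_MEMFD_HUGE_SHIFT field. The sketch
below is illustrative only (not part of the patch); the literal 21 is
log2(2 MiB), and the variable names are made up.

	/* Userspace: request 2 MiB pages explicitly; this is exactly
	 * what KVM_GUEST_MEMFD_HUGE_2MB expands to.
	 */
	__u64 flags = KVM_GUEST_MEMFD_HUGETLB |
		      (21ULL << KVM_GUEST_MEMFD_HUGE_SHIFT);

	/* Kernel side, as in kvm_gmem_hugetlb_setup(): recover the log
	 * and map it to an hstate.  A log of 0 (no size specified)
	 * selects the default hugetlb size, since hstate_sizelog(0)
	 * returns the default hstate.
	 */
	int page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) &
			    KVM_GUEST_MEMFD_HUGE_MASK;	/* == 21 */
	struct hstate *h = hstate_sizelog(page_size_log); /* 2 MiB hstate */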