On Sat, Aug 20, 2022 at 10:15:32PM -0700, Hugh Dickins wrote:
I will try next week to rework it as shim to top of shmem. Does it work for you?
Yes, please do, thanks. It's a compromise between us: the initial TDX case has no justification to use shmem at all, but doing it that way will help you with some of the infrastructure, and will probably be easiest for KVM to extend to other more relaxed fd cases later.
Okay, below is my take on the shim approach.
I don't hate how it turned out. It is easier to understand without callback exchange thing.
The only caveat is I had to introduce external lock to protect against race between lookup and truncate. Otherwise, looks pretty reasonable to me.
I did very limited testing. And it lacks integration with KVM, but API changed not substantially, any it should be easy to adopt.
Any comments?
diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 4f1600413f91..aec04a0f8b7b 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -3,6 +3,7 @@ #define __LINUX_MEMFD_H
#include <linux/file.h> +#include <linux/pfn_t.h>
#ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); @@ -13,4 +14,27 @@ static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) } #endif
+struct inaccessible_notifier; + +struct inaccessible_notifier_ops { + void (*invalidate)(struct inaccessible_notifier *notifier, + pgoff_t start, pgoff_t end); +}; + +struct inaccessible_notifier { + struct list_head list; + const struct inaccessible_notifier_ops *ops; +}; + +int inaccessible_register_notifier(struct file *file, + struct inaccessible_notifier *notifier); +void inaccessible_unregister_notifier(struct file *file, + struct inaccessible_notifier *notifier); + +int inaccessible_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn, + int *order); +void inaccessible_put_pfn(struct file *file, pfn_t pfn); + +struct file *memfd_mkinaccessible(struct file *memfd); + #endif /* __LINUX_MEMFD_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 6325d1d0e90f..9d066be3d7e8 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -101,5 +101,6 @@ #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ +#define INACCESSIBLE_MAGIC 0x494e4143 /* "INAC" */
#endif /* __LINUX_MAGIC_H__ */ diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 7a8a26751c23..48750474b904 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -8,6 +8,7 @@ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U #define MFD_HUGETLB 0x0004U +#define MFD_INACCESSIBLE 0x0008U
/* * Huge page size encoding when MFD_HUGETLB is specified, and a huge page diff --git a/mm/Makefile b/mm/Makefile index 9a564f836403..f82e5d4b4388 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -126,7 +126,7 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o -obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o memfd_inaccessible.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o diff --git a/mm/memfd.c b/mm/memfd.c index 08f5f8304746..1853a90f49ff 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -261,7 +261,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \ + MFD_INACCESSIBLE)
SYSCALL_DEFINE2(memfd_create, const char __user *, uname, @@ -283,6 +284,14 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; }
+ /* Disallow sealing when MFD_INACCESSIBLE is set. */ + if ((flags & MFD_INACCESSIBLE) && (flags & MFD_ALLOW_SEALING)) + return -EINVAL; + + /* TODO: add hugetlb support */ + if ((flags & MFD_INACCESSIBLE) && (flags & MFD_HUGETLB)) + return -EINVAL; + /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) @@ -331,10 +340,24 @@ SYSCALL_DEFINE2(memfd_create, *file_seals &= ~F_SEAL_SEAL; }
+ if (flags & MFD_INACCESSIBLE) { + struct file *inaccessible_file; + + inaccessible_file = memfd_mkinaccessible(file); + if (IS_ERR(inaccessible_file)) { + error = PTR_ERR(inaccessible_file); + goto err_file; + } + + file = inaccessible_file; + } + fd_install(fd, file); kfree(name); return fd;
+err_file: + fput(file); err_fd: put_unused_fd(fd); err_name: diff --git a/mm/memfd_inaccessible.c b/mm/memfd_inaccessible.c new file mode 100644 index 000000000000..89194438af9c --- /dev/null +++ b/mm/memfd_inaccessible.c @@ -0,0 +1,234 @@ +#include <linux/memfd.h> +#include <linux/pagemap.h> +#include <linux/pseudo_fs.h> +#include <linux/shmem_fs.h> +#include <uapi/linux/falloc.h> +#include <uapi/linux/magic.h> + +struct inaccessible_data { + struct rw_semaphore lock; + struct file *memfd; + struct list_head notifiers; +}; + +static void inaccessible_notifier_invalidate(struct inaccessible_data *data, + pgoff_t start, pgoff_t end) +{ + struct inaccessible_notifier *notifier; + + lockdep_assert_held(&data->lock); + VM_BUG_ON(!rwsem_is_locked(&data->lock)); + + list_for_each_entry(notifier, &data->notifiers, list) { + notifier->ops->invalidate(notifier, start, end); + } +} + +static int inaccessible_release(struct inode *inode, struct file *file) +{ + struct inaccessible_data *data = inode->i_mapping->private_data; + + fput(data->memfd); + kfree(data); + return 0; +} + +static long inaccessible_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inaccessible_data *data = file->f_mapping->private_data; + struct file *memfd = data->memfd; + int ret; + + /* The lock prevents parallel inaccessible_get/put_pfn() */ + down_write(&data->lock); + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) { + ret = -EINVAL; + goto out; + } + } + + ret = memfd->f_op->fallocate(memfd, mode, offset, len); + inaccessible_notifier_invalidate(data, offset, offset + len); +out: + up_write(&data->lock); + return ret; +} + +static const struct file_operations inaccessible_fops = { + .release = inaccessible_release, + .fallocate = inaccessible_fallocate, +}; + +static int inaccessible_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct inaccessible_data *data = inode->i_mapping->private_data; + struct file *memfd = data->memfd; + + return memfd->f_inode->i_op->getattr(mnt_userns, path, stat, + request_mask, query_flags); +} + +static int inaccessible_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct inaccessible_data *data = inode->i_mapping->private_data; + struct file *memfd = data->memfd; + int ret; + + if (attr->ia_valid & ATTR_SIZE) { + if (memfd->f_inode->i_size) { + ret = -EPERM; + goto out; + } + + if (!PAGE_ALIGNED(attr->ia_size)) { + ret = -EINVAL; + goto out; + } + } + + ret = memfd->f_inode->i_op->setattr(mnt_userns, + file_dentry(memfd), attr); +out: + return ret; +} + +static const struct inode_operations inaccessible_iops = { + .getattr = inaccessible_getattr, + .setattr = inaccessible_setattr, +}; + +static int inaccessible_init_fs_context(struct fs_context *fc) +{ + if (!init_pseudo(fc, INACCESSIBLE_MAGIC)) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + return 0; +} + +static struct file_system_type inaccessible_fs = { + .owner = THIS_MODULE, + .name = "[inaccessible]", + .init_fs_context = inaccessible_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static struct vfsmount *inaccessible_mnt; + +static __init int inaccessible_init(void) +{ + inaccessible_mnt = kern_mount(&inaccessible_fs); + if (IS_ERR(inaccessible_mnt)) + return PTR_ERR(inaccessible_mnt); + return 0; +} +fs_initcall(inaccessible_init); + +struct file *memfd_mkinaccessible(struct file *memfd) +{ + struct inaccessible_data *data; + struct address_space *mapping; + struct inode *inode; + struct file *file; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + data->memfd = memfd; + init_rwsem(&data->lock); + INIT_LIST_HEAD(&data->notifiers); + + inode = alloc_anon_inode(inaccessible_mnt->mnt_sb); + if (IS_ERR(inode)) { + kfree(data); + return ERR_CAST(inode); + } + + inode->i_mode |= S_IFREG; + inode->i_op = &inaccessible_iops; + inode->i_mapping->private_data = data; + + file = alloc_file_pseudo(inode, inaccessible_mnt, + "[memfd:inaccessible]", O_RDWR, + &inaccessible_fops); + if (IS_ERR(file)) { + iput(inode); + kfree(data); + } + + mapping = memfd->f_mapping; + mapping_set_unevictable(mapping); + mapping_set_gfp_mask(mapping, + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE); + + return file; +} + +int inaccessible_register_notifier(struct file *file, + struct inaccessible_notifier *notifier) +{ + struct inaccessible_data *data = file->f_mapping->private_data; + + down_write(&data->lock); + list_add(¬ifier->list, &data->notifiers); + up_write(&data->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(inaccessible_register_notifier); + +void inaccessible_unregister_notifier(struct file *file, + struct inaccessible_notifier *notifier) +{ + struct inaccessible_data *data = file->f_mapping->private_data; + + down_write(&data->lock); + list_del_rcu(¬ifier->list); + up_write(&data->lock); +} +EXPORT_SYMBOL_GPL(inaccessible_unregister_notifier); + +int inaccessible_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn, + int *order) +{ + struct inaccessible_data *data = file->f_mapping->private_data; + struct file *memfd = data->memfd; + struct page *page; + int ret; + + down_read(&data->lock); + + ret = shmem_getpage(file_inode(memfd), offset, &page, SGP_WRITE); + if (ret) { + up_read(&data->lock); + return ret; + } + + *pfn = page_to_pfn_t(page); + *order = thp_order(compound_head(page)); + return 0; +} +EXPORT_SYMBOL_GPL(inaccessible_get_pfn); + +void inaccessible_put_pfn(struct file *file, pfn_t pfn) +{ + struct page *page = pfn_t_to_page(pfn); + struct inaccessible_data *data = file->f_mapping->private_data; + + if (WARN_ON_ONCE(!page)) + return; + + SetPageUptodate(page); + unlock_page(page); + put_page(page); + up_read(&data->lock); +} +EXPORT_SYMBOL_GPL(inaccessible_put_pfn);