On Wed, Feb 24, 2021 at 6:14 PM Axel Rasmussen <axelrasmussen@google.com> wrote:
Modify the userfaultfd register API to allow registering shmem VMAs in minor mode. Modify the shmem mcopy implementation to support UFFDIO_CONTINUE in order to resolve such faults.
Combine the shmem mcopy handler functions into a single shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how the hugetlbfs implementation is structured, and lets us remove a good chunk of boilerplate.
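For context, the userspace flow this enables looks roughly like the following (a minimal sketch in the style of the existing hugetlbfs minor fault usage; the uffd/area_dst/area_alias/offset names are placeholders, not from this patch, and error handling is omitted):

	/*
	 * Hypothetical setup: area_dst and area_alias are two mappings of the
	 * same tmpfs/memfd file of "len" bytes, and uffd is a userfaultfd
	 * whose UFFDIO_API handshake included UFFD_FEATURE_MINOR_SHMEM.
	 */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area_dst, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MINOR,
	};

	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/*
	 * On a minor fault (UFFD_EVENT_PAGEFAULT with UFFD_PAGEFAULT_FLAG_MINOR
	 * set, at "offset"): populate the page cache through the unregistered
	 * alias mapping, then install the existing page with UFFDIO_CONTINUE.
	 */
	memcpy(area_alias + offset, src, page_size);

	struct uffdio_continue cont = {
		.range = { .start = (unsigned long)area_dst + offset,
			   .len = page_size },
	};

	ioctl(uffd, UFFDIO_CONTINUE, &cont);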
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
 fs/userfaultfd.c                 |  6 +--
 include/linux/shmem_fs.h         | 26 ++++------
 include/uapi/linux/userfaultfd.h |  4 +-
 mm/memory.c                      |  8 +--
 mm/shmem.c                       | 88 +++++++++++++++-----------------
 mm/userfaultfd.c                 | 27 +++++-----
 6 files changed, 77 insertions(+), 82 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0311e9b8a8fc..aa6d584ae8c7 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 	}
 
 	if (vm_flags & VM_UFFD_MINOR) {
-		/* FIXME: Add minor fault interception for shmem. */
-		if (!is_vm_hugetlb_page(vma))
+		if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
 			return false;
 	}
@@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-	uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+	uffdio_api.features &=
+		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
 #endif
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d82b6f396588..f0919c3722e7 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@
 #include <linux/percpu_counter.h>
 #include <linux/xattr.h>
 #include <linux/fs_parser.h>
+#include <linux/userfaultfd_k.h>
 
 /* inode in-kernel data */
@@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file)
 extern bool shmem_charge(struct inode *inode, long pages);
 extern void shmem_uncharge(struct inode *inode, long pages);
 
+#ifdef CONFIG_USERFAULTFD
 #ifdef CONFIG_SHMEM
-extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
-				  struct vm_area_struct *dst_vma,
-				  unsigned long dst_addr,
-				  unsigned long src_addr,
-				  struct page **pagep);
-extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
-				    pmd_t *dst_pmd,
-				    struct vm_area_struct *dst_vma,
-				    unsigned long dst_addr);
-#else
-#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
-			       src_addr, pagep) ({ BUG(); 0; })
-#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
-				 dst_addr) ({ BUG(); 0; })
-#endif
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+			   struct vm_area_struct *dst_vma,
+			   unsigned long dst_addr, unsigned long src_addr,
+			   enum mcopy_atomic_mode mode, struct page **pagep);
+#else /* !CONFIG_SHMEM */
+#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
+			       src_addr, mode, pagep) ({ BUG(); 0; })
+#endif /* CONFIG_SHMEM */
+#endif /* CONFIG_USERFAULTFD */
 #endif
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index bafbeb1a2624..47d9790d863d 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -31,7 +31,8 @@
 			   UFFD_FEATURE_MISSING_SHMEM |		\
 			   UFFD_FEATURE_SIGBUS |		\
 			   UFFD_FEATURE_THREAD_ID |		\
-			   UFFD_FEATURE_MINOR_HUGETLBFS)
+			   UFFD_FEATURE_MINOR_HUGETLBFS |	\
+			   UFFD_FEATURE_MINOR_SHMEM)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -196,6 +197,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_SIGBUS			(1<<7)
 #define UFFD_FEATURE_THREAD_ID			(1<<8)
 #define UFFD_FEATURE_MINOR_HUGETLBFS		(1<<9)
+#define UFFD_FEATURE_MINOR_SHMEM		(1<<10)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/mm/memory.c b/mm/memory.c
index c8e357627318..a1e5ff55027e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
 	 * something).
 	 */
 	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-		ret = do_fault_around(vmf);
-		if (ret)
-			return ret;
+		if (likely(!userfaultfd_minor(vmf->vma))) {
+			ret = do_fault_around(vmf);
+			if (ret)
+				return ret;
+		}
 	}
 
 	ret = __do_fault(vmf);
diff --git a/mm/shmem.c b/mm/shmem.c
index 06c771d23127..d7847f6f696b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt;
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
 #include <uapi/linux/memfd.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/rmap.h>
 #include <linux/uuid.h>
@@ -1781,8 +1780,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
  * vm. If we swap it in we mark it dirty since we also free the swap
  * entry since a page cannot live in both the swap and page cache.
  *
- * vmf and fault_type are only supplied by shmem_fault:
- * otherwise they are NULL.
+ * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they
+ * are NULL.
  */
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
@@ -1826,6 +1825,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		return error;
 	}
+	if (page && vma && userfaultfd_minor(vma)) {
+		unlock_page(page);
+		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+		return 0;
+	}
+
 	if (page)
 		hindex = page->index;
 	if (page && sgp == SGP_WRITE)
@@ -2350,14 +2355,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 	return inode;
 }
 
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
-				  pmd_t *dst_pmd,
-				  struct vm_area_struct *dst_vma,
-				  unsigned long dst_addr,
-				  unsigned long src_addr,
-				  bool zeropage,
-				  struct page **pagep)
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+			   struct vm_area_struct *dst_vma,
+			   unsigned long dst_addr, unsigned long src_addr,
+			   enum mcopy_atomic_mode mode, struct page **pagep)
 {
+	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct address_space *mapping = inode->i_mapping;
@@ -2374,12 +2377,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	if (!shmem_inode_acct_block(inode, 1))
 		goto out;
 
-	if (!*pagep) {
+	if (is_continue) {
+		ret = -EFAULT;
+		page = find_get_page(mapping, pgoff);
+		if (!page)
+			goto out_unacct_blocks;
+	} else if (!*pagep) {
 		page = shmem_alloc_page(gfp, info, pgoff);
 		if (!page)
 			goto out_unacct_blocks;
 
-		if (!zeropage) {	/* mcopy_atomic */
+		if (mode == MCOPY_ATOMIC_NORMAL) {	/* mcopy_atomic */
 			page_kaddr = kmap_atomic(page);
 			ret = copy_from_user(page_kaddr,
 					     (const void __user *)src_addr,
@@ -2393,7 +2401,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 				/* don't free the page */
 				return -ENOENT;
 			}
-		} else {		/* mfill_zeropage_atomic */
+		} else {		/* zeropage */
 			clear_highpage(page);
 		}
 	} else {
@@ -2401,9 +2409,12 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 		*pagep = NULL;
 	}
 
-	VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+	if (!is_continue)
+		VM_BUG_ON(PageSwapBacked(page));
+	VM_BUG_ON(PageLocked(page));
 	__SetPageLocked(page);
-	__SetPageSwapBacked(page);
+	if (!is_continue || !PageSwapBacked(page))
+		__SetPageSwapBacked(page);
 	__SetPageUptodate(page);
Apologies, I was testing more scenarios today and discovered that this doesn't work when the shmem is backed by a tmpfs file mounted with the huge=always option. I think it's more correct to look up the page with find_lock_page(), and then wrap *all* of the page flag fiddling here in an "if (!is_continue) {" block. I'll send a v2 with this fix next week.
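In other words, something along these lines (an untested sketch of the idea, not the actual v2):

	if (is_continue) {
		ret = -EFAULT;
		/* Look the page up already locked, instead of
		 * find_get_page() + __SetPageLocked() later. */
		page = find_lock_page(mapping, pgoff);
		if (!page)
			goto out_unacct_blocks;
	} else if (!*pagep) {

and then further down:

	/* A page found in the page cache is already locked, swap-backed, and
	 * uptodate; only a freshly allocated page needs this setup. */
	if (!is_continue) {
		VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
		__SetPageLocked(page);
		__SetPageSwapBacked(page);
		__SetPageUptodate(page);
	}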
 	ret = -EFAULT;
@@ -2412,10 +2423,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	if (unlikely(offset >= max_off))
 		goto out_release;
 
-	ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
-				      gfp & GFP_RECLAIM_MASK, dst_mm);
-	if (ret)
-		goto out_release;
+	/* If page wasn't already in the page cache, add it. */
+	if (!is_continue) {
+		ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+					      gfp & GFP_RECLAIM_MASK, dst_mm);
+		if (ret)
+			goto out_release;
+	}
 
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
 	if (dst_vma->vm_flags & VM_WRITE)
@@ -2442,13 +2456,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	if (!pte_none(*dst_pte))
 		goto out_release_unlock;
 
-	lru_cache_add(page);
+	if (!is_continue) {
+		lru_cache_add(page);
 
-	spin_lock_irq(&info->lock);
-	info->alloced++;
-	inode->i_blocks += BLOCKS_PER_PAGE;
-	shmem_recalc_inode(inode);
-	spin_unlock_irq(&info->lock);
+		spin_lock_irq(&info->lock);
+		info->alloced++;
+		inode->i_blocks += BLOCKS_PER_PAGE;
+		shmem_recalc_inode(inode);
+		spin_unlock_irq(&info->lock);
+	}
 
 	inc_mm_counter(dst_mm, mm_counter_file(page));
 	page_add_file_rmap(page, false);
@@ -2473,28 +2489,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	goto out;
 }
 
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
-			   pmd_t *dst_pmd,
-			   struct vm_area_struct *dst_vma,
-			   unsigned long dst_addr,
-			   unsigned long src_addr,
-			   struct page **pagep)
-{
-	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
-				      dst_addr, src_addr, false, pagep);
-}
-
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
-			     pmd_t *dst_pmd,
-			     struct vm_area_struct *dst_vma,
-			     unsigned long dst_addr)
-{
-	struct page *page = NULL;
-
-	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
-				      dst_addr, 0, true, &page);
-}
 #ifdef CONFIG_TMPFS
 static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_short_symlink_operations;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index ce6cb4760d2c..6cd7ab531aec 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 					    unsigned long dst_addr,
 					    unsigned long src_addr,
 					    struct page **page,
-					    bool zeropage,
+					    enum mcopy_atomic_mode mode,
 					    bool wp_copy)
 {
 	ssize_t err;
@@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 	 * and not in the radix tree.
 	 */
 	if (!(dst_vma->vm_flags & VM_SHARED)) {
-		if (!zeropage)
+		switch (mode) {
+		case MCOPY_ATOMIC_NORMAL:
 			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
 					       dst_addr, src_addr, page,
 					       wp_copy);
-		else
+			break;
+		case MCOPY_ATOMIC_ZEROPAGE:
 			err = mfill_zeropage_pte(dst_mm, dst_pmd,
 						 dst_vma, dst_addr);
+			break;
+		case MCOPY_ATOMIC_CONTINUE:
+			err = -EINVAL;
+			break;
+		}
 	} else {
 		VM_WARN_ON_ONCE(wp_copy);
-		if (!zeropage)
-			err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
-						     dst_vma, dst_addr,
-						     src_addr, page);
-		else
-			err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
-						       dst_vma, dst_addr);
+		err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+					     src_addr, mode, page);
 	}
 
 	return err;
@@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	long copied;
 	struct page *page;
 	bool wp_copy;
-	bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
 
 	/*
 	 * Sanitize the command parameters:
@@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
 
-	if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+	if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
 		goto out_unlock;
 
 	/*
@@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
 		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-				       src_addr, &page, zeropage, wp_copy);
+				       src_addr, &page, mcopy_mode, wp_copy);
 		cond_resched();
 
 		if (unlikely(err == -ENOENT)) {
-- 
2.30.0.617.g56c4b15f3c-goog