On 8/3/23 8:08 PM, Andrei Vagin wrote:
On Thu, Jul 27, 2023 at 02:36:34PM +0500, Muhammad Usama Anjum wrote:
<snip>
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
unsigned long addr, unsigned long end,
unsigned long end_addr)
It's hard to figure out the difference between end and end_addr. I would add a comment here.
I'll fix these in the next version; a possible wording for such a comment is sketched after this function.
+{
- struct page_region *cur_buf = &p->cur_buf;
- if (cur_buf->start != addr)
cur_buf->end = addr;
- else
cur_buf->start = cur_buf->end = 0;
- p->end_addr = end_addr;
- p->found_pages -= (end - addr) / PAGE_SIZE;
+}
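
For reference, one possible wording, inferred from the two call sites quoted later in this patch (the exact meaning of end_addr is for the author to confirm):

/*
 * pagemap_scan_backout_range - retract [addr, end) from the current
 *	output range
 * @p:        scan state
 * @addr:     first address being backed out
 * @end:      end of the backed-out range (exclusive); found_pages is
 *            reduced by the number of pages in [addr, end)
 * @end_addr: value stored in p->end_addr and later reported to userspace
 *            as walk_end; pass 0 to fall back to the default
 *            (the start of the last walked chunk)
 */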
+static int pagemap_scan_output(unsigned long categories,
struct pagemap_scan_private *p,
unsigned long addr, unsigned long *end)
+{
- unsigned long n_pages, total_pages;
- int ret = 0;
- if (!pagemap_scan_is_interesting_page(categories, p)) {
*end = addr;
return 0;
- }
- if (!p->vec_buf)
return 0;
- categories &= p->arg.return_mask;
- n_pages = (*end - addr) / PAGE_SIZE;
- if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
total_pages > p->arg.max_pages) {
Why do we need to use check_add_overflow() here?
size_t n_too_much = total_pages - p->arg.max_pages;
It is unsafe to use total_pages if check_add_overflow() returns non-zero.
*end -= n_too_much * PAGE_SIZE;
n_pages -= n_too_much;
ret = -ENOSPC;
- }
- if (!pagemap_scan_push_range(categories, p, addr, *end)) {
*end = addr;
n_pages = 0;
ret = -ENOSPC;
- }
- p->found_pages += n_pages;
- if (ret)
p->end_addr = *end;
- return ret;
+}
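
On the check_add_overflow() question above: one way to sidestep both the overflow and the use of total_pages after a failed check is to compare against the remaining page budget instead. A minimal sketch, not the submitted code, assuming the invariant p->found_pages <= p->arg.max_pages holds on entry:

	size_t remaining = p->arg.max_pages - p->found_pages;

	/* Clamp the output to the caller-supplied page budget. */
	if (n_pages > remaining) {
		size_t n_too_much = n_pages - remaining;

		*end -= n_too_much * PAGE_SIZE;
		n_pages = remaining;
		ret = -ENOSPC;
	}

This also drops the total_pages local entirely.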
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- unsigned long categories;
- spinlock_t *ptl;
- int ret = 0;
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (!ptl)
return -ENOENT;
- categories = p->cur_vma_category | pagemap_thp_category(*pmd);
- ret = pagemap_scan_output(categories, p, start, &end);
- if (start == end)
goto out_unlock;
- if (~p->arg.flags & PM_SCAN_WP_MATCHING)
goto out_unlock;
- if (~categories & PAGE_IS_WRITTEN)
goto out_unlock;
- /*
* Break the huge page into small pages if the WP operation
* needs to be performed on only a portion of the huge page.
*/
- if (end != start + HPAGE_SIZE) {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, start);
pagemap_scan_backout_range(p, start, end, 0);
pagemap_scan_backout_range() looks "weird"... IMHO, it makes the code harder to understand.
return -ENOENT;
I think you need to add a comment that this ENOENT is a special case.
- }
- make_uffd_wp_pmd(vma, start, pmd);
- flush_tlb_range(vma, start, end);
+out_unlock:
- spin_unlock(ptl);
- return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
- return -ENOENT;
+#endif
+}
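
On the -ENOENT remark above, a possible wording for such a comment (only a suggestion, based on how pagemap_scan_pmd_entry() consumes the return value):

		/*
		 * -ENOENT is not reported to userspace: it tells
		 * pagemap_scan_pmd_entry() to fall back to the per-PTE
		 * loop for this range, now that the huge PMD has been
		 * split (the same value is returned when no huge PMD is
		 * mapped or THP is not configured in).
		 */
		return -ENOENT;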
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
+{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- pte_t *pte, *start_pte;
- unsigned long addr;
- bool flush = false;
- spinlock_t *ptl;
- int ret;
- arch_enter_lazy_mmu_mode();
- ret = pagemap_scan_thp_entry(pmd, start, end, walk);
- if (ret != -ENOENT) {
arch_leave_lazy_mmu_mode();
return ret;
- }
- start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
- if (!pte) {
arch_leave_lazy_mmu_mode();
walk->action = ACTION_AGAIN;
return 0;
- }
- for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
unsigned long categories = p->cur_vma_category |
pagemap_page_category(vma, addr, ptep_get(pte));
unsigned long next = addr + PAGE_SIZE;
ret = pagemap_scan_output(categories, p, addr, &next);
if (next == addr) {
if (!ret)
continue;
break;
}
if (~p->arg.flags & PM_SCAN_WP_MATCHING)
continue;
if (~categories & PAGE_IS_WRITTEN)
continue;
make_uffd_wp_pte(vma, addr, pte);
if (!flush) {
start = addr;
flush = true;
}
- }
- if (flush)
flush_tlb_range(vma, start, addr);
- pte_unmap_unlock(start_pte, ptl);
- arch_leave_lazy_mmu_mode();
- cond_resched();
- return ret;
+}
+#ifdef CONFIG_HUGETLB_PAGE
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
unsigned long start, unsigned long end,
struct mm_walk *walk)
+{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- unsigned long categories;
- spinlock_t *ptl;
- int ret = 0;
- pte_t pte;
- if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
/* Go the short route when not write-protecting pages. */
pte = huge_ptep_get(ptep);
categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
return pagemap_scan_output(categories, p, start, &end);
- }
- i_mmap_lock_write(vma->vm_file->f_mapping);
- ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
- pte = huge_ptep_get(ptep);
- categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
- ret = pagemap_scan_output(categories, p, start, &end);
- if (start == end)
goto out_unlock;
- if (~categories & PAGE_IS_WRITTEN)
goto out_unlock;
- if (end != start + HPAGE_SIZE) {
/* Partial HugeTLB page WP isn't possible. */
pagemap_scan_backout_range(p, start, end, start);
ret = -EINVAL;
Why is it EINVAL in this case?
goto out_unlock;
- }
- make_uffd_wp_huge_pte(vma, start, ptep, pte);
- flush_hugetlb_tlb_range(vma, start, end);
+out_unlock:
- spin_unlock(ptl);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
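
Regarding the -EINVAL question above: if that errno is kept, a short comment might help future readers, along these lines (wording is only a suggestion; whether -EINVAL is the right errno is for the author to answer):

		/*
		 * Only whole HugeTLB pages can be write-protected; a scan
		 * that would require WP-ing part of one cannot be honoured,
		 * so treat the request as invalid.
		 */
		ret = -EINVAL;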
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
int depth, struct mm_walk *walk)
+{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- int ret, err;
- if (!vma)
return 0;
- ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
- if (addr == end)
return ret;
- if (~p->arg.flags & PM_SCAN_WP_MATCHING)
return ret;
- err = uffd_wp_range(vma, addr, end - addr, true);
- if (err < 0)
ret = err;
- return ret;
+}
+static const struct mm_walk_ops pagemap_scan_ops = {
- .test_walk = pagemap_scan_test_walk,
- .pmd_entry = pagemap_scan_pmd_entry,
- .pte_hole = pagemap_scan_pte_hole,
- .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
unsigned long uarg)
+{
- if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
return -EFAULT;
- if (arg->size != sizeof(struct pm_scan_arg))
return -EINVAL;
- /* Validate requested features */
- if (arg->flags & ~PM_SCAN_FLAGS)
return -EINVAL;
- if ((arg->category_inverted | arg->category_mask |
arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
return -EINVAL;
- arg->start = untagged_addr((unsigned long)arg->start);
- arg->end = untagged_addr((unsigned long)arg->end);
- arg->vec = untagged_addr((unsigned long)arg->vec);
- /* Validate memory pointers */
- if (!IS_ALIGNED(arg->start, PAGE_SIZE))
return -EINVAL;
- if (!access_ok((void __user *)arg->start, arg->end - arg->start))
return -EFAULT;
- if (!arg->vec && arg->vec_len)
return -EFAULT;
- if (arg->vec && !access_ok((void __user *)arg->vec,
arg->vec_len * sizeof(struct page_region)))
return -EFAULT;
- /* Fixup default values */
- arg->end = ALIGN(arg->end, PAGE_SIZE);
- if (!arg->max_pages)
arg->max_pages = ULONG_MAX;
- return 0;
+}
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
unsigned long uargl)
+{
- struct pm_scan_arg __user *uarg = (void __user *)uargl;
- if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
return -EFAULT;
- return 0;
+}
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
- if (!p->arg.vec_len) {
/*
* An arbitrary non-page-aligned sentinel value for
* pagemap_scan_push_range().
*/
p->cur_buf.start = p->cur_buf.end = ULLONG_MAX;
if (p->arg.vec)
p->vec_buf = ZERO_SIZE_PTR;
return 0;
- }
- /*
* Allocate a smaller buffer to get output from inside the page
* walk functions and walk the range in PAGEMAP_WALK_SIZE chunks.
* The last range is always stored in p.cur_buf to allow coalescing
* consecutive ranges that have the same categories returned across
* walk_page_range() calls.
*/
- p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
p->arg.vec_len - 1);
- p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
GFP_KERNEL);
- if (!p->vec_buf)
return -ENOMEM;
- p->vec_out = (struct page_region __user *)p->arg.vec;
- return 0;
+}
+static int pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
- const struct page_region *buf = p->vec_buf;
- int n = (int)p->vec_buf_index;
- if (!n)
return 0;
- if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
return -EFAULT;
- p->arg.vec_len -= n;
- p->vec_out += n;
- p->vec_buf_index = 0;
- p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len - 1);
- return n;
+}
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
- unsigned long walk_start, walk_end;
- struct mmu_notifier_range range;
- struct pagemap_scan_private p;
- size_t n_ranges_out = 0;
- int ret;
- memset(&p, 0, sizeof(p));
- ret = pagemap_scan_get_args(&p.arg, uarg);
- if (ret)
return ret;
- ret = pagemap_scan_init_bounce_buffer(&p);
- if (ret)
return ret;
- /* Protection change for the range is going to happen. */
- if (p.arg.flags & PM_SCAN_WP_MATCHING) {
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
mm, p.arg.start, p.arg.end);
mmu_notifier_invalidate_range_start(&range);
- }
- walk_start = walk_end = p.arg.start;
- for (; walk_end != p.arg.end; walk_start = walk_end) {
int n_out;
walk_end = min_t(unsigned long,
(walk_start + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK,
p.arg.end);
if (fatal_signal_pending(current)) {
	ret = -EINTR;
	break;
}
ret = mmap_read_lock_killable(mm);
if (ret)
break;
ret = walk_page_range(mm, walk_start, walk_end,
&pagemap_scan_ops, &p);
mmap_read_unlock(mm);
n_out = pagemap_scan_flush_buffer(&p);
if (n_out < 0)
ret = n_out;
else
n_ranges_out += n_out;
if (ret)
break;
- }
- if (p.cur_buf.start != p.cur_buf.end) {
if (copy_to_user(p.vec_out, &p.cur_buf, sizeof(p.cur_buf)))
ret = -EFAULT;
else
++n_ranges_out;
- }
- /* ENOSPC signifies early stop (buffer full) from the walk. */
- if (!ret || ret == -ENOSPC)
ret = n_ranges_out;
- p.arg.walk_end = p.end_addr ? p.end_addr : walk_start;
- if (pagemap_scan_writeback_args(&p.arg, uarg))
ret = -EFAULT;
- if (p.arg.flags & PM_SCAN_WP_MATCHING)
mmu_notifier_invalidate_range_end(&range);
- kfree(p.vec_buf);
- return ret;
+}
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
unsigned long arg)
+{
- struct mm_struct *mm = file->private_data;
- switch (cmd) {
- case PAGEMAP_SCAN:
return do_pagemap_scan(mm, arg);
- default:
return -EINVAL;
- }
+}
const struct file_operations proc_pagemap_operations = {
	.llseek		= mem_lseek, /* borrow this */
	.read		= pagemap_read,
	.open		= pagemap_open,
	.release	= pagemap_release,
- .unlocked_ioctl = do_pagemap_cmd,
- .compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0a393bc02f25b..8f8ff07453f22 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -259,6 +259,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 			       unsigned long cp_flags);
 bool is_hugetlb_entry_migration(pte_t pte);
+bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);

 #else /* !CONFIG_HUGETLB_PAGE */

diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c5..1bb3c625c2381 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,62 @@ typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
			 RWF_APPEND)

+/* Pagemap ioctl */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)
+/* Bits are set in flags of the page_region and masks in pm_scan_args */
+#define PAGE_IS_WPALLOWED	(1 << 0)
+#define PAGE_IS_WRITTEN		(1 << 1)
+#define PAGE_IS_FILE		(1 << 2)
+#define PAGE_IS_PRESENT		(1 << 3)
+#define PAGE_IS_SWAPPED		(1 << 4)
+#define PAGE_IS_PFNZERO		(1 << 5)
+/*
- struct page_region - Page region with flags
- @start: Start of the region
- @end: End of the region (exclusive)
- @categories: PAGE_IS_* category bitmask for the region
- */
+struct page_region {
- __u64 start;
- __u64 end;
- __u64 categories;
+};
+/* Flags for PAGEMAP_SCAN ioctl */
+#define PM_SCAN_WP_MATCHING	(1 << 0)	/* Write protect the pages matched. */
+#define PM_SCAN_CHECK_WPASYNC	(1 << 1)	/* Abort the scan when a non-WP-enabled page is found. */
+/*
- struct pm_scan_arg - Pagemap ioctl argument
- @size: Size of the structure
- @flags: Flags for the IOCTL
- @start: Starting address of the region
- @end: Ending address of the region
- @walk_end: Ending address of the visited memory is returned
(This helps if the entire range hasn't been visited)
- @vec: Address of page_region struct array for output
- @vec_len: Length of the page_region struct array
- @max_pages: Optional limit for number of returned pages (0 = disabled)
- @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1
- @category_mask: Skip pages for which any category doesn't match
- @category_anyof_mask: Skip pages for which no category matches
- @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned
- */
+struct pm_scan_arg {
- __u64 size;
- __u64 flags;
- __u64 start;
- __u64 end;
- __u64 walk_end;
- __u64 vec;
- __u64 vec_len;
- __u64 max_pages;
- __u64 category_inverted;
- __u64 category_mask;
- __u64 category_anyof_mask;
- __u64 return_mask;
+};
 #endif /* _UAPI_LINUX_FS_H */

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a073e6ed8900b..3b07db0a4f2d9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5008,7 +5008,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
 	return false;
 }

-static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
+bool is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
--
2.39.2
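
As an aside, for anyone following the new uapi above, here is a minimal userspace sketch of driving PAGEMAP_SCAN to list the currently present pages of a private mapping. It assumes a kernel with this series applied and the definitions above exported through <linux/fs.h>, assumes 4 KiB pages, and keeps error handling to the bare minimum:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/types.h>
#include <linux/fs.h>	/* PAGEMAP_SCAN, struct pm_scan_arg, PAGE_IS_* (with this series) */

int main(void)
{
	struct page_region regions[16];
	struct pm_scan_arg arg;
	size_t len = 16 * 4096;	/* assumes 4 KiB pages */
	char *buf;
	int fd, n, i;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	buf[0] = 1;	/* fault in the first page only */

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;

	memset(&arg, 0, sizeof(arg));
	arg.size = sizeof(arg);
	arg.start = (__u64)(unsigned long)buf;
	arg.end = arg.start + len;
	arg.vec = (__u64)(unsigned long)regions;
	arg.vec_len = sizeof(regions) / sizeof(regions[0]);
	arg.category_mask = PAGE_IS_PRESENT;	/* only report present pages */
	arg.return_mask = PAGE_IS_PRESENT;

	/* On success the ioctl returns the number of page_region entries filled. */
	n = ioctl(fd, PAGEMAP_SCAN, &arg);
	if (n < 0) {
		perror("PAGEMAP_SCAN");
		return 1;
	}

	for (i = 0; i < n; i++)
		printf("present: 0x%llx-0x%llx\n",
		       (unsigned long long)regions[i].start,
		       (unsigned long long)regions[i].end);

	close(fd);
	return 0;
}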