On Tue, 6 Jun 2023 at 08:08, Muhammad Usama Anjum usama.anjum@collabora.com wrote:
This IOCTL, PAGEMAP_SCAN on pagemap file can be used to get and/or clear the info about page table entries. The following operations are supported in this ioctl:
- Get the information if the pages have been written-to (PAGE_IS_WRITTEN), file mapped (PAGE_IS_FILE), present (PAGE_IS_PRESENT) or swapped (PAGE_IS_SWAPPED).
- Find pages which have been written-to and/or write protect the pages (atomic PM_SCAN_OP_GET + PM_SCAN_OP_WP)
This IOCTL can be extended to get information about more PTE bits.
[...]
--- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c
[...]
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline bool is_pmd_uffd_wp(pmd_t pmd) +{
return (pmd_present(pmd) && pmd_uffd_wp(pmd)) ||(is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd));+}
[...]
+#ifdef CONFIG_HUGETLB_PAGE +static inline bool is_huge_pte_uffd_wp(pte_t pte) +{
return ((pte_present(pte) && huge_pte_uffd_wp(pte)) ||pte_swp_uffd_wp_any(pte));
Nit: please remove the outer parentheses (it is already done for similar finctuons above).
+}
+static inline bool pagemap_scan_check_page_written(struct pagemap_scan_private *p) +{
return (p->required_mask | p->anyof_mask | p->excluded_mask) &PAGE_IS_WRITTEN;+}
This could be precalculated and put as a flag into pagemap_scan_private - it is kernel-private structure and there are a few spare bits in `flags` if you'd prefer not to add an explicit boolean.
[...]
+static int pagemap_scan_output(bool wt, bool file, bool pres, bool swap,
struct pagemap_scan_private *p,unsigned long addr, unsigned int n_pages)+{
unsigned long bitmap = PM_SCAN_BITMAP(wt, file, pres, swap);struct page_region *cur = &p->cur;if (!n_pages)return -EINVAL;if ((p->required_mask & bitmap) != p->required_mask)return 0;if (p->anyof_mask && !(p->anyof_mask & bitmap))return 0;if (p->excluded_mask & bitmap)return 0;bitmap &= p->return_mask;if (!bitmap)return 0;if (cur->bitmap == bitmap &&cur->start + cur->len * PAGE_SIZE == addr) {cur->len += n_pages;p->found_pages += n_pages;} else {/** All data is copied to cur first. When more data is found, we* push cur to vec and copy new data to cur. The vec_index* represents the current index of vec array. We add 1 to the* vec_index while performing checks to account for data in cur.*/if (cur->len && (p->vec_index + 1) >= p->vec_len)return -ENOSPC;if (cur->len) {memcpy(&p->vec[p->vec_index], cur, sizeof(*p->vec));p->vec_index++;}cur->start = addr;cur->len = n_pages;cur->bitmap = bitmap;p->found_pages += n_pages;}if (p->max_pages && (p->found_pages == p->max_pages))return PM_SCAN_FOUND_MAX_PAGES;return 0;+}
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)+{
struct pagemap_scan_private *p = walk->private;struct vm_area_struct *vma = walk->vma;unsigned long addr = end;pte_t *pte, *orig_pte;spinlock_t *ptl;bool is_written;int ret = 0;arch_enter_lazy_mmu_mode();+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
ptl = pmd_trans_huge_lock(pmd, vma);if (ptl) {unsigned long n_pages = (end - start)/PAGE_SIZE;if (p->max_pages && n_pages > p->max_pages - p->found_pages)n_pages = p->max_pages - p->found_pages;
Since p->found_pages is only ever increased in `pagemap_scan_output()` and that function is only called for GET or GET+WP operations, maybe the logic could be folded to pagemap_scan_output() to avoid duplication? In this function the calculation is used only when WP op is done to split the HP if n_pages limit would be hit, but if using plain WP (without GET) it doesn't make sense to use the limit. (pagemap_scan_output() is trivial enough so I think it could be pulled inside the spinlocked region.)
is_written = !is_pmd_uffd_wp(*pmd);/** Break huge page into small pages if the WP operation need to* be performed is on a portion of the huge page.*/if (is_written && IS_PM_SCAN_WP(p->flags) &&n_pages < HPAGE_SIZE/PAGE_SIZE) {spin_unlock(ptl);split_huge_pmd(vma, pmd, start);goto process_smaller_pages;}if (IS_PM_SCAN_GET(p->flags))ret = pagemap_scan_output(is_written, vma->vm_file,pmd_present(*pmd),is_swap_pmd(*pmd),p, start, n_pages);if (ret >= 0 && is_written && IS_PM_SCAN_WP(p->flags))make_uffd_wp_pmd(vma, addr, pmd);if (IS_PM_SCAN_WP(p->flags))
Why `is_written` is not checked? If is_written is false, then the WP op should be a no-op and so won't need TLB flushing, will it? [Same for the PTE case below.]
flush_tlb_range(vma, start, end);
[...]
if (IS_PM_SCAN_WP(p->flags))flush_tlb_range(vma, start, addr);pte_unmap_unlock(orig_pte, ptl);arch_leave_lazy_mmu_mode();cond_resched();return ret;+}
+#ifdef CONFIG_HUGETLB_PAGE +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
unsigned long start, unsigned long end,struct mm_walk *walk)+{
unsigned long n_pages = (end - start)/PAGE_SIZE;struct pagemap_scan_private *p = walk->private;struct vm_area_struct *vma = walk->vma;struct hstate *h = hstate_vma(vma);spinlock_t *ptl;bool is_written;int ret = 0;pte_t pte;if (p->max_pages && n_pages > p->max_pages - p->found_pages)n_pages = p->max_pages - p->found_pages;if (IS_PM_SCAN_WP(p->flags)) {i_mmap_lock_write(vma->vm_file->f_mapping);ptl = huge_pte_lock(h, vma->vm_mm, ptep);}pte = huge_ptep_get(ptep);is_written = !is_huge_pte_uffd_wp(pte);/** Partial hugetlb page clear isn't supported*/if (is_written && IS_PM_SCAN_WP(p->flags) &&n_pages < HPAGE_SIZE/PAGE_SIZE) {ret = -EPERM;
Shouldn't this be ENOSPC, conveying that the operation would overflow the n_pages limit?
goto unlock_and_return;}if (IS_PM_SCAN_GET(p->flags)) {ret = pagemap_scan_output(is_written, vma->vm_file,pte_present(pte), is_swap_pte(pte),p, start, n_pages);if (ret < 0)goto unlock_and_return;}if (is_written && IS_PM_SCAN_WP(p->flags)) {
Oh, this case does check `is_written` before flushing TLB, contrary to what the cases above do.
make_uffd_wp_huge_pte(vma, start, ptep, pte);flush_hugetlb_tlb_range(vma, start, end);}+unlock_and_return:
if (IS_PM_SCAN_WP(p->flags)) {spin_unlock(ptl);i_mmap_unlock_write(vma->vm_file->f_mapping);}return ret;+} +#else +#define pagemap_scan_hugetlb_entry NULL +#endif
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
int depth, struct mm_walk *walk)+{
unsigned long n_pages = (end - addr)/PAGE_SIZE;struct pagemap_scan_private *p = walk->private;struct vm_area_struct *vma = walk->vma;int ret = 0;if (!vma || !IS_PM_SCAN_GET(p->flags))return 0;if (p->max_pages && n_pages > p->max_pages - p->found_pages)n_pages = p->max_pages - p->found_pages;
Nit: If the page flags don't match (wouldn't be output), the limit would not be hit and the calculation is unnecessary. But if it was done in pagemap_scan_output() instead after all the flags checks...
ret = pagemap_scan_output(false, vma->vm_file, false, false, p, addr,n_pages);return ret;+}
[...]
+static long do_pagemap_scan(struct mm_struct *mm,
struct pm_scan_arg __user *uarg)+{
unsigned long start, end, walk_start, walk_end;unsigned long empty_slots, vec_index = 0;struct mmu_notifier_range range;struct page_region __user *vec;struct pagemap_scan_private p;struct pm_scan_arg arg;int ret = 0;if (copy_from_user(&arg, uarg, sizeof(arg)))return -EFAULT;start = untagged_addr((unsigned long)arg.start);vec = (struct page_region *)untagged_addr((unsigned long)arg.vec);ret = pagemap_scan_args_valid(&arg, start, vec);if (ret)return ret;end = start + arg.len;p.max_pages = arg.max_pages;p.found_pages = 0;p.flags = arg.flags;p.required_mask = arg.required_mask;p.anyof_mask = arg.anyof_mask;p.excluded_mask = arg.excluded_mask;p.return_mask = arg.return_mask;p.cur.start = p.cur.len = p.cur.bitmap = 0;p.vec = NULL;p.vec_len = PAGEMAP_WALK_SIZE >> PAGE_SHIFT;
If p.vec_len would not count the entry held in `cur` (IOW: vec_len = WALK_SIZE - 1), then pagemap_scan_output() wouldn't need the big comment about adding or subtracting 1 when checking for overflow. The output vector needs to have space for at least one entrry to make GET useful. Maybe `cur` could be renamed or annotated to express that it always holds the last entry?
/** Allocate smaller buffer to get output from inside the page walk* functions and walk page range in PAGEMAP_WALK_SIZE size chunks. As* we want to return output to user in compact form where no two* consecutive regions should be continuous and have the same flags.* So store the latest element in p.cur between different walks and* store the p.cur at the end of the walk to the user buffer.*/if (IS_PM_SCAN_GET(p.flags)) {p.vec = kmalloc_array(p.vec_len, sizeof(*p.vec), GFP_KERNEL);if (!p.vec)return -ENOMEM;}if (IS_PM_SCAN_WP(p.flags)) {mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,mm, start, end);mmu_notifier_invalidate_range_start(&range);}walk_start = walk_end = start;while (walk_end < end && !ret) {if (IS_PM_SCAN_GET(p.flags)) {p.vec_index = 0;empty_slots = arg.vec_len - vec_index;
Can `empty_slots` be zero here? I don't see anything prohibiting this case.
p.vec_len = min(p.vec_len, empty_slots);
( If not counting `cur`, it would be min(p.vec_len, empty_slots - 1); )
}walk_end = (walk_start + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;if (walk_end > end)walk_end = end;ret = mmap_read_lock_killable(mm);if (ret)goto free_data;ret = walk_page_range(mm, walk_start, walk_end,&pagemap_scan_ops, &p);mmap_read_unlock(mm);if (ret && ret != -ENOSPC && ret != PM_SCAN_FOUND_MAX_PAGES)goto free_data;walk_start = walk_end;if (IS_PM_SCAN_GET(p.flags) && p.vec_index) {if (copy_to_user(&vec[vec_index], p.vec,p.vec_index * sizeof(*p.vec))) {/** Return error even though the OP succeeded*/ret = -EFAULT;goto free_data;}vec_index += p.vec_index;}}if (IS_PM_SCAN_GET(p.flags) && p.cur.len) {
Nit: p.cur.len can be non-zero only if we do a GET (or GET+WP) operation.
if (copy_to_user(&vec[vec_index], &p.cur, sizeof(*p.vec))) {
Nit: sizeof(*p.cur); (even though this is the same type)
ret = -EFAULT;goto free_data;}vec_index++;}ret = vec_index;+free_data:
if (IS_PM_SCAN_WP(p.flags))mmu_notifier_invalidate_range_end(&range);kfree(p.vec);return ret;+}
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
unsigned long arg)+{
struct pm_scan_arg __user *uarg = (struct pm_scan_arg __user *)arg;
The cast should be in do_pagemap_scan() as if there comes another `cmd`, then it might use a different argument type.
struct mm_struct *mm = file->private_data;switch (cmd) {case PAGEMAP_SCAN:return do_pagemap_scan(mm, uarg);default:return -EINVAL;}+}
Best Regards Michał Mirosław