Hi Muhammad,
Here are a few inline comments.
On Wed, Nov 09, 2022 at 03:23:02PM +0500, Muhammad Usama Anjum wrote:
This IOCTL, PAGEMAP_SCAN can be used to get and/or clear the info about page table entries. The following operations are supported in this ioctl:
- Get the information if the pages are soft-dirty, file mapped, present or swapped.
- Clear the soft-dirty PTE bit of the pages.
- Get and clear the soft-dirty PTE bit of the pages.
Only the soft-dirty bit can be read and cleared atomically. struct pagemap_sd_args is used as the argument of the IOCTL. In this struct:
- The range is specified through start and len.
- The output buffer and size is specified as vec and vec_len.
- The optional maximum requested pages are specified in the max_pages.
- The flags can be specified in the flags field. The PAGEMAP_SD_CLEAR and PAGEMAP_SD_NO_REUSED_REGIONS are supported.
- The masks are specified in rmask, amask, emask and return_mask.
This IOCTL can be extended to get information about more PTE bits.
This is based on a patch from Gabriel Krisman Bertazi.
Signed-off-by: Muhammad Usama Anjum usama.anjum@collabora.com
Changes in v6:
- Rename variables and update comments
- Make IOCTL independent of soft_dirty config
- Change masks and bitmap type to _u64
- Improve code quality
Changes in v5:
- Remove tlb flushing even for clear operation
Changes in v4:
- Update the interface and implementation
Changes in v3:
- Tighten the user-kernel interface by using explicit types and add more error checking
Changes in v2:
- Convert the interface from syscall to ioctl
- Remove pidfd support as it doesn't make sense in ioctl
fs/proc/task_mmu.c | 328 ++++++++++++++++++++++++++++++++++ include/uapi/linux/fs.h | 56 ++++++ tools/include/uapi/linux/fs.h | 56 ++++++ 3 files changed, 440 insertions(+)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8235c536ac70..8d6a84ec5ef7 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -19,6 +19,9 @@ #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> +#include <uapi/linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/minmax.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -1775,11 +1778,336 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#define PAGEMAP_OP_MASK (PAGE_IS_SOFTDIRTY | PAGE_IS_FILE | \
PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
+#define PAGEMAP_NONSD_OP_MASK (PAGE_IS_FILE | PAGE_IS_PRESENT | PAGE_IS_SWAPPED) +#define PAGEMAP_SD_FLAGS (PAGEMAP_SOFTDIRTY_CLEAR | PAGEMAP_NO_REUSED_REGIONS) +#define IS_CLEAR_OP(a) (a->flags & PAGEMAP_SOFTDIRTY_CLEAR) +#define IS_GET_OP(a) (a->vec) +#define IS_SD_OP(a) (a->flags & PAGEMAP_SD_FLAGS)
+struct pagemap_scan_private {
- struct page_region *vec;
- unsigned long vec_len;
- unsigned long vec_index;
- unsigned int max_pages;
- unsigned int found_pages;
- unsigned int flags;
- unsigned long required_mask;
- unsigned long anyof_mask;
- unsigned long excluded_mask;
- unsigned long return_mask;
+};
+static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) +{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- if (IS_GET_OP(p) && p->max_pages && (p->found_pages == p->max_pages))
return -1;
- if (vma->vm_flags & VM_PFNMAP)
return 1;
- return 0;
+}
+static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
unsigned long addr, unsigned int len)
+{
- unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;
Should we define constants for each of these bits?
- bool cpy = true;
- if (p->required_mask)
cpy = ((p->required_mask & cur) == p->required_mask);
- if (cpy && p->anyof_mask)
cpy = (p->anyof_mask & cur);
- if (cpy && p->excluded_mask)
cpy = !(p->excluded_mask & cur);
- bitmap = cur & p->return_mask;
- if (cpy && bitmap) {
if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
(p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
addr)) {
I think it is better to define a local variable for p->vec_index - 1. Nit: len could be expressed in bytes rather than pages.
p->vec[p->vec_index - 1].len += len;
p->found_pages += len;
} else if (p->vec_index < p->vec_len) {
p->vec[p->vec_index].start = addr;
p->vec[p->vec_index].len = len;
p->found_pages += len;
p->vec[p->vec_index].bitmap = bitmap;
p->vec_index++;
} else {
return -ENOMEM;
}
- }
- return 0;
+}
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
+{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- unsigned int len;
- spinlock_t *ptl;
- int ret = 0;
- pte_t *pte;
- bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
(false) : (vma->vm_flags & VM_SOFTDIRTY);
- if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
return 0;
- end = min(end, walk->vma->vm_end);
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
/*
* Break huge page into small pages if operation needs to be performed is
* on a portion of the huge page or the return buffer cannot store complete
* data.
*/
if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, addr);
goto process_smaller_pages;
}
if (IS_GET_OP(p)) {
len = (end - addr)/PAGE_SIZE;
if (p->max_pages && p->found_pages + len > p->max_pages)
len = p->max_pages - p->found_pages;
ret = add_to_out(dirty_vma ||
check_soft_dirty_pmd(vma, addr, pmd, false),
Can we reuse the return value of the previous call to check_soft_dirty_pmd?
vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
p, addr, len);
}
if (!ret && IS_CLEAR_OP(p))
check_soft_dirty_pmd(vma, addr, pmd, true);
Should we return an error in this case? We need to be sure that: * we stop walking page tables after this point; * we return this error to user space if we were not able to add anything to the vector.
}
spin_unlock(ptl);
return 0;
- }
+process_smaller_pages:
- if (pmd_trans_unstable(pmd))
return 0;
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
; pte++, addr += PAGE_SIZE) {
if (IS_GET_OP(p))
ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
vma->vm_file, pte_present(*pte),
is_swap_pte(*pte), p, addr, 1);
if (!ret && IS_CLEAR_OP(p))
check_soft_dirty(vma, addr, pte, true);
- }
- pte_unmap_unlock(pte - 1, ptl);
- cond_resched();
- return 0;
+}
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
struct mm_walk *walk)
+{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- unsigned int len;
- bool sd;
- if (vma) {
/* Individual pages haven't been allocated and written */
sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
(vma->vm_flags & VM_SOFTDIRTY);
len = (end - addr)/PAGE_SIZE;
if (p->max_pages && p->found_pages + len > p->max_pages)
len = p->max_pages - p->found_pages;
add_to_out(sd, vma->vm_file, false, false, p, addr, len);
- }
- return 0;
+}
+#ifdef CONFIG_MEM_SOFT_DIRTY +static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk) +{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- unsigned long end_cut = end;
- int ret;
- if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
(vma->vm_flags & VM_SOFTDIRTY)) {
if (vma->vm_start < start) {
ret = split_vma(vma->vm_mm, vma, start, 1);
if (ret)
return ret;
}
/* Calculate end_cut because of max_pages */
if (IS_GET_OP(p) && p->max_pages)
end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
if (vma->vm_end > end_cut) {
ret = split_vma(vma->vm_mm, vma, end_cut, 0);
if (ret)
return ret;
}
- }
- return 0;
+}
+static void pagemap_scan_post_vma(struct mm_walk *walk) +{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
(vma->vm_flags & VM_SOFTDIRTY)) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
- }
+} +#endif /* CONFIG_MEM_SOFT_DIRTY */
+static const struct mm_walk_ops pagemap_scan_ops = {
- .test_walk = pagemap_scan_pmd_test_walk,
- .pmd_entry = pagemap_scan_pmd_entry,
- .pte_hole = pagemap_scan_pte_hole,
+#ifdef CONFIG_MEM_SOFT_DIRTY
- /* Only for clearing SD bit over VMAs */
- .pre_vma = pagemap_scan_pre_vma,
- .post_vma = pagemap_scan_post_vma,
+#endif /* CONFIG_MEM_SOFT_DIRTY */ +};
+static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg) +{
- struct mmu_notifier_range range;
- unsigned long __user start, end;
- struct pagemap_scan_private p;
- int ret;
- start = (unsigned long)untagged_addr(arg->start);
- if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
return -EINVAL;
- if (IS_GET_OP(arg) &&
((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
return -ENOMEM;
+#ifndef CONFIG_MEM_SOFT_DIRTY
- if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
(arg->anyof_mask & PAGE_IS_SOFTDIRTY))
return -EINVAL;
+#endif
- if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
(arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
(arg->return_mask & ~PAGEMAP_OP_MASK))
return -EINVAL;
- if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
return -EINVAL;
- if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
(arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
return -EINVAL;
- end = start + arg->len;
- p.max_pages = arg->max_pages;
- p.found_pages = 0;
- p.flags = arg->flags;
- p.required_mask = arg->required_mask;
- p.anyof_mask = arg->anyof_mask;
- p.excluded_mask = arg->excluded_mask;
- p.return_mask = arg->return_mask;
- p.vec_index = 0;
- p.vec_len = arg->vec_len;
- if (IS_GET_OP(arg)) {
p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
I think we need to set a reasonable limit on vec_len to avoid large allocations in the kernel. We can consider using kmalloc or kvmalloc here.
Thanks, Andrei