Hello Andrei,
Thank you for reviewing.
On 11/10/22 4:54 AM, Andrei Vagin wrote: [...]
+static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
+		      unsigned long addr, unsigned int len)
+{
+	unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;
Should we define constants for each of these bits?
I think I can define a macro to hide this dirty bit shifting in the function.
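For illustration, a rough shape of what I have in mind (all names here are placeholders I'm making up, nothing finalized):

#define PM_SCAN_BIT_SOFTDIRTY	(1 << 0)
#define PM_SCAN_BIT_FILE	(1 << 1)
#define PM_SCAN_BIT_PRESENT	(1 << 2)
#define PM_SCAN_BIT_SWAP	(1 << 3)

/* Assemble the per-page state bits into a single bitmap value. */
#define PM_SCAN_BITMAP(sd, file, pres, swap)	\
	((sd) | ((file) << 1) | ((pres) << 2) | ((swap) << 3))

so the declaration above becomes cur = PM_SCAN_BITMAP(sd, file, pres, swap);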
+	bool cpy = true;
+
+	if (p->required_mask)
+		cpy = ((p->required_mask & cur) == p->required_mask);
+	if (cpy && p->anyof_mask)
+		cpy = (p->anyof_mask & cur);
+	if (cpy && p->excluded_mask)
+		cpy = !(p->excluded_mask & cur);
+
+	bitmap = cur & p->return_mask;
+	if (cpy && bitmap) {
+		if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
+		    (p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
+		     addr)) {
I think it is better to define a variable for p->vec_index - 1.
Will do in the next revision.
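Something like this, I expect (just a sketch; "prev" is a name I'm picking here):

	if (p->vec_index) {
		struct page_region *prev = &p->vec[p->vec_index - 1];

		/* Merge into the previous entry when the new range is
		 * adjacent and carries the same bitmap; otherwise a new
		 * entry is started, as in the code above. */
		if (prev->bitmap == bitmap &&
		    prev->start + prev->len * PAGE_SIZE == addr)
			prev->len += len;
	}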
nit: len can be in bytes rather than pages.
We are considering memory in page units. The memory given to this IOCTL must have PAGE_SIZE alignment; otherwise we error out (picked this up from mincore()).
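Concretely, each entry the IOCTL returns then describes a page-aligned range. The field names below match the quoted code; the __u64 types are my assumption about the uapi layout:

	struct page_region {
		__u64 start;	/* address of the first page in the range */
		__u64 len;	/* number of pages in the range, not bytes */
		__u64 bitmap;	/* PAGE_IS_* flags common to every page in it */
	};

	/* The covered byte range is [start, start + len * PAGE_SIZE). */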
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned int len;
+	spinlock_t *ptl;
+	int ret = 0;
+	pte_t *pte;
+	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
+			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
+
+	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
+		return 0;
+
+	end = min(end, walk->vma->vm_end);
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
+			/*
+			 * Break the huge page into small pages if the operation
+			 * applies to only a portion of the huge page or if the
+			 * return buffer cannot store the complete data.
+			 */
+			if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
+				spin_unlock(ptl);
+				split_huge_pmd(vma, pmd, addr);
+				goto process_smaller_pages;
+			}
+
+			if (IS_GET_OP(p)) {
+				len = (end - addr) / PAGE_SIZE;
+				if (p->max_pages && p->found_pages + len > p->max_pages)
+					len = p->max_pages - p->found_pages;
+
+				ret = add_to_out(dirty_vma ||
+						 check_soft_dirty_pmd(vma, addr, pmd, false),
Can we reuse the return code of the previous call of check_soft_dirty_pmd()?
Yes, will do.
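Roughly like this, I think (a sketch only; "pmd_sd" is a name I'm inventing, and I've collapsed the surrounding branches):

	bool pmd_sd = dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false);

	if (pmd_sd && IS_GET_OP(p)) {
		len = (end - addr) / PAGE_SIZE;
		if (p->max_pages && p->found_pages + len > p->max_pages)
			len = p->max_pages - p->found_pages;

		/* Reuse the cached result instead of re-walking the PMD. */
		ret = add_to_out(pmd_sd, vma->vm_file, pmd_present(*pmd),
				 is_swap_pmd(*pmd), p, addr, len);
	}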
+						 vma->vm_file, pmd_present(*pmd),
+						 is_swap_pmd(*pmd), p, addr, len);
+			}
+			if (!ret && IS_CLEAR_OP(p))
+				check_soft_dirty_pmd(vma, addr, pmd, true);
Should we return an error in this case? We need to be sure that:
- we stop walking page tables after this point.
I'll update the implementation to return an error. It immediately terminates the walk as well.
- we return this error to user-space if we are not able to add anything to the vector.
I'm not returning an error to userspace if no page matched the masks. The total number of filled page_region structs is returned from the IOCTL. If the IOCTL returns 0, it means no matching page was found, but the IOCTL itself executed successfully.
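To spell that out (a sketch; -ENOSPC and pagemap_scan_ops are my placeholders, not necessarily what the next revision will use):

	/* In add_to_out(), once the output vector is full: */
	if (p->vec_index == p->vec_len)
		return -ENOSPC;	/* negative return stops walk_page_range() */

	/* In do_pagemap_sd_cmd(): */
	ret = walk_page_range(mm, start, end, &pagemap_scan_ops, &p);
	if (ret == -ENOSPC && p.vec_index)
		ret = 0;	/* a partially filled vector is still a success */
	if (ret)
		return ret;
	return p.vec_index;	/* 0 == no matching pages, not an error */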
[...]
+static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
+{
+	struct mmu_notifier_range range;
+	unsigned long __user start, end;
+	struct pagemap_scan_private p;
+	int ret;
+
+	start = (unsigned long)untagged_addr(arg->start);
+	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
+		return -EINVAL;
+	if (IS_GET_OP(arg) &&
+	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
+		return -ENOMEM;
+
+#ifndef CONFIG_MEM_SOFT_DIRTY
+	if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
+	    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
+		return -EINVAL;
+#endif
+
+	if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
+	    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
+	    (arg->return_mask & ~PAGEMAP_OP_MASK))
+		return -EINVAL;
+	if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
+		return -EINVAL;
+	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
+	    (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
+		return -EINVAL;
+
+	end = start + arg->len;
+	p.max_pages = arg->max_pages;
+	p.found_pages = 0;
+	p.flags = arg->flags;
+	p.required_mask = arg->required_mask;
+	p.anyof_mask = arg->anyof_mask;
+	p.excluded_mask = arg->excluded_mask;
+	p.return_mask = arg->return_mask;
+	p.vec_index = 0;
+	p.vec_len = arg->vec_len;
+
+	if (IS_GET_OP(arg)) {
+		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));
I think we need to set a reasonable limit for vec_len to avoid large allocations in the kernel. We can consider using kmalloc() or kvmalloc() here.
I'll update it to kvzalloc(), which uses kmalloc() for smaller allocations and falls back to vmalloc() when kmalloc() fails. Thanks for suggesting it. But it won't limit the size of the allocation.
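For reference, the call would then look roughly like this (the array_size() guard against overflow of the vec_len multiplication is my addition):

	p.vec = kvzalloc(array_size(arg->vec_len, sizeof(struct page_region)),
			 GFP_KERNEL);
	if (!p.vec)
		return -ENOMEM;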
Thanks, Andrei