Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sence to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected).
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain. I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/kernel/vdso.c | 54 +++++++++++++++++++------- 2 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 652ce85f9410..4bc45d3ed8b0 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -263,7 +263,7 @@ extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { - unsigned long vdso_base = (unsigned long)mm->context.vdso - PAGE_SIZE; + unsigned long vdso_base = (unsigned long)mm->context.vdso;
if (start <= vdso_base && vdso_base < end) mm->context.vdso = NULL; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index e839a906fdf2..b14907209822 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -55,10 +55,10 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc { unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
- if (new_size != text_size + PAGE_SIZE) + if (new_size != text_size) return -EINVAL;
- current->mm->context.vdso = (void __user *)new_vma->vm_start + PAGE_SIZE; + current->mm->context.vdso = (void __user *)new_vma->vm_start;
return 0; } @@ -73,6 +73,10 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); }
+static struct vm_special_mapping vvar_spec __ro_after_init = { + .name = "[vvar]", +}; + static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, @@ -89,11 +93,11 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = { */ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - struct mm_struct *mm = current->mm; + unsigned long vdso_size, vdso_base, mappings_size; struct vm_special_mapping *vdso_spec; + unsigned long vvar_size = PAGE_SIZE; + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long vdso_size; - unsigned long vdso_base;
if (is_32bit_task()) { vdso_spec = &vdso32_spec; @@ -110,8 +114,8 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int vdso_base = 0; }
- /* Add a page to the vdso size for the data page */ - vdso_size += PAGE_SIZE; + mappings_size = vdso_size + vvar_size; + mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK;
/* * pick a base address for the vDSO in process space. We try to put it @@ -119,9 +123,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * and end up putting it elsewhere. * Add enough to the size so that the result can be aligned. */ - vdso_base = get_unmapped_area(NULL, vdso_base, - vdso_size + ((VDSO_ALIGNMENT - 1) & PAGE_MASK), - 0, 0); + vdso_base = get_unmapped_area(NULL, vdso_base, mappings_size, 0, 0); if (IS_ERR_VALUE(vdso_base)) return vdso_base;
@@ -133,7 +135,13 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * install_special_mapping or the perf counter mmap tracking code * will fail to recognise it as a vDSO. */ - mm->context.vdso = (void __user *)vdso_base + PAGE_SIZE; + mm->context.vdso = (void __user *)vdso_base + vvar_size; + + vma = _install_special_mapping(mm, vdso_base, vvar_size, + VM_READ | VM_MAYREAD | VM_IO | + VM_DONTDUMP | VM_PFNMAP, &vvar_spec); + if (IS_ERR(vma)) + return PTR_ERR(vma);
/* * our vma flags don't have VM_WRITE so by default, the process isn't @@ -145,9 +153,12 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * It's fine to use that for setting breakpoints in the vDSO code * pages though. */ - vma = _install_special_mapping(mm, vdso_base, vdso_size, + vma = _install_special_mapping(mm, vdso_base + vvar_size, vdso_size, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, vdso_spec); + if (IS_ERR(vma)) + do_munmap(mm, vdso_base, vvar_size, NULL); + return PTR_ERR_OR_ZERO(vma); }
@@ -249,11 +260,22 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) if (!pagelist) panic("%s: Cannot allocate page list for VDSO", __func__);
- pagelist[0] = virt_to_page(vdso_data); - for (i = 0; i < pages; i++) - pagelist[i + 1] = virt_to_page(start + i * PAGE_SIZE); + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); + + return pagelist; +} + +static struct page ** __init vvar_setup_pages(void) +{ + struct page **pagelist;
+ /* .pages is NULL-terminated */ + pagelist = kcalloc(2, sizeof(struct page *), GFP_KERNEL); + if (!pagelist) + panic("%s: Cannot allocate page list for VVAR", __func__); + + pagelist[0] = virt_to_page(vdso_data); return pagelist; }
@@ -295,6 +317,8 @@ static int __init vdso_init(void) if (IS_ENABLED(CONFIG_PPC64)) vdso64_spec.pages = vdso_setup_pages(&vdso64_start, &vdso64_end);
+ vvar_spec.pages = vvar_setup_pages(); + smp_wmb();
return 0;
Le 26/03/2021 à 20:17, Dmitry Safonov a écrit :
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sence to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected).
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain.
Why is that a workaround, and why for one release only ? I think the solution proposed by Laurentto use the aux vector AT_SYSINFO_EHDR should work with any past and future release.
I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu
I tested it with sifreturn_vdso selftest and it worked, because that selftest doesn't involve VDSO data.
But if I do a mremap() on the VDSO text vma without remapping VVAR to keep the same distance between the two vmas, gettimeofday() crashes. The reason is that the code obtains the address of the data by calculating a fix difference from its own address with the below macro, the delta being resolved at link time:
.macro get_datapage ptr bcl 20, 31, .+4 999: mflr \ptr #if CONFIG_PPC_PAGE_SHIFT > 14 addis \ptr, \ptr, (_vdso_datapage - 999b)@ha #endif addi \ptr, \ptr, (_vdso_datapage - 999b)@l .endm
So the datapage needs to remain at the same distance from the code at all time.
Wondering how the other architectures do to have two independant VMAs and be able to move one independantly of the other.
Christophe
Hi Christophe,
On 3/27/21 5:19 PM, Christophe Leroy wrote: [..]
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain.
Why is that a workaround, and why for one release only ? I think the solution proposed by Laurentto use the aux vector AT_SYSINFO_EHDR should work with any past and future release.
Yeah, I guess. Previously, (before v5.11/power) all kernels had ELF start at "[vdso]" VMA start, now we'll have to carry the offset in the VMA. Probably, not the worst thing, but as it will be only for v5.11 release it can break, so needs separate testing. Kinda life was a bit easier without this additional code.
I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu
I tested it with sifreturn_vdso selftest and it worked, because that selftest doesn't involve VDSO data.
Thanks again on helping with testing it, I appreciate it!
But if I do a mremap() on the VDSO text vma without remapping VVAR to keep the same distance between the two vmas, gettimeofday() crashes. The reason is that the code obtains the address of the data by calculating a fix difference from its own address with the below macro, the delta being resolved at link time:
.macro get_datapage ptr bcl 20, 31, .+4 999: mflr \ptr #if CONFIG_PPC_PAGE_SHIFT > 14 addis \ptr, \ptr, (_vdso_datapage - 999b)@ha #endif addi \ptr, \ptr, (_vdso_datapage - 999b)@l .endm
So the datapage needs to remain at the same distance from the code at all time.
Wondering how the other architectures do to have two independent VMAs and be able to move one independently of the other.
It's alright as far as I know. If userspace remaps vdso/vvar it should be aware of this (CRIU keeps this in mind, also old vdso image is dumped to compare on restore with the one that the host has).
Thanks, Dmitry
Hi Christophe and Dimitry,
Le 27/03/2021 à 18:43, Dmitry Safonov a écrit :
Hi Christophe,
On 3/27/21 5:19 PM, Christophe Leroy wrote: [..]
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain.
Why is that a workaround, and why for one release only ? I think the solution proposed by Laurentto use the aux vector AT_SYSINFO_EHDR should work with any past and future release.
Yeah, I guess. Previously, (before v5.11/power) all kernels had ELF start at "[vdso]" VMA start, now we'll have to carry the offset in the VMA. Probably, not the worst thing, but as it will be only for v5.11 release it can break, so needs separate testing. Kinda life was a bit easier without this additional code.
The assumption that ELF header is at the start of "[vdso]" is perhaps not a good one, but using a "[vvar]" section looks more conventional and allows to clearly identify the data part. I'd argue for this option.
I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu
I tested it with sifreturn_vdso selftest and it worked, because that selftest doesn't involve VDSO data.
Thanks again on helping with testing it, I appreciate it!
But if I do a mremap() on the VDSO text vma without remapping VVAR to keep the same distance between the two vmas, gettimeofday() crashes. The reason is that the code obtains the address of the data by calculating a fix difference from its own address with the below macro, the delta being resolved at link time:
.macro get_datapage ptr bcl 20, 31, .+4 999: mflr \ptr #if CONFIG_PPC_PAGE_SHIFT > 14 addis \ptr, \ptr, (_vdso_datapage - 999b)@ha #endif addi \ptr, \ptr, (_vdso_datapage - 999b)@l .endm
So the datapage needs to remain at the same distance from the code at all time.
Wondering how the other architectures do to have two independent VMAs and be able to move one independently of the other.
It's alright as far as I know. If userspace remaps vdso/vvar it should be aware of this (CRIU keeps this in mind, also old vdso image is dumped to compare on restore with the one that the host has).
I do agree, playing with the VDSO mapping needs the application to be aware of the mapping details, and prior to 83d3f0e90c6c "powerpc/mm: tracking vDSO remap", remapping the VDSO was not working on PowerPC and nobody complained...
Laurent.
Le 26/03/2021 à 20:17, Dmitry Safonov a écrit :
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sence to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected).
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain. I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu
I run the CRIU's test suite and except the usual suspects, all the tests passed.
Tested-by: Laurent Dufour ldufour@linux.ibm.com
arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/kernel/vdso.c | 54 +++++++++++++++++++------- 2 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 652ce85f9410..4bc45d3ed8b0 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -263,7 +263,7 @@ extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) {
- unsigned long vdso_base = (unsigned long)mm->context.vdso - PAGE_SIZE;
- unsigned long vdso_base = (unsigned long)mm->context.vdso;
if (start <= vdso_base && vdso_base < end) mm->context.vdso = NULL; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index e839a906fdf2..b14907209822 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -55,10 +55,10 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc { unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
- if (new_size != text_size + PAGE_SIZE)
- if (new_size != text_size) return -EINVAL;
- current->mm->context.vdso = (void __user *)new_vma->vm_start + PAGE_SIZE;
- current->mm->context.vdso = (void __user *)new_vma->vm_start;
return 0; } @@ -73,6 +73,10 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); } +static struct vm_special_mapping vvar_spec __ro_after_init = {
- .name = "[vvar]",
+};
- static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap,
@@ -89,11 +93,11 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = { */ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) {
- struct mm_struct *mm = current->mm;
- unsigned long vdso_size, vdso_base, mappings_size; struct vm_special_mapping *vdso_spec;
- unsigned long vvar_size = PAGE_SIZE;
- struct mm_struct *mm = current->mm; struct vm_area_struct *vma;
- unsigned long vdso_size;
- unsigned long vdso_base;
if (is_32bit_task()) { vdso_spec = &vdso32_spec; @@ -110,8 +114,8 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int vdso_base = 0; }
- /* Add a page to the vdso size for the data page */
- vdso_size += PAGE_SIZE;
- mappings_size = vdso_size + vvar_size;
- mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK;
/* * pick a base address for the vDSO in process space. We try to put it @@ -119,9 +123,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * and end up putting it elsewhere. * Add enough to the size so that the result can be aligned. */
- vdso_base = get_unmapped_area(NULL, vdso_base,
vdso_size + ((VDSO_ALIGNMENT - 1) & PAGE_MASK),
0, 0);
- vdso_base = get_unmapped_area(NULL, vdso_base, mappings_size, 0, 0); if (IS_ERR_VALUE(vdso_base)) return vdso_base;
@@ -133,7 +135,13 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * install_special_mapping or the perf counter mmap tracking code * will fail to recognise it as a vDSO. */
- mm->context.vdso = (void __user *)vdso_base + PAGE_SIZE;
- mm->context.vdso = (void __user *)vdso_base + vvar_size;
- vma = _install_special_mapping(mm, vdso_base, vvar_size,
VM_READ | VM_MAYREAD | VM_IO |
VM_DONTDUMP | VM_PFNMAP, &vvar_spec);
- if (IS_ERR(vma))
return PTR_ERR(vma);
/* * our vma flags don't have VM_WRITE so by default, the process isn't @@ -145,9 +153,12 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * It's fine to use that for setting breakpoints in the vDSO code * pages though. */
- vma = _install_special_mapping(mm, vdso_base, vdso_size,
- vma = _install_special_mapping(mm, vdso_base + vvar_size, vdso_size, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, vdso_spec);
- if (IS_ERR(vma))
do_munmap(mm, vdso_base, vvar_size, NULL);
- return PTR_ERR_OR_ZERO(vma); }
@@ -249,11 +260,22 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) if (!pagelist) panic("%s: Cannot allocate page list for VDSO", __func__);
- pagelist[0] = virt_to_page(vdso_data);
- for (i = 0; i < pages; i++)
pagelist[i + 1] = virt_to_page(start + i * PAGE_SIZE);
pagelist[i] = virt_to_page(start + i * PAGE_SIZE);
- return pagelist;
+}
+static struct page ** __init vvar_setup_pages(void) +{
- struct page **pagelist;
- /* .pages is NULL-terminated */
- pagelist = kcalloc(2, sizeof(struct page *), GFP_KERNEL);
- if (!pagelist)
panic("%s: Cannot allocate page list for VVAR", __func__);
- pagelist[0] = virt_to_page(vdso_data); return pagelist; }
@@ -295,6 +317,8 @@ static int __init vdso_init(void) if (IS_ENABLED(CONFIG_PPC64)) vdso64_spec.pages = vdso_setup_pages(&vdso64_start, &vdso64_end);
- vvar_spec.pages = vvar_setup_pages();
- smp_wmb();
return 0;
On 3/29/21 4:14 PM, Laurent Dufour wrote:
Le 26/03/2021 à 20:17, Dmitry Safonov a écrit :
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sence to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected).
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain. I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu
I run the CRIU's test suite and except the usual suspects, all the tests passed.
Tested-by: Laurent Dufour ldufour@linux.ibm.com
Thank you, Laurent!
Le 26/03/2021 à 20:17, Dmitry Safonov a écrit :
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sence to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected).
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain. I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu
arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/kernel/vdso.c | 54 +++++++++++++++++++------- 2 files changed, 40 insertions(+), 16 deletions(-)
@@ -133,7 +135,13 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * install_special_mapping or the perf counter mmap tracking code * will fail to recognise it as a vDSO. */
- mm->context.vdso = (void __user *)vdso_base + PAGE_SIZE;
- mm->context.vdso = (void __user *)vdso_base + vvar_size;
- vma = _install_special_mapping(mm, vdso_base, vvar_size,
VM_READ | VM_MAYREAD | VM_IO |
VM_DONTDUMP | VM_PFNMAP, &vvar_spec);
- if (IS_ERR(vma))
return PTR_ERR(vma);
/* * our vma flags don't have VM_WRITE so by default, the process isn't
IIUC, VM_PFNMAP is for when we have a vvar_fault handler. Allthough we will soon have one for handle TIME_NS, at the moment powerpc doesn't have that handler. Isn't it dangerous to set VM_PFNMAP then ?
Christophe
Christophe Leroy christophe.leroy@csgroup.eu writes:
Le 26/03/2021 à 20:17, Dmitry Safonov a écrit :
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sence to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected).
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain. I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu
arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/kernel/vdso.c | 54 +++++++++++++++++++------- 2 files changed, 40 insertions(+), 16 deletions(-)
@@ -133,7 +135,13 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * install_special_mapping or the perf counter mmap tracking code * will fail to recognise it as a vDSO. */
- mm->context.vdso = (void __user *)vdso_base + PAGE_SIZE;
- mm->context.vdso = (void __user *)vdso_base + vvar_size;
- vma = _install_special_mapping(mm, vdso_base, vvar_size,
VM_READ | VM_MAYREAD | VM_IO |
VM_DONTDUMP | VM_PFNMAP, &vvar_spec);
- if (IS_ERR(vma))
return PTR_ERR(vma);
/* * our vma flags don't have VM_WRITE so by default, the process isn't
IIUC, VM_PFNMAP is for when we have a vvar_fault handler.
Some of the other flags seem odd too. eg. VM_IO ? VM_DONTDUMP ?
cheers
On 3/31/21 10:59 AM, Michael Ellerman wrote:
Christophe Leroy christophe.leroy@csgroup.eu writes:
[..]
@@ -133,7 +135,13 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * install_special_mapping or the perf counter mmap tracking code * will fail to recognise it as a vDSO. */
- mm->context.vdso = (void __user *)vdso_base + PAGE_SIZE;
- mm->context.vdso = (void __user *)vdso_base + vvar_size;
- vma = _install_special_mapping(mm, vdso_base, vvar_size,
VM_READ | VM_MAYREAD | VM_IO |
VM_DONTDUMP | VM_PFNMAP, &vvar_spec);
- if (IS_ERR(vma))
return PTR_ERR(vma);
/* * our vma flags don't have VM_WRITE so by default, the process isn't
IIUC, VM_PFNMAP is for when we have a vvar_fault handler. Allthough we will soon have one for handle TIME_NS, at the moment powerpc doesn't have that handler. Isn't it dangerous to set VM_PFNMAP then ?
I believe, it's fine, special_mapping_fault() does: : if (sm->fault) : return sm->fault(sm, vmf->vma, vmf);
Some of the other flags seem odd too. eg. VM_IO ? VM_DONTDUMP ?
Yeah, so: VM_PFNMAP | VM_IO is a protection from remote access on pages. So one can't access such page with ptrace(), /proc/$pid/mem or process_vm_write(). Otherwise, it would create COW mapping and the tracee will stop working with stale vvar.
VM_DONTDUMP restricts the area from coredumping and gdb will also avoid accessing those[1][2].
I agree that VM_PFNMAP was probably excessive in this patch alone and rather synchronized code with other architectures, but it makes more sense now in the new patches set by Christophe: https://lore.kernel.org/linux-arch/cover.1617209141.git.christophe.leroy@csg...
[1] https://lore.kernel.org/lkml/550731AF.6080904@redhat.com/T/ [2] https://sourceware.org/legacy-ml/gdb-patches/2015-03/msg00383.html
Thanks, Dmitry
Le 26/03/2021 à 20:17, Dmitry Safonov a écrit :
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sence to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected).
I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain. I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin avagin@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Christophe Leroy christophe.leroy@csgroup.eu Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Paul Mackerras paulus@samba.org Cc: linuxppc-dev@lists.ozlabs.org Cc: stable@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov dima@arista.com Tested-by: Christophe Leroy christophe.leroy@csgroup.eu
arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/kernel/vdso.c | 54 +++++++++++++++++++------- 2 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 652ce85f9410..4bc45d3ed8b0 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -263,7 +263,7 @@ extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) {
- unsigned long vdso_base = (unsigned long)mm->context.vdso - PAGE_SIZE;
- unsigned long vdso_base = (unsigned long)mm->context.vdso;
if (start <= vdso_base && vdso_base < end) mm->context.vdso = NULL; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index e839a906fdf2..b14907209822 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -55,10 +55,10 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc { unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
- if (new_size != text_size + PAGE_SIZE)
- if (new_size != text_size) return -EINVAL;
In ARM64 you have removed the above test in commit 871402e05b24cb56 ("mm: forbid splitting special mappings"). Do we need to keep it here ?
- current->mm->context.vdso = (void __user *)new_vma->vm_start + PAGE_SIZE;
- current->mm->context.vdso = (void __user *)new_vma->vm_start;
return 0; }
On 3/30/21 11:17 AM, Christophe Leroy wrote:
Le 26/03/2021 à 20:17, Dmitry Safonov a écrit :
[..]
--- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -55,10 +55,10 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc { unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - if (new_size != text_size + PAGE_SIZE) + if (new_size != text_size) return -EINVAL;
In ARM64 you have removed the above test in commit 871402e05b24cb56 ("mm: forbid splitting special mappings"). Do we need to keep it here ?
- current->mm->context.vdso = (void __user *)new_vma->vm_start + PAGE_SIZE; + current->mm->context.vdso = (void __user *)new_vma->vm_start; return 0; }
Yes, right you are, this can be dropped.
Thanks, Dmitry
On Fri, 26 Mar 2021 19:17:20 +0000, Dmitry Safonov wrote:
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sence to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected).
[...]
Applied to powerpc/next.
[1/1] powerpc/vdso: Separate vvar vma from vdso https://git.kernel.org/powerpc/c/1c4bce6753857dc409a0197342d18764e7f4b741
cheers
linux-stable-mirror@lists.linaro.org