From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v2.
Xin Li reported sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
in a FRED system. Handle the FRED system scenario too. There are two patches in this series. Comments welcome...
Note: This patchset is only tested for 'syscall' sets %rcx=%rip and %r11=%rflags case. I don't have a FRED system to test it.
How to test this:
$ make -C tools/testing/selftests/x86 $ tools/testing/selftests/x86/sysret_rip_64
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141-9c001c2343af@intel.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
## Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (hpa). (Ref: https://lore.kernel.org/lkml/8f5c24df-514d-5d89-f58f-ec8c3eb1e049@zytor.com )
---
Ammar Faizi (2): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11`
tools/testing/selftests/x86/sysret_rip.c | 105 ++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
From: Ammar Faizi ammarfaizi2@gnuweeb.org
The current selftest asserts %r11 == %rflags after the 'syscall' returns to user. Such an assertion doesn't apply to a FRED system because in a FRED system the 'syscall' instruction does not set %r11=%rflags and %rcx=%rip.
Handle the FRED case. Now, test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11. - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
The 'raise()' function from libc can't be used to control those registers. Therefore, create a syscall wrapper in inline Assembly to fully control them.
Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: "H. Peter Anvin" hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
Need hpa's sign off.
## Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (hpa). (Ref: https://lore.kernel.org/lkml/8f5c24df-514d-5d89-f58f-ec8c3eb1e049@zytor.com )
tools/testing/selftests/x86/sysret_rip.c | 96 +++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 84d74be1d90207ab..550bc4e69ac46997 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -39,6 +39,95 @@ asm ( extern const char test_page[]; static void const *current_test_page_addr = test_page;
+/* Arbitrary values */ +static const unsigned long r11_sentinel = 0xfeedfacedeadbeef; +static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; + +/* An arbitrary *valid* RFLAGS value */ +static const unsigned long rflags_sentinel = 0x200a93; + +enum regs_ok { + REGS_ERROR = -1, /* Invalid register contents */ + REGS_SAVED = 0, /* Registers properly preserved */ + REGS_SYSRET = 1 /* Registers match syscall/sysret */ +}; + +/* + * Returns: + * 0 = %rcx and %r11 preserved. + * 1 = %rcx and %r11 set to %rflags and %rip. + * -1 = %rcx and/or %r11 set to any other values. + * + * Note that check_regs_syscall() sets %rbx to the syscall return %rip. + */ +static enum regs_ok check_regs_result(unsigned long r11, unsigned long rcx, + unsigned long rbx) +{ + if (r11 == r11_sentinel && rcx == rcx_sentinel) { + return REGS_SAVED; + } else if (r11 == rflags_sentinel && rcx == rbx) { + return REGS_SYSRET; + } else { + printf("[FAIL] check_regs_result\n"); + printf(" r11_sentinel = %#lx; %%r11 = %#lx;\n", r11_sentinel, r11); + printf(" rcx_sentinel = %#lx; %%rcx = %#lx;\n", rcx_sentinel, rcx); + printf(" rflags_sentinel = %#lx\n", rflags_sentinel); + return REGS_ERROR; + } +} + +static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + register unsigned long r11 asm("%r11"); + register unsigned long r10 asm("%r10"); + register unsigned long r8 asm("%r8"); + register unsigned long r9 asm("%r9"); + register void *rsp asm("%rsp"); + unsigned long rcx, rbx; + + r11 = r11_sentinel; + rcx = rcx_sentinel; + r10 = arg4; + r8 = arg5; + r9 = arg6; + + asm volatile ( + "pushq %[rflags_sentinel]\n\t" + "popf\n\t" + "leaq 1f(%%rip), %[rbx]\n\t" + "syscall\n" + "1:" + + : "+a" (nr_syscall), + "+r" (r11), + "+c" (rcx), + [rbx] "=b" (rbx), + "+r" (rsp) /* Clobber the redzone */ + + : [rflags_sentinel] "g" (rflags_sentinel), + "D" (arg1), /* %rdi */ + "S" (arg2), /* %rsi */ + "d" (arg3), /* %rdx */ + "r" (r10), + "r" (r8), + "r" (r9) + + : "memory" + ); + + /* + * Test that: + * + * - "syscall" in a FRED system doesn't clobber %rcx and %r11. + * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. + * + */ + assert(check_regs_result(r11, rcx, rbx) != REGS_ERROR); + return nr_syscall; +} + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -101,11 +190,16 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) return; }
+static void __raise(int sig) +{ + do_syscall(__NR_kill, getpid(), sig, 0, 0, 0, 0); +} + static void test_sigreturn_to(unsigned long ip) { rip = ip; printf("[RUN]\tsigreturn to 0x%lx\n", ip); - raise(SIGUSR1); + __raise(SIGUSR1); }
static jmp_buf jmpbuf;
On 1/23/23 18:27, Ammar Faizi wrote:
From: Ammar Faizi ammarfaizi2@gnuweeb.org
The current selftest asserts %r11 == %rflags after the 'syscall' returns to user. Such an assertion doesn't apply to a FRED system because in a FRED system the 'syscall' instruction does not set %r11=%rflags and %rcx=%rip.
Handle the FRED case. Now, test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11.
- "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
The 'raise()' function from libc can't be used to control those registers. Therefore, create a syscall wrapper in inline Assembly to fully control them.
Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: "H. Peter Anvin" hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org
Need hpa's sign off.
For both patches:
Acked-by: H. Peter Anvin (Intel) hpa@zytor.com
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11. - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with a trivial system call like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system; check that it returns a nonnegative value and then save the result.
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Co-developed-by: "H. Peter Anvin" hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
Need hpa's sign off.
tools/testing/selftests/x86/sysret_rip.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 550bc4e69ac46997..75c72d34dbc5840c 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -252,8 +252,17 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); }
+static void test_syscall_rcx_r11(void) +{ + do_syscall(__NR_getpid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_gettid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_getppid, 0, 0, 0, 0, 0, 0); +} + int main() { + test_syscall_rcx_r11(); + /* * When the kernel returns from a slow-path syscall, it will * detect whether SYSRET is appropriate. If it incorrectly
On 1/23/23 18:27, Ammar Faizi wrote:
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11.
- "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with a trivial system call like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system; check that it returns a nonnegative value and then save the result.
"Nonnegative" here should be "valid"; it is an implementation detail that the error value is -1.
However, you are not checking that you don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point.
-hpa
On Mon, Jan 23, 2023 at 10:16:01PM -0800, H. Peter Anvin wrote:
On 1/23/23 18:27, Ammar Faizi wrote:
Test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11.
- "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with a trivial system call like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system; check that it returns a nonnegative value and then save the result.
"Nonnegative" here should be "valid"; it is an implementation detail that the error value is -1.
Copy-paste error, will fix that!
However, you are not checking that you don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point.
Good point!
What do you think of adding this on top of patch #1?
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 75c72d34dbc5840c..3da827713831acbc 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -47,11 +47,14 @@ static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; static const unsigned long rflags_sentinel = 0x200a93;
enum regs_ok { - REGS_ERROR = -1, /* Invalid register contents */ - REGS_SAVED = 0, /* Registers properly preserved */ - REGS_SYSRET = 1 /* Registers match syscall/sysret */ + REGS_INIT_VAL = -2, /* For init value checker, never returned */ + REGS_ERROR = -1, /* Invalid register contents */ + REGS_SAVED = 0, /* Registers properly preserved */ + REGS_SYSRET = 1 /* Registers match syscall/sysret */ };
+static enum regs_ok regs_ok_state = REGS_INIT_VAL; + /* * Returns: * 0 = %rcx and %r11 preserved. @@ -86,6 +89,7 @@ static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, register unsigned long r9 asm("%r9"); register void *rsp asm("%rsp"); unsigned long rcx, rbx; + enum regs_ok ret;
r11 = r11_sentinel; rcx = rcx_sentinel; @@ -124,7 +128,14 @@ static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. * */ - assert(check_regs_result(r11, rcx, rbx) != REGS_ERROR); + ret = check_regs_result(r11, rcx, rbx); + assert(ret != REGS_ERROR); + + if (regs_ok_state == REGS_INIT_VAL) + regs_ok_state = ret; + else + assert(ret == regs_ok_state); + return nr_syscall; }
On Tue, Jan 24, 2023 at 01:41:30PM +0700, Ammar Faizi wrote:
On Mon, Jan 23, 2023 at 10:16:01PM -0800, H. Peter Anvin wrote:
However, you are not checking that you don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point.
Good point!
What do you think of adding this on top of patch #1?
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 75c72d34dbc5840c..3da827713831acbc 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -47,11 +47,14 @@ static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; static const unsigned long rflags_sentinel = 0x200a93; enum regs_ok {
- REGS_ERROR = -1, /* Invalid register contents */
- REGS_SAVED = 0, /* Registers properly preserved */
- REGS_SYSRET = 1 /* Registers match syscall/sysret */
- REGS_INIT_VAL = -2, /* For init value checker, never returned */
- REGS_ERROR = -1, /* Invalid register contents */
- REGS_SAVED = 0, /* Registers properly preserved */
- REGS_SYSRET = 1 /* Registers match syscall/sysret */
}; +static enum regs_ok regs_ok_state = REGS_INIT_VAL;
/*
- Returns:
- 0 = %rcx and %r11 preserved.
@@ -86,6 +89,7 @@ static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, register unsigned long r9 asm("%r9"); register void *rsp asm("%rsp"); unsigned long rcx, rbx;
- enum regs_ok ret;
r11 = r11_sentinel; rcx = rcx_sentinel; @@ -124,7 +128,14 @@ static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. * */
- assert(check_regs_result(r11, rcx, rbx) != REGS_ERROR);
- ret = check_regs_result(r11, rcx, rbx);
- assert(ret != REGS_ERROR);
- if (regs_ok_state == REGS_INIT_VAL)
regs_ok_state = ret;
- else
assert(ret == regs_ok_state);
- return nr_syscall;
}
And oh, on top of that. Add a comment, just to make it clear...
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 3da827713831acbc..3d783f5baee1b66a 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -131,6 +131,10 @@ static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, ret = check_regs_result(r11, rcx, rbx); assert(ret != REGS_ERROR);
+ /* + * Check that we don't get a mix of REGS_SAVED and REGS_SYSRET. + * Need at least 2 times 'syscall' invoked from this function. + */ if (regs_ok_state == REGS_INIT_VAL) regs_ok_state = ret; else
On January 23, 2023 10:41:20 PM PST, Ammar Faizi ammarfaizi2@gnuweeb.org wrote:
On Mon, Jan 23, 2023 at 10:16:01PM -0800, H. Peter Anvin wrote:
On 1/23/23 18:27, Ammar Faizi wrote:
Test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11.
- "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with a trivial system call like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system; check that it returns a nonnegative value and then save the result.
"Nonnegative" here should be "valid"; it is an implementation detail that the error value is -1.
Copy-paste error, will fix that!
However, you are not checking that you don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point.
Good point!
What do you think of adding this on top of patch #1?
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 75c72d34dbc5840c..3da827713831acbc 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -47,11 +47,14 @@ static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; static const unsigned long rflags_sentinel = 0x200a93;
enum regs_ok {
- REGS_ERROR = -1, /* Invalid register contents */
- REGS_SAVED = 0, /* Registers properly preserved */
- REGS_SYSRET = 1 /* Registers match syscall/sysret */
- REGS_INIT_VAL = -2, /* For init value checker, never returned */
- REGS_ERROR = -1, /* Invalid register contents */
- REGS_SAVED = 0, /* Registers properly preserved */
- REGS_SYSRET = 1 /* Registers match syscall/sysret */
};
+static enum regs_ok regs_ok_state = REGS_INIT_VAL;
/*
- Returns:
- 0 = %rcx and %r11 preserved.
@@ -86,6 +89,7 @@ static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, register unsigned long r9 asm("%r9"); register void *rsp asm("%rsp"); unsigned long rcx, rbx;
enum regs_ok ret;
r11 = r11_sentinel; rcx = rcx_sentinel;
@@ -124,7 +128,14 @@ static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. * */
- assert(check_regs_result(r11, rcx, rbx) != REGS_ERROR);
- ret = check_regs_result(r11, rcx, rbx);
- assert(ret != REGS_ERROR);
- if (regs_ok_state == REGS_INIT_VAL)
regs_ok_state = ret;
- else
assert(ret == regs_ok_state);
- return nr_syscall;
}
Works for me, although I would call it REGS_UNDEFINED myself.
On Tue, Jan 24, 2023 at 01:07:38AM -0800, H. Peter Anvin wrote:
Works for me, although I would call it REGS_UNDEFINED myself.
I'll call it REGS_UNDEFINED too, in v3.
From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v3: sysret_rip test update for Intel FRED architecture.
Xin Li reported sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
in a FRED system. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %r11=%rflags.
There are two patches in this series.
How to test this:
$ make -C tools/testing/selftests/x86 $ tools/testing/selftests/x86/sysret_rip_64
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141-9c001c2343af@intel.com Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Acked-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
## Changelog v3:
- Test that we don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point (hpa).
## Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (hpa). (Ref: https://lore.kernel.org/lkml/8f5c24df-514d-5d89-f58f-ec8c3eb1e049@zytor.com )
---
Ammar Faizi (2): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11`
tools/testing/selftests/x86/sysret_rip.c | 120 ++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
From: Ammar Faizi ammarfaizi2@gnuweeb.org
The current selftest asserts %r11 == %rflags after the 'syscall' returns to user. Such an assertion doesn't apply to a FRED system because in a FRED system the 'syscall' instruction does not set %r11=%rflags and %rcx=%rip.
Handle the FRED case. Now, test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11. - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
The 'raise()' function from libc can't be used to control those registers. Therefore, create a syscall wrapper in inline Assembly to fully control them.
Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Acked-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 111 ++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 84d74be1d90207ab..b0d271c19ddd7834 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -39,6 +39,110 @@ asm ( extern const char test_page[]; static void const *current_test_page_addr = test_page;
+/* Arbitrary values */ +static const unsigned long r11_sentinel = 0xfeedfacedeadbeef; +static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; + +/* An arbitrary *valid* RFLAGS value */ +static const unsigned long rflags_sentinel = 0x200a93; + +enum regs_ok { + REGS_UNDEFINED = -2, /* For init value checker, never returned */ + REGS_ERROR = -1, /* Invalid register contents */ + REGS_SAVED = 0, /* Registers properly preserved */ + REGS_SYSRET = 1 /* Registers match syscall/sysret */ +}; + +static enum regs_ok regs_ok_state = REGS_UNDEFINED; + +/* + * Returns: + * 0 = %rcx and %r11 preserved. + * 1 = %rcx and %r11 set to %rflags and %rip. + * -1 = %rcx and/or %r11 set to any other values. + * + * Note that check_regs_syscall() sets %rbx to the syscall return %rip. + */ +static enum regs_ok check_regs_result(unsigned long r11, unsigned long rcx, + unsigned long rbx) +{ + if (r11 == r11_sentinel && rcx == rcx_sentinel) { + return REGS_SAVED; + } else if (r11 == rflags_sentinel && rcx == rbx) { + return REGS_SYSRET; + } else { + printf("[FAIL] check_regs_result\n"); + printf(" r11_sentinel = %#lx; %%r11 = %#lx;\n", r11_sentinel, r11); + printf(" rcx_sentinel = %#lx; %%rcx = %#lx;\n", rcx_sentinel, rcx); + printf(" rflags_sentinel = %#lx\n", rflags_sentinel); + return REGS_ERROR; + } +} + +static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + register unsigned long r11 asm("%r11"); + register unsigned long r10 asm("%r10"); + register unsigned long r8 asm("%r8"); + register unsigned long r9 asm("%r9"); + register void *rsp asm("%rsp"); + unsigned long rcx, rbx; + enum regs_ok ret; + + r11 = r11_sentinel; + rcx = rcx_sentinel; + r10 = arg4; + r8 = arg5; + r9 = arg6; + + asm volatile ( + "pushq %[rflags_sentinel]\n\t" + "popf\n\t" + "leaq 1f(%%rip), %[rbx]\n\t" + "syscall\n" + "1:" + + : "+a" (nr_syscall), + "+r" (r11), + "+c" (rcx), + [rbx] "=b" (rbx), + "+r" (rsp) /* Clobber the redzone */ + + : [rflags_sentinel] "g" (rflags_sentinel), + "D" (arg1), /* %rdi */ + "S" (arg2), /* %rsi */ + "d" (arg3), /* %rdx */ + "r" (r10), + "r" (r8), + "r" (r9) + + : "memory" + ); + + /* + * Test that: + * + * - "syscall" in a FRED system doesn't clobber %rcx and %r11. + * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. + * + */ + ret = check_regs_result(r11, rcx, rbx); + assert(ret != REGS_ERROR); + + /* + * Test that we don't get a mix of REGS_SAVED and REGS_SYSRET. + * Need at least 2 times 'syscall' invoked from this function. + */ + if (regs_ok_state == REGS_UNDEFINED) + regs_ok_state = ret; + else + assert(ret == regs_ok_state); + + return nr_syscall; +} + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -101,11 +205,16 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) return; }
+static void __raise(int sig) +{ + do_syscall(__NR_kill, getpid(), sig, 0, 0, 0, 0); +} + static void test_sigreturn_to(unsigned long ip) { rip = ip; printf("[RUN]\tsigreturn to 0x%lx\n", ip); - raise(SIGUSR1); + __raise(SIGUSR1); }
static jmp_buf jmpbuf;
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11. - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with a trivial system call like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system.
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Acked-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index b0d271c19ddd7834..bf90fac95a264e2d 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -267,8 +267,17 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); }
+static void test_syscall_rcx_r11(void) +{ + do_syscall(__NR_getpid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_gettid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_getppid, 0, 0, 0, 0, 0, 0); +} + int main() { + test_syscall_rcx_r11(); + /* * When the kernel returns from a slow-path syscall, it will * detect whether SYSRET is appropriate. If it incorrectly
On 1/24/23 02:09, Ammar Faizi wrote:
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11.
- "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with a trivial system call like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system.
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Acked-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org
Add to the description that the purpose of this is to ensure that various system calls are *consistent*, as per the comment immediately below your code.
-hpa
On Tue, Jan 24, 2023 at 12:59:23PM -0800, H. Peter Anvin wrote:
On 1/24/23 02:09, Ammar Faizi wrote:
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11.
- "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with a trivial system call like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system.
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Acked-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org
Add to the description that the purpose of this is to ensure that various system calls are *consistent*, as per the comment immediately below your code.
Added in v4.
From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v3: sysret_rip test update for Intel FRED architecture.
Xin Li reported sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
On FRED systems, flags is 0x200a93 and r11 is 0xfeedfacedeadbeef here.
We need to remove or change this assertion, maybe: assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11] || r11_sentinel == ctx->uc_mcontext.gregs[REG_R11]);
in a FRED system. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %r11=%rflags.
There are two patches in this series.
How to test this:
$ make -C tools/testing/selftests/x86 $ tools/testing/selftests/x86/sysret_rip_64
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141- 9c001c2343af@intel.com Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23- 786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Acked-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org
## Changelog v3:
- Test that we don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point (hpa).
## Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (hpa). (Ref: https://lore.kernel.org/lkml/8f5c24df-514d-5d89-f58f-
ec8c3eb1e049@zytor.com )
Ammar Faizi (2): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11`
tools/testing/selftests/x86/sysret_rip.c | 120 ++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
Ammar Faizi
On January 24, 2023 1:32:14 PM PST, "Li, Xin3" xin3.li@intel.com wrote:
From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v3: sysret_rip test update for Intel FRED architecture.
Xin Li reported sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
On FRED systems, flags is 0x200a93 and r11 is 0xfeedfacedeadbeef here.
We need to remove or change this assertion, maybe: assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11] || r11_sentinel == ctx->uc_mcontext.gregs[REG_R11]);
in a FRED system. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %r11=%rflags.
There are two patches in this series.
How to test this:
$ make -C tools/testing/selftests/x86 $ tools/testing/selftests/x86/sysret_rip_64
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141- 9c001c2343af@intel.com Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23- 786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Acked-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org
## Changelog v3:
- Test that we don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point (hpa).
## Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (hpa). (Ref: https://lore.kernel.org/lkml/8f5c24df-514d-5d89-f58f-
ec8c3eb1e049@zytor.com )
Ammar Faizi (2): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11`
tools/testing/selftests/x86/sysret_rip.c | 120 ++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
Ammar Faizi
This should use check_regs_result() – which is exactly the reason I made that a separate function.
Xin Li reported sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
On FRED systems, flags is 0x200a93 and r11 is 0xfeedfacedeadbeef here.
We need to remove or change this assertion, maybe: assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11] || r11_sentinel == ctx->uc_mcontext.gregs[REG_R11]);
in a FRED system. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %r11=%rflags.
This should use check_regs_result() – which is exactly the reason I made that a separate function.
Exactly.
On Tue, Jan 24, 2023 at 01:37:43PM -0800, H. Peter Anvin wrote:
On January 24, 2023 1:32:14 PM PST, "Li, Xin3" xin3.li@intel.com wrote:
From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v3: sysret_rip test update for Intel FRED architecture.
Xin Li reported sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
On FRED systems, flags is 0x200a93 and r11 is 0xfeedfacedeadbeef here.
We need to remove or change this assertion, maybe: assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11] || r11_sentinel == ctx->uc_mcontext.gregs[REG_R11]);
This should use check_regs_result() – which is exactly the reason I made that a separate function.
Fixed in v4.
On 24/01/2023 9:32 pm, Li, Xin3 wrote:
From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v3: sysret_rip test update for Intel FRED architecture.
Xin Li reported sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
On FRED systems, flags is 0x200a93 and r11 is 0xfeedfacedeadbeef here.
Is this under SIMICS, or elsewhere? It's doubly weird that flags/%r11 match, but %rip/%rcx don't.
~Andrew
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
On FRED systems, flags is 0x200a93 and r11 is 0xfeedfacedeadbeef here.
Is this under SIMICS, or elsewhere? It's doubly weird that flags/%r11 match, but %rip/%rcx don't.
This is on Intel Simics, with FRED enabled.
Flags and %r11 don’t match on FRED, and %rip and %rcx also don't match.
I think it's expected.
Xin
From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v4. There are two patches in this series.
Xin Li reported that the sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
on the Intel FRED architecture. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %rcx=%rip and %r11=%rflags.
Syscall and sysenter in a FRED system are treated equivalently to software interrupts, e.g. INT 0x80. They do not modify any registers.
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141-9c001c2343af@intel.com
#### Changelog v4:
- Fix the assertion condition inside the SIGUSR1 handler (Xin Li).
- Explain the purpose of patch #2 in the commit message (HPA).
- Update commit message (Ammar).
- Repeat test_syscall_rcx_r11_consistent() 32 times to be more sure that the result is really consistent (Ammar).
#### Changelog v3:
- Test that we don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point (HPA).
#### Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (HPA).
Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
Ammar Faizi (2): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11`
tools/testing/selftests/x86/sysret_rip.c | 147 +++++++++++++++++++++-- 1 file changed, 138 insertions(+), 9 deletions(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
From: Ammar Faizi ammarfaizi2@gnuweeb.org
The current selftest asserts (%r11 == %rflags) after the 'syscall' returns to user. Such an assertion doesn't apply to the FRED system because in that system the 'syscall' instruction does not set %r11=%rflags and %rcx=%rip.
Handle the FRED case. Now, test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11. - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
The 'raise()' function from libc can't be used to control those registers. Therefore, create a syscall wrapper in inline Assembly to fully control them.
Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 127 +++++++++++++++++++++-- 1 file changed, 120 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 84d74be1d90207ab..86a31bbac9a85a88 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -39,6 +39,113 @@ asm ( extern const char test_page[]; static void const *current_test_page_addr = test_page;
+/* Arbitrary values */ +static const unsigned long r11_sentinel = 0xfeedfacedeadbeef; +static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; + +/* An arbitrary *valid* RFLAGS value */ +static const unsigned long rflags_sentinel = 0x200a93; + +enum regs_ok { + REGS_UNDEFINED = -2, /* For consistency checker init, never returned */ + REGS_ERROR = -1, /* Invalid register contents */ + REGS_SAVED = 0, /* Registers properly preserved */ + REGS_SYSRET = 1 /* Registers match syscall/sysret */ +}; + +/* + * Returns: + * 0 = %rcx and %r11 preserved. + * 1 = %rcx and %r11 set to %rflags and %rip. + * -1 = %rcx and/or %r11 set to any other values. + * + * @rbx should be set to the syscall return %rip. + */ +static enum regs_ok check_regs_result(unsigned long r11, unsigned long rcx, + unsigned long rbx) +{ + if (r11 == r11_sentinel && rcx == rcx_sentinel) + return REGS_SAVED; + + if (r11 == rflags_sentinel && rcx == rbx) + return REGS_SYSRET; + + printf("[FAIL] check_regs_result\n"); + printf(" r11_sentinel = %#lx; %%r11 = %#lx;\n", r11_sentinel, r11); + printf(" rcx_sentinel = %#lx; %%rcx = %#lx;\n", rcx_sentinel, rcx); + printf(" rflags_sentinel = %#lx\n", rflags_sentinel); + return REGS_ERROR; +} + +static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + static enum regs_ok regs_ok_state = REGS_UNDEFINED; + register unsigned long r11 asm("%r11"); + register unsigned long r10 asm("%r10"); + register unsigned long r8 asm("%r8"); + register unsigned long r9 asm("%r9"); + register void *rsp asm("%rsp"); + unsigned long rcx, rbx; + enum regs_ok ret; + + r11 = r11_sentinel; + rcx = rcx_sentinel; + r10 = arg4; + r8 = arg5; + r9 = arg6; + + asm volatile ( + "pushq %[rflags_sentinel]\n\t" + "popf\n\t" + "leaq 1f(%%rip), %[rbx]\n\t" + "syscall\n" + "1:" + + : "+a" (nr_syscall), + "+r" (r11), + "+c" (rcx), + [rbx] "=b" (rbx), + "+r" (rsp) /* Clobber the redzone */ + + : [rflags_sentinel] "g" (rflags_sentinel), + "D" (arg1), /* %rdi */ + "S" (arg2), /* %rsi */ + "d" (arg3), /* %rdx */ + "r" (r10), + "r" (r8), + "r" (r9) + + : "memory" + ); + + /* + * Test that: + * + * - "syscall" in a FRED system doesn't clobber %rcx and %r11. + * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. + */ + ret = check_regs_result(r11, rcx, rbx); + assert(ret != REGS_ERROR); + + /* + * Test that we don't get a mix of REGS_SAVED and REGS_SYSRET. + * It needs at least calling do_syscall() twice to assert. + */ + if (regs_ok_state == REGS_UNDEFINED) { + /* + * First time calling do_syscall(). + */ + regs_ok_state = ret; + return ret; + } else { + assert(regs_ok_state == ret); + } + + return nr_syscall; +} + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -85,27 +192,33 @@ static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) static void sigusr1(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + enum regs_ok ret;
memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+ ret = check_regs_result(ctx->uc_mcontext.gregs[REG_R11], + ctx->uc_mcontext.gregs[REG_RCX], + ctx->uc_mcontext.gregs[REG_RBX]); + + assert(ret != REGS_ERROR); + /* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip; - - /* R11 and EFLAGS should already match. */ - assert(ctx->uc_mcontext.gregs[REG_EFL] == - ctx->uc_mcontext.gregs[REG_R11]); - sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND);
- return; +} + +static void __raise(int sig) +{ + do_syscall(__NR_kill, getpid(), sig, 0, 0, 0, 0); }
static void test_sigreturn_to(unsigned long ip) { rip = ip; printf("[RUN]\tsigreturn to 0x%lx\n", ip); - raise(SIGUSR1); + __raise(SIGUSR1); }
static jmp_buf jmpbuf;
On Wed, Jan 25, 2023 at 10:22:39AM +0700, Ammar Faizi wrote:
- /*
* Test that we don't get a mix of REGS_SAVED and REGS_SYSRET.
* It needs at least calling do_syscall() twice to assert.
*/
- if (regs_ok_state == REGS_UNDEFINED) {
/*
* First time calling do_syscall().
*/
regs_ok_state = ret;
return ret;
- } else {
Oops, this should not be returning ret here. Ignore this version. I'll send v5.
On Wed, Jan 25, 2023 at 10:37:17AM +0700, Ammar Faizi wrote:
On Wed, Jan 25, 2023 at 10:22:39AM +0700, Ammar Faizi wrote:
- /*
* Test that we don't get a mix of REGS_SAVED and REGS_SYSRET.
* It needs at least calling do_syscall() twice to assert.
*/
- if (regs_ok_state == REGS_UNDEFINED) {
/*
* First time calling do_syscall().
*/
regs_ok_state = ret;
return ret;
- } else {
Oops, this should not be returning ret here. Ignore this version. I'll send v5.
Side note: This mistake doesn't affect the test correctness because we currenly don't use the return value of do_syscall().
But still worth fixing...
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Test that:
- REGS_SAVED: "syscall" in a FRED system doesn't clobber %rcx and %r11.
- REGS_SYSRET: "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with trivial system calls like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system.
Goals of this test:
- Ensure that the syscall behavior is consistent. It should be either always REGS_SAVED or always REGS_SYSRET. Not a mix of them.
- Ensure that the kernel doesn't leak its internal data when returning to userspace.
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 86a31bbac9a85a88..5b8576147c92dbbd 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -271,8 +271,24 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); }
+/* + * Ensure that various system calls are consistent. + * We should not get a mix of REGS_SAVED and REGS_SYSRET. + */ +static void test_syscall_rcx_r11_consistent(void) +{ + do_syscall(__NR_getpid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_gettid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_getppid, 0, 0, 0, 0, 0, 0); +} + int main() { + int i; + + for (i = 0; i < 32; i++) + test_syscall_rcx_r11_consistent(); + /* * When the kernel returns from a slow-path syscall, it will * detect whether SYSRET is appropriate. If it incorrectly @@ -280,7 +296,7 @@ int main() * it'll crash on Intel CPUs. */ sethandler(SIGUSR1, sigusr1, 0); - for (int i = 47; i < 64; i++) + for (i = 47; i < 64; i++) test_sigreturn_to(1UL<<i);
clearhandler(SIGUSR1); @@ -291,7 +307,7 @@ int main() test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
/* These are the interesting cases. */ - for (int i = 47; i < 64; i++) { + for (i = 47; i < 64; i++) { test_syscall_fallthrough_to((1UL<<i) - PAGE_SIZE); test_syscall_fallthrough_to(1UL<<i); }
From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v5. There are two patches in this series.
Xin Li reported that the sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
on the Intel FRED architecture. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %rcx=%rip and %r11=%rflags.
Syscall and sysenter in a FRED system are treated equivalently to software interrupts, e.g. INT 0x80. They do not modify any registers.
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141-9c001c2343af@intel.com
#### Changelog v5:
- Fix do_syscall() return value (Ammar).
#### Changelog v4:
- Fix the assertion condition inside the SIGUSR1 handler (Xin Li).
- Explain the purpose of patch #2 in the commit message (HPA).
- Update commit message (Ammar).
- Repeat test_syscall_rcx_r11_consistent() 32 times to be more sure that the result is really consistent (Ammar).
#### Changelog v3:
- Test that we don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point (HPA).
#### Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (HPA).
Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
Ammar Faizi (2): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11`
tools/testing/selftests/x86/sysret_rip.c | 146 +++++++++++++++++++++-- 1 file changed, 137 insertions(+), 9 deletions(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
From: Ammar Faizi ammarfaizi2@gnuweeb.org
The current selftest asserts (%r11 == %rflags) after the 'syscall' returns to user. Such an assertion doesn't apply to the FRED system because in that system the 'syscall' instruction does not set %r11=%rflags and %rcx=%rip.
Handle the FRED case. Now, test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11. - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
The 'raise()' function from libc can't be used to control those registers. Therefore, create a syscall wrapper in inline Assembly to fully control them.
Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 126 +++++++++++++++++++++-- 1 file changed, 119 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 84d74be1d90207ab..d3dc5ec496854179 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -39,6 +39,112 @@ asm ( extern const char test_page[]; static void const *current_test_page_addr = test_page;
+/* Arbitrary values */ +static const unsigned long r11_sentinel = 0xfeedfacedeadbeef; +static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; + +/* An arbitrary *valid* RFLAGS value */ +static const unsigned long rflags_sentinel = 0x200a93; + +enum regs_ok { + REGS_UNDEFINED = -2, /* For consistency checker init, never returned */ + REGS_ERROR = -1, /* Invalid register contents */ + REGS_SAVED = 0, /* Registers properly preserved */ + REGS_SYSRET = 1 /* Registers match syscall/sysret */ +}; + +/* + * Returns: + * 0 = %rcx and %r11 preserved. + * 1 = %rcx and %r11 set to %rflags and %rip. + * -1 = %rcx and/or %r11 set to any other values. + * + * @rbx should be set to the syscall return %rip. + */ +static enum regs_ok check_regs_result(unsigned long r11, unsigned long rcx, + unsigned long rbx) +{ + if (r11 == r11_sentinel && rcx == rcx_sentinel) + return REGS_SAVED; + + if (r11 == rflags_sentinel && rcx == rbx) + return REGS_SYSRET; + + printf("[FAIL] check_regs_result\n"); + printf(" r11_sentinel = %#lx; %%r11 = %#lx;\n", r11_sentinel, r11); + printf(" rcx_sentinel = %#lx; %%rcx = %#lx;\n", rcx_sentinel, rcx); + printf(" rflags_sentinel = %#lx\n", rflags_sentinel); + return REGS_ERROR; +} + +static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + static enum regs_ok regs_ok_state = REGS_UNDEFINED; + register unsigned long r11 asm("%r11"); + register unsigned long r10 asm("%r10"); + register unsigned long r8 asm("%r8"); + register unsigned long r9 asm("%r9"); + register void *rsp asm("%rsp"); + unsigned long rcx, rbx; + enum regs_ok ret; + + r11 = r11_sentinel; + rcx = rcx_sentinel; + r10 = arg4; + r8 = arg5; + r9 = arg6; + + asm volatile ( + "pushq %[rflags_sentinel]\n\t" + "popf\n\t" + "leaq 1f(%%rip), %[rbx]\n\t" + "syscall\n" + "1:" + + : "+a" (nr_syscall), + "+r" (r11), + "+c" (rcx), + [rbx] "=b" (rbx), + "+r" (rsp) /* Clobber the redzone */ + + : [rflags_sentinel] "g" (rflags_sentinel), + "D" (arg1), /* %rdi */ + "S" (arg2), /* %rsi */ + "d" (arg3), /* %rdx */ + "r" (r10), + "r" (r8), + "r" (r9) + + : "memory" + ); + + /* + * Test that: + * + * - "syscall" in a FRED system doesn't clobber %rcx and %r11. + * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. + */ + ret = check_regs_result(r11, rcx, rbx); + assert(ret != REGS_ERROR); + + /* + * Test that we don't get a mix of REGS_SAVED and REGS_SYSRET. + * It needs at least calling do_syscall() twice to assert. + */ + if (regs_ok_state == REGS_UNDEFINED) { + /* + * First time calling do_syscall(). + */ + regs_ok_state = ret; + } else { + assert(regs_ok_state == ret); + } + + return nr_syscall; +} + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -85,27 +191,33 @@ static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) static void sigusr1(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + enum regs_ok ret;
memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+ ret = check_regs_result(ctx->uc_mcontext.gregs[REG_R11], + ctx->uc_mcontext.gregs[REG_RCX], + ctx->uc_mcontext.gregs[REG_RBX]); + + assert(ret != REGS_ERROR); + /* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip; - - /* R11 and EFLAGS should already match. */ - assert(ctx->uc_mcontext.gregs[REG_EFL] == - ctx->uc_mcontext.gregs[REG_R11]); - sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND);
- return; +} + +static void __raise(int sig) +{ + do_syscall(__NR_kill, getpid(), sig, 0, 0, 0, 0); }
static void test_sigreturn_to(unsigned long ip) { rip = ip; printf("[RUN]\tsigreturn to 0x%lx\n", ip); - raise(SIGUSR1); + __raise(SIGUSR1); }
static jmp_buf jmpbuf;
On 1/24/23 19:49, Ammar Faizi wrote:
- /*
* Test that:
*
* - "syscall" in a FRED system doesn't clobber %rcx and %r11.
* - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
*/
- ret = check_regs_result(r11, rcx, rbx);
- assert(ret != REGS_ERROR);
- /*
* Test that we don't get a mix of REGS_SAVED and REGS_SYSRET.
* It needs at least calling do_syscall() twice to assert.
*/
- if (regs_ok_state == REGS_UNDEFINED) {
/*
* First time calling do_syscall().
*/
regs_ok_state = ret;
- } else {
assert(regs_ok_state == ret);
- }
[...]
- ret = check_regs_result(ctx->uc_mcontext.gregs[REG_R11],
ctx->uc_mcontext.gregs[REG_RCX],
ctx->uc_mcontext.gregs[REG_RBX]);
- assert(ret != REGS_ERROR);
This instance, too, needs to be checked against regs_ok_result. It would make most sense to move that handling, and the assert() into check_regs_result() or into a separate function around it.
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
It would be interesting to have the syscall handler try both with and without this (so it would end up doing both IRET and SYSCALL on legacy.) Perhaps SIGUSR1 versus SIGUSR2...
-hpa
On Wed, Jan 25, 2023 at 12:39:26AM -0800, H. Peter Anvin wrote:
- ret = check_regs_result(ctx->uc_mcontext.gregs[REG_R11],
ctx->uc_mcontext.gregs[REG_RCX],
ctx->uc_mcontext.gregs[REG_RBX]);
- assert(ret != REGS_ERROR);
This instance, too, needs to be checked against regs_ok_result. It would make most sense to move that handling, and the assert() into check_regs_result() or into a separate function around it.
OK. Sounds better.
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
It would be interesting to have the syscall handler try both with and without this (so it would end up doing both IRET and SYSCALL on legacy.) Perhaps SIGUSR1 versus SIGUSR2...
We will have a new separate patch for that.
On Wed, Jan 25, 2023 at 12:39:26AM -0800, H. Peter Anvin wrote:
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
It would be interesting to have the syscall handler try both with and without this (so it would end up doing both IRET and SYSCALL on legacy.) Perhaps SIGUSR1 versus SIGUSR2...
Just to clarify this more so I am sure I understand it correctly.
Did you mean to have the same signal handler without modifiying 'REG_RCX' but still change 'REG_RIP'?
IOW, we want to only *remove*:
ctx->uc_mcontext.gregs[REG_RCX] = rip;
and *keep*:
ctx->uc_mcontext.gregs[REG_RIP] = rip;
for the SIGUSR2 handler. Thus, inside the entry64 we will jump to the iret path because %rcx != %r11 upon rt_sigreturn()?
On 1/25/23 4:57 PM, Ammar Faizi wrote:
On Wed, Jan 25, 2023 at 12:39:26AM -0800, H. Peter Anvin wrote:
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
It would be interesting to have the syscall handler try both with and without this (so it would end up doing both IRET and SYSCALL on legacy.) Perhaps SIGUSR1 versus SIGUSR2...
Just to clarify this more so I am sure I understand it correctly.
Did you mean to have the same signal handler without modifiying 'REG_RCX' but still change 'REG_RIP'?
IOW, we want to only *remove*:
ctx->uc_mcontext.gregs[REG_RCX] = rip;
and *keep*:
ctx->uc_mcontext.gregs[REG_RIP] = rip;
for the SIGUSR2 handler. Thus, inside the entry64 we will jump to the iret path because %rcx != %r11 upon rt_sigreturn()?
s/%rcx != %r11/%rcx != %rip/
On January 25, 2023 1:57:15 AM PST, Ammar Faizi ammarfaizi2@gnuweeb.org wrote:
On Wed, Jan 25, 2023 at 12:39:26AM -0800, H. Peter Anvin wrote:
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
It would be interesting to have the syscall handler try both with and without this (so it would end up doing both IRET and SYSCALL on legacy.) Perhaps SIGUSR1 versus SIGUSR2...
Just to clarify this more so I am sure I understand it correctly.
Did you mean to have the same signal handler without modifiying 'REG_RCX' but still change 'REG_RIP'?
IOW, we want to only *remove*:
ctx->uc_mcontext.gregs[REG_RCX] = rip;
and *keep*:
ctx->uc_mcontext.gregs[REG_RIP] = rip;
for the SIGUSR2 handler. Thus, inside the entry64 we will jump to the iret path because %rcx != %r11 upon rt_sigreturn()?
I guess it would depend on what they "normally" are. My #1 impulse would be to leave them both unchanged.
On Wed, Jan 25, 2023 at 02:17:41AM -0800, H. Peter Anvin wrote:
I guess it would depend on what they "normally" are. My #1 impulse would be to leave them both unchanged.
Ah okay... I think I understand now. My confusion came from a comment in that code.
The current SIGUSR1 handler has a comment:
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
So I thought if we leave them both unchanged, then SYSRET can happen too, because IP and CX match. My initial confusion about that was:
Where do we actually exercise IRET if the SIGUSR2 handler exercises SYSRET then?
I realized my assumption was wrong. The current SIGUSR1 handler actually forces the kernel to use IRET, not SYSRET. Because the %rip is set to a non-canonical address. So that's the place where it exercises IRET.
IOW, my understanding now:
The current SIGUSR1 handler exercises the SYSRET-appropriate condition detector in the kernel. It doesn't actually go to the SYSRET path despite the comment saying "SYSRET can happen". That detector must take us to the IRET path or we will #GP in kernel space on Intel CPUs.
In short, the SIGUSR1 handler asserts that "SYSRET must *not* happen".
The expected SIGUSR2 handler addition exercises the SYSRET path by leaving REG_IP and REG_CX unchanged.
Am I correct?
On January 25, 2023 3:37:23 AM PST, Ammar Faizi ammarfaizi2@gnuweeb.org wrote:
On Wed, Jan 25, 2023 at 02:17:41AM -0800, H. Peter Anvin wrote:
I guess it would depend on what they "normally" are. My #1 impulse would be to leave them both unchanged.
Ah okay... I think I understand now. My confusion came from a comment in that code.
The current SIGUSR1 handler has a comment:
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
So I thought if we leave them both unchanged, then SYSRET can happen too, because IP and CX match. My initial confusion about that was:
Where do we actually exercise IRET if the SIGUSR2 handler exercises SYSRET then?
I realized my assumption was wrong. The current SIGUSR1 handler actually forces the kernel to use IRET, not SYSRET. Because the %rip is set to a non-canonical address. So that's the place where it exercises IRET.
IOW, my understanding now:
The current SIGUSR1 handler exercises the SYSRET-appropriate condition detector in the kernel. It doesn't actually go to the SYSRET path despite the comment saying "SYSRET can happen". That detector must take us to the IRET path or we will #GP in kernel space on Intel CPUs.
In short, the SIGUSR1 handler asserts that "SYSRET must *not* happen".
The expected SIGUSR2 handler addition exercises the SYSRET path by leaving REG_IP and REG_CX unchanged.
Am I correct?
That's the idea.
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Test that:
- REGS_SAVED: "syscall" in a FRED system doesn't clobber %rcx and %r11.
- REGS_SYSRET: "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with trivial system calls like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system.
Goals of this test:
- Ensure that the syscall behavior is consistent. It should be either always REGS_SAVED or always REGS_SYSRET. Not a mix of them.
- The kernel doesn't leak its internal data when returning to userspace.
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index d3dc5ec496854179..7ebfb603bf45fa0a 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -270,8 +270,24 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); }
+/* + * Ensure that various system calls are consistent. + * We should not get a mix of REGS_SAVED and REGS_SYSRET. + */ +static void test_syscall_rcx_r11_consistent(void) +{ + do_syscall(__NR_getpid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_gettid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_getppid, 0, 0, 0, 0, 0, 0); +} + int main() { + int i; + + for (i = 0; i < 32; i++) + test_syscall_rcx_r11_consistent(); + /* * When the kernel returns from a slow-path syscall, it will * detect whether SYSRET is appropriate. If it incorrectly @@ -279,7 +295,7 @@ int main() * it'll crash on Intel CPUs. */ sethandler(SIGUSR1, sigusr1, 0); - for (int i = 47; i < 64; i++) + for (i = 47; i < 64; i++) test_sigreturn_to(1UL<<i);
clearhandler(SIGUSR1); @@ -290,7 +306,7 @@ int main() test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
/* These are the interesting cases. */ - for (int i = 47; i < 64; i++) { + for (i = 47; i < 64; i++) { test_syscall_fallthrough_to((1UL<<i) - PAGE_SIZE); test_syscall_fallthrough_to(1UL<<i); }
This version passes on FRED, thanks a lot for quickly fixing it.
Xin
From: Ammar Faizi ammarfaizi2@gnuweeb.org
This is an RFC patchset v5. There are two patches in this series.
Xin Li reported that the sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
on the Intel FRED architecture. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %rcx=%rip and %r11=%rflags.
Syscall and sysenter in a FRED system are treated equivalently to software interrupts, e.g. INT 0x80. They do not modify any registers.
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141- 9c001c2343af@intel.com
#### Changelog v5:
- Fix do_syscall() return value (Ammar).
#### Changelog v4:
Fix the assertion condition inside the SIGUSR1 handler (Xin Li).
Explain the purpose of patch #2 in the commit message (HPA).
Update commit message (Ammar).
Repeat test_syscall_rcx_r11_consistent() 32 times to be more sure that the result is really consistent (Ammar).
#### Changelog v3:
- Test that we don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point (HPA).
#### Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (HPA).
Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org
Ammar Faizi (2): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11`
tools/testing/selftests/x86/sysret_rip.c | 146 +++++++++++++++++++++-- 1 file changed, 137 insertions(+), 9 deletions(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
Ammar Faizi
On Wed, Jan 25, 2023 at 08:22:48AM +0000, Li, Xin3 wrote:
This version passes on FRED, thanks a lot for quickly fixing it.
Great!
Can you pick these two patches and include it in the next version of "x86: enable FRED for x86-64" RFC patchset?
This version passes on FRED, thanks a lot for quickly fixing it.
Great!
Can you pick these two patches and include it in the next version of "x86: enable FRED for x86-64" RFC patchset?
Would it be better to get this patch set merged first?
Otherwise surely I will include it in the FRED patch set.
Xin
On January 25, 2023 9:07:18 AM PST, "Li, Xin3" xin3.li@intel.com wrote:
This version passes on FRED, thanks a lot for quickly fixing it.
Great!
Can you pick these two patches and include it in the next version of "x86: enable FRED for x86-64" RFC patchset?
Would it be better to get this patch set merged first?
Otherwise surely I will include it in the FRED patch set.
Xin
If the maintainers are ok with it, it would be better to merge it sooner: once we have agreed on the semantics, which I believe we have, we should be testing those semantics and nothing else.
On Wed, Jan 25, 2023 at 09:24:40AM -0800, H. Peter Anvin wrote:
On January 25, 2023 9:07:18 AM PST, "Li, Xin3" xin3.li@intel.com wrote:
Would it be better to get this patch set merged first?
Otherwise surely I will include it in the FRED patch set.
If the maintainers are ok with it, it would be better to merge it sooner: once we have agreed on the semantics, which I believe we have, we should be testing those semantics and nothing else.
OK, let's keep this patchset separated from the FRED support patchset.
In the meantime, let me address the recent HPA's comments.
Would it be better to get this patch set merged first?
Otherwise surely I will include it in the FRED patch set.
If the maintainers are ok with it, it would be better to merge it sooner: once we have agreed on the semantics, which I believe we have, we should be testing those semantics and nothing else.
OK, let's keep this patchset separated from the FRED support patchset.
Thanks!
This patch set first makes the R11/RCX semantics clearer, and it BTW fixes FRED tests.
To me it's more of an improvement to the existing code.
Would it be better to get this patch set merged first?
Otherwise surely I will include it in the FRED patch set.
If the maintainers are ok with it, it would be better to merge it sooner: once we have agreed on the semantics, which I believe we have, we should be testing those semantics and nothing else.
OK, let's keep this patchset separated from the FRED support patchset.
Thanks!
This patch set first makes the R11/RCX semantics clearer, and it BTW fixes FRED tests.
To me it's more of an improvement to the existing code.
Hi Faizi,
Any update on this patch set?
Thanks! Xin
On Wed, Feb 15, 2023 at 07:42:47AM +0000, Li, Xin3 wrote:
Hi Faizi,
Any update on this patch set?
Xin,
Before I send the next version, I need an answer for this one: https://lore.kernel.org/lkml/Y9LfmQ%2Fr1%2FpEP+uv@biznet-home.integral.gnuwe...
I don't think the redzone problem is handled correctly here. Using "+r" (rsp) constraint doesn't solve the redzone problem.
HPA, Andrew, anybody?
On Wed, Feb 15, 2023 at 07:42:47AM +0000, Li, Xin3 wrote:
Hi Faizi,
Any update on this patch set?
No comment from HPA. But after the recent discussion with Andrew, I think at least it's now clear that we are not going to use "+r"(rsp) to avoid the red zone problem.
I am on leave today. Will send revision v8 on Monday.
Thanks,
On February 17, 2023 8:27:40 PM PST, Ammar Faizi ammarfaizi2@gnuweeb.org wrote:
On Wed, Feb 15, 2023 at 07:42:47AM +0000, Li, Xin3 wrote:
Hi Faizi,
Any update on this patch set?
No comment from HPA. But after the recent discussion with Andrew, I think at least it's now clear that we are not going to use "+r"(rsp) to avoid the red zone problem.
I am on leave today. Will send revision v8 on Monday.
Thanks,
My apologies, I missed your latest response in the torrent of email. The redzone issue is weird; it ought to be breaking all over the place, not just this.
Let me take a quick look at it...
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Hi,
This is an RFC patchset v6. There are three patches in this series.
Xin Li reported that the sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
on the Intel FRED architecture. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %rcx=%rip and %r11=%rflags.
Syscall and sysenter in a FRED system are treated equivalently to software interrupts, e.g. INT 0x80. They do not modify any registers.
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141-9c001c2343af@intel.com
## Changelog v6:
- Move the check-regs assertion in sigusr1() to check_regs_result() (HPA).
- Add a new test just like sigusr1(), but don't modify REG_RCX and REG_R11. This is used to test SYSRET behavior consistency (HPA).
## Changelog v5:
- Fix do_syscall() return value (Ammar).
## Changelog v4:
- Fix the assertion condition inside the SIGUSR1 handler (Xin Li).
- Explain the purpose of patch #2 in the commit message (HPA).
- Update commit message (Ammar).
- Repeat test_syscall_rcx_r11_consistent() 32 times to be more sure that the result is really consistent (Ammar).
## Changelog v3:
- Test that we don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point (HPA).
## Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (HPA).
Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
Ammar Faizi (3): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11` selftests/x86: sysret_rip: Test opportunistic SYSRET
tools/testing/selftests/x86/sysret_rip.c | 177 +++++++++++++++++++++-- 1 file changed, 168 insertions(+), 9 deletions(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
The current selftest asserts (%r11 == %rflags) after the 'syscall' returns to user. Such an assertion doesn't apply to the FRED system because in that system the 'syscall' instruction does not set %r11=%rflags and %rcx=%rip.
Handle the FRED case. Now, test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11. - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
The 'raise()' function from libc can't be used to control those registers. Therefore, create a syscall wrapper in inline Assembly to fully control them.
Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 120 +++++++++++++++++++++-- 1 file changed, 113 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 84d74be1d90207ab..100f55981d77a29b 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -39,6 +39,110 @@ asm ( extern const char test_page[]; static void const *current_test_page_addr = test_page;
+/* Arbitrary values */ +static const unsigned long r11_sentinel = 0xfeedfacedeadbeef; +static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; + +/* An arbitrary *valid* RFLAGS value */ +static const unsigned long rflags_sentinel = 0x200a93; + +enum regs_ok { + REGS_UNDEFINED = -1, /* For consistency checker init, never returned */ + REGS_SAVED = 0, /* Registers properly preserved */ + REGS_SYSRET = 1 /* Registers match syscall/sysret */ +}; + +/* + * REGS_SAVED = %rcx and %r11 preserved. + * REGS_SYSRET = %rcx and %r11 set to %rflags and %rip. + * REGS_ERROR = %rcx and/or %r11 set to any other values. + * + * @rbx should be set to the syscall return %rip. + */ +static void check_regs_result(unsigned long r11, unsigned long rcx, + unsigned long rbx) +{ + static enum regs_ok regs_ok_state = REGS_UNDEFINED; + enum regs_ok ret; + + /* + * Test that: + * + * - "syscall" in a FRED system doesn't clobber %rcx and %r11. + * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. + */ + if (r11 == r11_sentinel && rcx == rcx_sentinel) { + ret = REGS_SAVED; + } else if (r11 == rflags_sentinel && rcx == rbx) { + ret = REGS_SYSRET; + } else { + printf("[FAIL] check_regs_result\n"); + printf(" r11_sentinel = %#lx; %%r11 = %#lx;\n", r11_sentinel, r11); + printf(" rcx_sentinel = %#lx; %%rcx = %#lx;\n", rcx_sentinel, rcx); + printf(" rflags_sentinel = %#lx\n", rflags_sentinel); + exit(1); + } + + + /* + * Test that we don't get a mix of REGS_SAVED and REGS_SYSRET. + * It needs at least calling check_regs_result() twice to assert. + */ + if (regs_ok_state == REGS_UNDEFINED) { + /* + * First time calling check_regs_result(). + */ + regs_ok_state = ret; + } else { + assert(regs_ok_state == ret); + } +} + +static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + register unsigned long r11 asm("%r11"); + register unsigned long r10 asm("%r10"); + register unsigned long r8 asm("%r8"); + register unsigned long r9 asm("%r9"); + register void *rsp asm("%rsp"); + unsigned long rcx, rbx; + + r11 = r11_sentinel; + rcx = rcx_sentinel; + r10 = arg4; + r8 = arg5; + r9 = arg6; + + asm volatile ( + "pushq %[rflags_sentinel]\n\t" + "popf\n\t" + "leaq 1f(%%rip), %[rbx]\n\t" + "syscall\n" + "1:" + + : "+a" (nr_syscall), + "+r" (r11), + "+c" (rcx), + [rbx] "=b" (rbx), + "+r" (rsp) /* Clobber the redzone */ + + : [rflags_sentinel] "g" (rflags_sentinel), + "D" (arg1), /* %rdi */ + "S" (arg2), /* %rsi */ + "d" (arg3), /* %rdx */ + "r" (r10), + "r" (r8), + "r" (r9) + + : "memory" + ); + + check_regs_result(r11, rcx, rbx); + return nr_syscall; +} + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -88,24 +192,26 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+ check_regs_result(ctx->uc_mcontext.gregs[REG_R11], + ctx->uc_mcontext.gregs[REG_RCX], + ctx->uc_mcontext.gregs[REG_RBX]); + /* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip; - - /* R11 and EFLAGS should already match. */ - assert(ctx->uc_mcontext.gregs[REG_EFL] == - ctx->uc_mcontext.gregs[REG_R11]); - sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND); +}
- return; +static void __raise(int sig) +{ + do_syscall(__NR_kill, getpid(), sig, 0, 0, 0, 0); }
static void test_sigreturn_to(unsigned long ip) { rip = ip; printf("[RUN]\tsigreturn to 0x%lx\n", ip); - raise(SIGUSR1); + __raise(SIGUSR1); }
static jmp_buf jmpbuf;
On Thu, Jan 26, 2023 at 04:17:12AM +0700, Ammar Faizi wrote:
+/*
- REGS_SAVED = %rcx and %r11 preserved.
- REGS_SYSRET = %rcx and %r11 set to %rflags and %rip.
- REGS_ERROR = %rcx and/or %r11 set to any other values.
Since we moved the assertion into check_regs_result(), we no longer need REGS_ERROR. check_regs_result() is now a void function.
Will remove that comment...
Test that:
- REGS_SAVED: "syscall" in a FRED system doesn't clobber %rcx and %r11.
- REGS_SYSRET: "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with trivial system calls like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system.
Goals of this test:
- Ensure that the syscall behavior is consistent. It should be either always REGS_SAVED or always REGS_SYSRET. Not a mix of them.
- The kernel doesn't leak its internal data when returning to userspace.
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 100f55981d77a29b..d45b7f0147cd25ad 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -264,8 +264,24 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); }
+/* + * Ensure that various system calls are consistent. + * We should not get a mix of REGS_SAVED and REGS_SYSRET. + */ +static void test_syscall_rcx_r11_consistent(void) +{ + do_syscall(__NR_getpid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_gettid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_getppid, 0, 0, 0, 0, 0, 0); +} + int main() { + int i; + + for (i = 0; i < 32; i++) + test_syscall_rcx_r11_consistent(); + /* * When the kernel returns from a slow-path syscall, it will * detect whether SYSRET is appropriate. If it incorrectly @@ -273,7 +289,7 @@ int main() * it'll crash on Intel CPUs. */ sethandler(SIGUSR1, sigusr1, 0); - for (int i = 47; i < 64; i++) + for (i = 47; i < 64; i++) test_sigreturn_to(1UL<<i);
clearhandler(SIGUSR1); @@ -284,7 +300,7 @@ int main() test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
/* These are the interesting cases. */ - for (int i = 47; i < 64; i++) { + for (i = 47; i < 64; i++) { test_syscall_fallthrough_to((1UL<<i) - PAGE_SIZE); test_syscall_fallthrough_to(1UL<<i); }
When run on a non-FRED system, test the opportunistic SYSRET fast-path. Make sure the %rcx/%r11 clobbering behavior is consistent.
When run on a FRED system, test that %rcx/%r11 are preserved when invoking syscall.
This is similar to what test_syscall_rcx_r11_consistent() is doing, but with addition it's done via the SIGUSR2 signal handler.
Link: https://lore.kernel.org/lkml/8770815f-0f23-d0c5-e56a-d401827842c9@zytor.com Suggested-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
On Wed, 25 Jan 2023 00:39:26 -0800, "H. Peter Anvin" wrote:
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
It would be interesting to have the syscall handler try both with and without this (so it would end up doing both IRET and SYSCALL on legacy.) Perhaps SIGUSR1 versus SIGUSR2...
tools/testing/selftests/x86/sysret_rip.c | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index d45b7f0147cd25ad..a1e5ec6f08bcd523 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -275,6 +275,28 @@ static void test_syscall_rcx_r11_consistent(void) do_syscall(__NR_getppid, 0, 0, 0, 0, 0, 0); }
+static unsigned long usr2_rcx; +static unsigned long usr2_r11; + +static void sigusr2(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + usr2_r11 = ctx->uc_mcontext.gregs[REG_R11]; + usr2_rcx = ctx->uc_mcontext.gregs[REG_RCX]; + + check_regs_result(ctx->uc_mcontext.gregs[REG_R11], + ctx->uc_mcontext.gregs[REG_RCX], + ctx->uc_mcontext.gregs[REG_RBX]); +} + +static void test_sysret_consistent(void) +{ + printf("[RUN]\ttest_sysret_consistent\n"); + __raise(SIGUSR2); + printf("[OK]\tRCX = %#lx; R11 = %#lx\n", usr2_rcx, usr2_r11); +} + int main() { int i; @@ -292,6 +314,21 @@ int main() for (i = 47; i < 64; i++) test_sigreturn_to(1UL<<i);
+ /* + * + * When run on a non-FRED system, test the SYSRET path. Make + * sure the %rcx/%r11 clobbering behavior is consistent. + * + * When run on a FRED system, test that %rcx/%r11 are preserved + * when invoking syscall. + * + * This is similar to test_syscall_rcx_r11_consistent(), but via + * a signal handler. + */ + sethandler(SIGUSR2, sigusr2, 0); + for (i = 0; i < 32; i++) + test_sysret_consistent(); + clearhandler(SIGUSR1);
sethandler(SIGSEGV, sigsegv_for_fallthrough, 0);
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Hi,
This is an RFC patchset v7. There are three patches in this series.
Xin Li reported that the sysret_rip test fails at:
assert(ctx->uc_mcontext.gregs[REG_EFL] == ctx->uc_mcontext.gregs[REG_R11]);
on the Intel FRED architecture. Let's handle the FRED system scenario too. The 'syscall' instruction in a FRED system doesn't set %rcx=%rip and %r11=%rflags.
Syscall and sysenter in a FRED system are treated equivalently to software interrupts, e.g. INT 0x80. They do not modify any registers.
Link: https://lore.kernel.org/lkml/5d4ad3e3-034f-c7da-d141-9c001c2343af@intel.com
## Changelog v7:
- Fix comment, REGS_ERROR no longer exists in the enum (Ammar).
- Update commit message (Ammar).
## Changelog v6:
- Move the check-regs assertion in sigusr1() to check_regs_result() (HPA).
- Add a new test just like sigusr1(), but don't modify REG_RCX and REG_R11. This is used to test SYSRET behavior consistency (HPA).
## Changelog v5:
- Fix do_syscall() return value (Ammar).
## Changelog v4:
- Fix the assertion condition inside the SIGUSR1 handler (Xin Li).
- Explain the purpose of patch #2 in the commit message (HPA).
- Update commit message (Ammar).
- Repeat test_syscall_rcx_r11_consistent() 32 times to be more sure that the result is really consistent (Ammar).
## Changelog v3:
- Test that we don't get a mix of REGS_SAVED and REGS_SYSRET, which is a major part of the point (HPA).
## Changelog v2:
- Use "+r"(rsp) as the right way to avoid redzone problems per Andrew's comment (HPA).
Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
Ammar Faizi (3): selftests/x86: sysret_rip: Handle syscall in a FRED system selftests/x86: sysret_rip: Add more syscall tests with respect to `%rcx` and `%r11` selftests/x86: sysret_rip: Test SYSRET with a signal handler
tools/testing/selftests/x86/sysret_rip.c | 171 +++++++++++++++++++++-- 1 file changed, 162 insertions(+), 9 deletions(-)
base-commit: e12ad468c22065a2826b2fc4c11d2113a7975301
From: Ammar Faizi ammarfaizi2@gnuweeb.org
The current selftest asserts (%r11 == %rflags) after the 'syscall' returns to user. Such an assertion doesn't apply to the FRED system because in that system the 'syscall' instruction does not set %r11=%rflags and %rcx=%rip.
Handle the FRED case. Now, test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11. - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
The 'raise()' function from libc can't be used to control those registers. Therefore, create a syscall wrapper in inline Assembly to fully control them.
Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses") Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Reported-by: Xin Li xin3.li@intel.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 114 +++++++++++++++++++++-- 1 file changed, 107 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 84d74be1d90207ab..ef3f492d95f6f2a1 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -39,6 +39,104 @@ asm ( extern const char test_page[]; static void const *current_test_page_addr = test_page;
+/* Arbitrary values */ +static const unsigned long r11_sentinel = 0xfeedfacedeadbeef; +static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e; + +/* An arbitrary *valid* RFLAGS value */ +static const unsigned long rflags_sentinel = 0x200a93; + +enum regs_ok { + REGS_UNDEFINED = -1, /* For consistency checker init, never returned */ + REGS_SAVED = 0, /* Registers properly preserved */ + REGS_SYSRET = 1 /* Registers match syscall/sysret */ +}; + +/* + * @rbx should be set to the syscall return %rip. + */ +static void check_regs_result(unsigned long r11, unsigned long rcx, + unsigned long rbx) +{ + static enum regs_ok regs_ok_state = REGS_UNDEFINED; + enum regs_ok ret; + + /* + * REGS_SAVED = %rcx and %r11 preserved (Intel FRED). + * REGS_SYSRET = %rcx and %r11 set to %rflags and %rip. + */ + if (r11 == r11_sentinel && rcx == rcx_sentinel) { + ret = REGS_SAVED; + } else if (r11 == rflags_sentinel && rcx == rbx) { + ret = REGS_SYSRET; + } else { + printf("[FAIL] check_regs_result\n"); + printf(" r11_sentinel = %#lx; %%r11 = %#lx;\n", r11_sentinel, r11); + printf(" rcx_sentinel = %#lx; %%rcx = %#lx;\n", rcx_sentinel, rcx); + printf(" rflags_sentinel = %#lx\n", rflags_sentinel); + exit(1); + } + + + /* + * Test that we don't get a mix of REGS_SAVED and REGS_SYSRET. + * It needs at least calling check_regs_result() twice to assert. + */ + if (regs_ok_state == REGS_UNDEFINED) { + /* + * First time calling check_regs_result(). + */ + regs_ok_state = ret; + } else { + assert(regs_ok_state == ret); + } +} + +static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + register unsigned long r11 asm("%r11"); + register unsigned long r10 asm("%r10"); + register unsigned long r8 asm("%r8"); + register unsigned long r9 asm("%r9"); + register void *rsp asm("%rsp"); + unsigned long rcx, rbx; + + r11 = r11_sentinel; + rcx = rcx_sentinel; + r10 = arg4; + r8 = arg5; + r9 = arg6; + + asm volatile ( + "pushq %[rflags_sentinel]\n\t" + "popf\n\t" + "leaq 1f(%%rip), %[rbx]\n\t" + "syscall\n" + "1:" + + : "+a" (nr_syscall), + "+r" (r11), + "+c" (rcx), + [rbx] "=b" (rbx), + "+r" (rsp) /* Clobber the redzone */ + + : [rflags_sentinel] "g" (rflags_sentinel), + "D" (arg1), /* %rdi */ + "S" (arg2), /* %rsi */ + "d" (arg3), /* %rdx */ + "r" (r10), + "r" (r8), + "r" (r9) + + : "memory" + ); + + check_regs_result(r11, rcx, rbx); + return nr_syscall; +} + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -88,24 +186,26 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+ check_regs_result(ctx->uc_mcontext.gregs[REG_R11], + ctx->uc_mcontext.gregs[REG_RCX], + ctx->uc_mcontext.gregs[REG_RBX]); + /* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip; - - /* R11 and EFLAGS should already match. */ - assert(ctx->uc_mcontext.gregs[REG_EFL] == - ctx->uc_mcontext.gregs[REG_R11]); - sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND); +}
- return; +static void __raise(int sig) +{ + do_syscall(__NR_kill, getpid(), sig, 0, 0, 0, 0); }
static void test_sigreturn_to(unsigned long ip) { rip = ip; printf("[RUN]\tsigreturn to 0x%lx\n", ip); - raise(SIGUSR1); + __raise(SIGUSR1); }
static jmp_buf jmpbuf;
From: Ammar Faizi ammarfaizi2@gnuweeb.org
Test that:
- REGS_SAVED: "syscall" in a FRED system doesn't clobber %rcx and %r11.
- REGS_SYSRET: "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
Test them out with trivial system calls like __NR_getppid and friends which are extremely likely to return with SYSRET on an IDT system.
Goals of this test:
- Ensure that the syscall behavior is consistent. It must be either always REGS_SAVED or always REGS_SYSRET. Not a mix of them.
- The kernel doesn't leak its internal data when returning to userspace.
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com Co-developed-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org --- tools/testing/selftests/x86/sysret_rip.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index ef3f492d95f6f2a1..d688cb9d5ac919eb 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -258,8 +258,24 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); }
+/* + * Ensure that various system calls are consistent. + * We must not get a mix of REGS_SAVED and REGS_SYSRET. + */ +static void test_syscall_rcx_r11_consistent(void) +{ + do_syscall(__NR_getpid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_gettid, 0, 0, 0, 0, 0, 0); + do_syscall(__NR_getppid, 0, 0, 0, 0, 0, 0); +} + int main() { + int i; + + for (i = 0; i < 32; i++) + test_syscall_rcx_r11_consistent(); + /* * When the kernel returns from a slow-path syscall, it will * detect whether SYSRET is appropriate. If it incorrectly @@ -267,7 +283,7 @@ int main() * it'll crash on Intel CPUs. */ sethandler(SIGUSR1, sigusr1, 0); - for (int i = 47; i < 64; i++) + for (i = 47; i < 64; i++) test_sigreturn_to(1UL<<i);
clearhandler(SIGUSR1); @@ -278,7 +294,7 @@ int main() test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
/* These are the interesting cases. */ - for (int i = 47; i < 64; i++) { + for (i = 47; i < 64; i++) { test_syscall_fallthrough_to((1UL<<i) - PAGE_SIZE); test_syscall_fallthrough_to(1UL<<i); }
From: Ammar Faizi ammarfaizi2@gnuweeb.org
When run on a non-FRED system, test the SYSRET path. Make sure the %rcx/%r11 clobbering behavior is consistent.
When run on a FRED system, test that %rcx/%r11 are preserved when invoking syscall.
This is similar to what test_syscall_rcx_r11_consistent() is doing, but with addition it's done via the SIGUSR2 signal handler.
Link: https://lore.kernel.org/lkml/8770815f-0f23-d0c5-e56a-d401827842c9@zytor.com Suggested-by: H. Peter Anvin (Intel) hpa@zytor.com Signed-off-by: Ammar Faizi ammarfaizi2@gnuweeb.org ---
On Wed, 25 Jan 2023 00:39:26 -0800, "H. Peter Anvin" wrote:
/* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip;
It would be interesting to have the syscall handler try both with and without this (so it would end up doing both IRET and SYSCALL on legacy.) Perhaps SIGUSR1 versus SIGUSR2...
tools/testing/selftests/x86/sysret_rip.c | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index d688cb9d5ac919eb..630ee261559c14b1 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -269,6 +269,28 @@ static void test_syscall_rcx_r11_consistent(void) do_syscall(__NR_getppid, 0, 0, 0, 0, 0, 0); }
+static unsigned long usr2_rcx; +static unsigned long usr2_r11; + +static void sigusr2(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + usr2_r11 = ctx->uc_mcontext.gregs[REG_R11]; + usr2_rcx = ctx->uc_mcontext.gregs[REG_RCX]; + + check_regs_result(ctx->uc_mcontext.gregs[REG_R11], + ctx->uc_mcontext.gregs[REG_RCX], + ctx->uc_mcontext.gregs[REG_RBX]); +} + +static void test_sysret_consistent(void) +{ + printf("[RUN]\ttest_sysret_consistent\n"); + __raise(SIGUSR2); + printf("[OK]\tRCX = %#lx; R11 = %#lx\n", usr2_rcx, usr2_r11); +} + int main() { int i; @@ -286,6 +308,21 @@ int main() for (i = 47; i < 64; i++) test_sigreturn_to(1UL<<i);
+ /* + * + * When run on a non-FRED system, test the SYSRET path. Make + * sure the %rcx/%r11 clobbering behavior is consistent. + * + * When run on a FRED system, test that %rcx/%r11 are preserved + * when invoking syscall. + * + * This is similar to test_syscall_rcx_r11_consistent(), but via + * a signal handler. + */ + sethandler(SIGUSR2, sigusr2, 0); + for (i = 0; i < 32; i++) + test_sysret_consistent(); + clearhandler(SIGUSR1);
sethandler(SIGSEGV, sigsegv_for_fallthrough, 0);
linux-kselftest-mirror@lists.linaro.org