On Wed, Feb 22, 2023, Chang S. Bae wrote:
On 2/22/2023 10:40 AM, Mingwei Zhang wrote:
We have this [1]:
if (fpu_state_size_dynamic()) mask &= (header.xfeatures | xinit->header.xcomp_bv);If header.xfeatures[18] = 0 then mask[18] = 0 because xinit->header.xcomp_bv[18] = 0. Then, it won't hit that code. So, I'm confused about the problem that you described here.
Read the suggested changelog I wrote in my reply to Mingwei.
TLDR:
xsave.header.xfeatures[18] = 1 xinit.header.xfeatures[18] = 0 -> mask[18] = 1 -> __raw_xsave_addr(xsave, 18) <- Success -> __raw_xsave_addr(xinit, 18) <- WARNOh, sigh.. This should be caught last time.
Hmm, then since we store init state for legacy ones [1], unless it is too aggressive, perhaps the loop can be simplified like this:
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 714166cc25f2..2dac6f5f3ade 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1118,21 +1118,13 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, zerofrom = offsetof(struct xregs_state, extended_state_area);
/*
* The ptrace buffer is in non-compacted XSAVE format. In* non-compacted format disabled features still occupy state space,* but there is no state to copy from in the compacted* init_fpstate. The gap tracking will zero these states.
* Indicate which states to copy from fpstate. When not present in* fpstate, those extended states are either initialized or* disabled. They are also known to have an all zeros init state.* Thus, remove them from 'mask' to zero those features in the user* buffer instead of retrieving them from init_fpstate. */
mask = fpstate->user_xfeatures;
Do we need to change this line and the comments? I don't see any of these was relevant to this issue. The original code semantic is to traverse all user_xfeatures, if it is available in fpstate, copy it from there; otherwise, copy it from init_fpstate. We do not assume the component in init_fpstate (but not in fpstate) are all zeros, do we? If it is safe to assume that, then it might be ok. But at least in this patch, I want to keep the original semantics as is without the assumption.
/** Dynamic features are not present in init_fpstate. When they are* in an all zeros init state, remove those from 'mask' to zero* those features in the user buffer instead of retrieving them* from init_fpstate.*/if (fpu_state_size_dynamic())mask &= (header.xfeatures | xinit->header.xcomp_bv);
mask = header.xfeatures;
Same here. Let's not adding this optimization in this patch.
for_each_extended_xfeature(i, mask) { /*@@ -1151,9 +1143,8 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else {
copy_feature(header.xfeatures & BIT_ULL(i), &to,
membuf_write(&to, __raw_xsave_addr(xsave, i),
__raw_xsave_addr(xinit, i), xstate_sizes[i]); } /*Chang: to reproduce this issue, you can simply run the amx_test in the kvm selftest directory.
Yeah, I was able to reproduce it with this ptrace test:
diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index 625e42901237..ae02bc81846d 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -14,8 +14,10 @@ #include <sys/auxv.h> #include <sys/mman.h> #include <sys/shm.h> +#include <sys/ptrace.h> #include <sys/syscall.h> #include <sys/wait.h> +#include <sys/uio.h>
#include "../kselftest.h" /* For __cpuid_count() */
@@ -826,6 +828,76 @@ static void test_context_switch(void) free(finfo); }
+/* Ptrace test */
+static bool inject_tiledata(pid_t target) +{
struct xsave_buffer *xbuf;struct iovec iov;xbuf = alloc_xbuf();if (!xbuf)fatal_error("unable to allocate XSAVE buffer");load_rand_tiledata(xbuf);memcpy(&stashed_xsave->bytes[xtiledata.xbuf_offset],&xbuf->bytes[xtiledata.xbuf_offset],xtiledata.size);iov.iov_base = xbuf;iov.iov_len = xbuf_size;if (ptrace(PTRACE_SETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov))fatal_error("PTRACE_SETREGSET");if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov))err(1, "PTRACE_GETREGSET");if (!memcmp(&stashed_xsave->bytes[xtiledata.xbuf_offset],&xbuf->bytes[xtiledata.xbuf_offset],xtiledata.size))return true;elsereturn false;+}
+static void test_ptrace(void) +{
pid_t child;int status;child = fork();if (child < 0) {err(1, "fork");} else if (!child) {if (ptrace(PTRACE_TRACEME, 0, NULL, NULL))err(1, "PTRACE_TRACEME");/* Use the state to expand the kernel buffer */load_rand_tiledata(stashed_xsave);raise(SIGTRAP);_exit(0);}do {wait(&status);} while (WSTOPSIG(status) != SIGTRAP);printf("\tInject tile data via ptrace()\n");if (inject_tiledata(child))printf("[OK]\tTile data was written on ptracee.\n");elseprintf("[FAIL]\tTile data was not written on ptracee.\n");ptrace(PTRACE_DETACH, child, NULL, NULL);wait(&status);if (!WIFEXITED(status) || WEXITSTATUS(status))err(1, "ptrace test");+}
int main(void) { /* Check hardware availability at first */ @@ -846,6 +918,8 @@ int main(void) ctxtswtest_config.num_threads = 5; test_context_switch();
test_ptrace();clearhandler(SIGILL); free_stashed_xsave();Thanks, Chang
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch...
Nice one. Yeah both ptrace and KVM are calling this function so the above code would also be enough to trigger the bug.
Thanks. -Mingwei