So the first 3 patches are cleanups and fixes and should probably just go in.
The last patch, however, is the one Linus hates; it converts i386 to always have a complete pt_regs. Both Josh and I did a bunch of cleanups, and the patch is now a net reduction in lines.
I still think it is the right thing to do; it removes a whole lot of magic from the C code (and avoids future surprises), but if that does not convince Linus, I'll not pursue it further.
The code flow around the return-from-interrupt preemption point seems needlessly complicated.
There is only one site jumping to resume_kernel, and none (outside of resume_kernel) jumping to restore_all_kernel. Inline resume_kernel into restore_all_kernel and avoid the CONFIG_PREEMPT-dependent label.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org
---
 arch/x86/entry/entry_32.S | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -67,7 +67,6 @@
 # define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
 # define preempt_stop(clobbers)
-# define resume_kernel		restore_all_kernel
 #endif

 .macro TRACE_IRQS_IRET
@@ -755,7 +754,7 @@ END(ret_from_fork)
 	andl	$SEGMENT_RPL_MASK, %eax
 #endif
 	cmpl	$USER_RPL, %eax
-	jb	resume_kernel			# not returning to v8086 or userspace
+	jb	restore_all_kernel		# not returning to v8086 or userspace

 ENTRY(resume_userspace)
 	DISABLE_INTERRUPTS(CLBR_ANY)
@@ -765,18 +764,6 @@ ENTRY(resume_userspace)
 	jmp	restore_all
 END(ret_from_exception)

-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	cmpl	$0, PER_CPU_VAR(__preempt_count)
-	jnz	restore_all_kernel
-	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
-	jz	restore_all_kernel
-	call	preempt_schedule_irq
-	jmp	restore_all_kernel
-END(resume_kernel)
-#endif
-
 GLOBAL(__begin_SYSENTER_singlestep_region)
 /*
  * All code from here through __end_SYSENTER_singlestep_region is subject
@@ -1027,6 +1014,15 @@ ENTRY(entry_INT80_32)
 	INTERRUPT_RETURN

 restore_all_kernel:
+#ifdef CONFIG_PREEMPT
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	cmpl	$0, PER_CPU_VAR(__preempt_count)
+	jnz	.Lno_preempt
+	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
+	jz	.Lno_preempt
+	call	preempt_schedule_irq
+.Lno_preempt:
+#endif
 	TRACE_IRQS_IRET
 	PARANOID_EXIT_TO_KERNEL_MODE
 	BUG_IF_WRONG_CR3
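For readers not fluent in the entry assembly: the CONFIG_PREEMPT block that now lives under restore_all_kernel boils down to the following C-level logic (an illustrative sketch only, not actual kernel code) -- preempt only when the preempt count is zero and the interrupted context had interrupts enabled:

	/* Sketch of the CONFIG_PREEMPT check above, in C (illustrative only). */
	if (preempt_count() == 0 &&			/* nothing holds preemption off */
	    (regs->flags & X86_EFLAGS_IF))		/* IRQs were on; not an IRQs-off exception path */
		preempt_schedule_irq();			/* reschedule, then resume via the normal kernel IRET path */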
The kprobe trampolines have a FRAME_POINTER annotation that makes no sense. It marks the frame in the middle of pt_regs, at the place of saving BP.
Change it to mark the pt_regs frame as per the ENCODE_FRAME_POINTER from the respective entry_*.S.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org
---
 arch/x86/kernel/kprobes/common.h | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -6,14 +6,15 @@

 #include <asm/asm.h>

+#ifdef CONFIG_X86_64
+
 #ifdef CONFIG_FRAME_POINTER
-# define SAVE_RBP_STRING "	push %" _ASM_BP "\n" \
-			 "	mov  %" _ASM_SP ", %" _ASM_BP "\n"
+#define ENCODE_FRAME_POINTER			\
+	"	leaq 1(%rsp), %rbp\n"
 #else
-# define SAVE_RBP_STRING "	push %" _ASM_BP "\n"
+#define ENCODE_FRAME_POINTER
 #endif

-#ifdef CONFIG_X86_64
 #define SAVE_REGS_STRING			\
 	/* Skip cs, ip, orig_ax. */		\
 	"	subq $24, %rsp\n"		\
@@ -27,11 +28,13 @@
 	"	pushq %r10\n"			\
 	"	pushq %r11\n"			\
 	"	pushq %rbx\n"			\
-	SAVE_RBP_STRING				\
+	"	pushq %rbp\n"			\
 	"	pushq %r12\n"			\
 	"	pushq %r13\n"			\
 	"	pushq %r14\n"			\
-	"	pushq %r15\n"
+	"	pushq %r15\n"			\
+	ENCODE_FRAME_POINTER
+
 #define RESTORE_REGS_STRING			\
 	"	popq %r15\n"			\
 	"	popq %r14\n"			\
@@ -51,19 +54,30 @@
 	/* Skip orig_ax, ip, cs */		\
 	"	addq $24, %rsp\n"
 #else
+
+#ifdef CONFIG_FRAME_POINTER
+#define ENCODE_FRAME_POINTER			\
+	"	movl %esp, %ebp\n"		\
+	"	andl $0x7fffffff, %ebp\n"
+#else
+#define ENCODE_FRAME_POINTER
+#endif
+
 #define SAVE_REGS_STRING			\
 	/* Skip cs, ip, orig_ax and gs. */	\
-	"	subl $16, %esp\n"		\
+	"	subl $4*4, %esp\n"		\
 	"	pushl %fs\n"			\
 	"	pushl %es\n"			\
 	"	pushl %ds\n"			\
 	"	pushl %eax\n"			\
-	SAVE_RBP_STRING				\
+	"	pushl %ebp\n"			\
 	"	pushl %edi\n"			\
 	"	pushl %esi\n"			\
 	"	pushl %edx\n"			\
 	"	pushl %ecx\n"			\
-	"	pushl %ebx\n"
+	"	pushl %ebx\n"			\
+	ENCODE_FRAME_POINTER
+
 #define RESTORE_REGS_STRING			\
 	"	popl %ebx\n"			\
 	"	popl %ecx\n"			\
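For background on what the new encoding buys: the frame-pointer unwinder can recognize such an encoded BP value and recover the pt_regs it refers to, roughly along the lines of decode_frame_pointer() in arch/x86/kernel/unwind_frame.c (an illustrative sketch; the in-tree version may differ in detail):

/* Sketch: how an encoded frame pointer is told apart from a normal one. */
#ifdef CONFIG_X86_64
/* "leaq 1(%rsp), %rbp" sets bit 0, which a real frame pointer never has. */
static struct pt_regs *decode_frame_pointer(unsigned long *bp)
{
	unsigned long regs = (unsigned long)bp;

	if (!(regs & 0x1))
		return NULL;			/* ordinary stack frame */

	return (struct pt_regs *)(regs & ~0x1);
}
#else
/* "andl $0x7fffffff, %ebp" clears bit 31; kernel addresses always have it set. */
static struct pt_regs *decode_frame_pointer(unsigned long *bp)
{
	unsigned long regs = (unsigned long)bp;

	if (regs & 0x80000000)
		return NULL;			/* ordinary stack frame */

	return (struct pt_regs *)(regs | 0x80000000);
}
#endif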
On Wed, May 08, 2019 at 09:49:03AM +0200, Peter Zijlstra wrote:
The kprobe trampolines have a FRAME_POINTER annotation that makes no sense. It marks the frame in the middle of pt_regs, at the place of saving BP.
Change it to mark the pt_regs frame as per the ENCODE_FRAME_POINTER from the respective entry_*.S.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org
 arch/x86/kernel/kprobes/common.h | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -6,14 +6,15 @@
 #include <asm/asm.h>

+#ifdef CONFIG_X86_64
+
 #ifdef CONFIG_FRAME_POINTER
-# define SAVE_RBP_STRING "	push %" _ASM_BP "\n" \
-			 "	mov  %" _ASM_SP ", %" _ASM_BP "\n"
+#define ENCODE_FRAME_POINTER			\
+	"	leaq 1(%rsp), %rbp\n"
 #else
-# define SAVE_RBP_STRING "	push %" _ASM_BP "\n"
+#define ENCODE_FRAME_POINTER
 #endif

+#ifdef CONFIG_FRAME_POINTER
+#define ENCODE_FRAME_POINTER			\
+	"	movl %esp, %ebp\n"		\
+	"	andl $0x7fffffff, %ebp\n"
+#else
+#define ENCODE_FRAME_POINTER
+#endif
We should put these macros in a header file somewhere (including stringified versions).
On Wed, May 08, 2019 at 06:54:16AM -0500, Josh Poimboeuf wrote:
On Wed, May 08, 2019 at 09:49:03AM +0200, Peter Zijlstra wrote:
The kprobe trampolines have a FRAME_POINTER annotation that makes no sense. It marks the frame in the middle of pt_regs, at the place of saving BP.
Change it to mark the pt_regs frame as per the ENCODE_FRAME_POINTER from the respective entry_*.S.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org
 arch/x86/kernel/kprobes/common.h | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -6,14 +6,15 @@
 #include <asm/asm.h>

+#ifdef CONFIG_X86_64
+
 #ifdef CONFIG_FRAME_POINTER
-# define SAVE_RBP_STRING "	push %" _ASM_BP "\n" \
-			 "	mov  %" _ASM_SP ", %" _ASM_BP "\n"
+#define ENCODE_FRAME_POINTER			\
+	"	leaq 1(%rsp), %rbp\n"
 #else
-# define SAVE_RBP_STRING "	push %" _ASM_BP "\n"
+#define ENCODE_FRAME_POINTER
 #endif

+#ifdef CONFIG_FRAME_POINTER
+#define ENCODE_FRAME_POINTER			\
+	"	movl %esp, %ebp\n"		\
+	"	andl $0x7fffffff, %ebp\n"
+#else
+#define ENCODE_FRAME_POINTER
+#endif
We should put these macros in a header file somewhere (including stringified versions).
Probably a good idea. I'll frob them into asm/frame.h.
Do the x86_64 variants also want some ORC annotation?
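If they do end up shared in asm/frame.h, the stringified variants could look something like this (a sketch of one possible shape, not the actual patch):

/* Possible shape for a shared, stringified ENCODE_FRAME_POINTER (sketch only). */
#ifdef CONFIG_FRAME_POINTER
# ifdef CONFIG_X86_64
#  define ENCODE_FRAME_POINTER			\
	"lea 1(%rsp), %rbp\n\t"
# else
#  define ENCODE_FRAME_POINTER			\
	"movl %esp, %ebp\n\t"			\
	"andl $0x7fffffff, %ebp\n\t"
# endif
#else
# define ENCODE_FRAME_POINTER
#endif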
When CONFIG_FRAME_POINTER, we should mark pt_regs frames.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org
---
 arch/x86/kernel/ftrace_32.S | 6 ++++++
 arch/x86/kernel/ftrace_64.S | 3 +++
 2 files changed, 9 insertions(+)

--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -138,6 +138,12 @@ ENTRY(ftrace_regs_caller)
 #else
 	movl	0x4(%ebp), %edx			/* Load parent ip (2nd parameter) */
 #endif
+
+#ifdef CONFIG_FRAME_POINTER
+	movl	%esp, %ebp
+	andl	$0x7fffffff, %ebp
+#endif
+
 	movl	function_trace_op, %ecx	/* Save ftrace_pos in 3rd parameter */
 	pushl	%esp			/* Save pt_regs as 4th parameter */

--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -222,6 +222,9 @@ GLOBAL(ftrace_regs_caller_op_ptr)
 	leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx
 	movq %rcx, RSP(%rsp)

+#ifdef CONFIG_FRAME_POINTER
+	leaq	1(%rsp), %rbp
+#endif
 	/* regs go into 4th parameter */
 	leaq (%rsp), %rcx
Currently pt_regs on x86_32 has an oddity in that kernel regs (!user_mode(regs)) are short two entries (esp/ss). This means that any code trying to use them (typically: regs->sp) needs to jump through some unfortunate hoops.
Change the entry code to fix this up and create a full pt_regs frame.
This then simplifies:
- ftrace
- kprobes
- stack unwinder
- ptrace
- kdump
- kgdb
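The common theme is that code which previously had to guess at the kernel stack pointer can now just read it, e.g. (illustrative before/after, not a literal hunk from the patch):

	/* before: x86_32 kernel-mode pt_regs ended short of sp/ss, hence the helper */
	sp = kernel_stack_pointer(regs);	/* &regs->sp, or the previous stack */

	/* after: pt_regs is always complete, so this works for any regs */
	sp = regs->sp;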
Hated-by: Linus Torvalds torvalds@linux-foundation.org
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org
---
 arch/x86/entry/entry_32.S         | 105 ++++++++++++++++++++++++++++++++++----
 arch/x86/include/asm/kexec.h      |  17 ------
 arch/x86/include/asm/ptrace.h     |  17 ------
 arch/x86/include/asm/stacktrace.h |   2
 arch/x86/kernel/crash.c           |   8 --
 arch/x86/kernel/ftrace_32.S       |  81 ++++++++++++++++-------------
 arch/x86/kernel/kgdb.c            |   8 --
 arch/x86/kernel/kprobes/common.h  |   4 -
 arch/x86/kernel/kprobes/core.c    |  29 ++++------
 arch/x86/kernel/kprobes/opt.c     |  20 ++++---
 arch/x86/kernel/process_32.c      |  16 +----
 arch/x86/kernel/ptrace.c          |  29 ----------
 arch/x86/kernel/time.c            |   3 -
 arch/x86/kernel/unwind_frame.c    |  32 +----------
 arch/x86/kernel/unwind_orc.c      |   2
 15 files changed, 181 insertions(+), 192 deletions(-)
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -202,9 +202,102 @@
 .Lend_\@:
 .endm

+#define CS_FROM_ENTRY_STACK	(1 << 31)
+#define CS_FROM_USER_CR3	(1 << 30)
+#define CS_FROM_KERNEL		(1 << 29)
+
+.macro FIXUP_FRAME
+	/*
+	 * The high bits of the CS dword (__csh) are used for CS_FROM_*.
+	 * Clear them in case hardware didn't do this for us.
+	 */
+	andl	$0x0000ffff, 3*4(%esp)
+
+#ifdef CONFIG_VM86
+	testl	$X86_EFLAGS_VM, 4*4(%esp)
+	jnz	.Lfrom_usermode_no_fixup_\@
+#endif
+	testl	$SEGMENT_RPL_MASK, 3*4(%esp)
+	jnz	.Lfrom_usermode_no_fixup_\@
+
+	orl	$CS_FROM_KERNEL, 3*4(%esp)
+
+	/*
+	 * When we're here from kernel mode; the (exception) stack looks like:
+	 *
+	 *  5*4(%esp) - <previous context>
+	 *  4*4(%esp) - flags
+	 *  3*4(%esp) - cs
+	 *  2*4(%esp) - ip
+	 *  1*4(%esp) - orig_eax
+	 *  0*4(%esp) - gs / function
+	 *
+	 * Let's build a 5 entry IRET frame after that, such that struct pt_regs
+	 * is complete and in particular regs->sp is correct. This gives us
+	 * the original 5 entries as gap:
+	 *
+	 * 12*4(%esp) - <previous context>
+	 * 11*4(%esp) - gap / flags
+	 * 10*4(%esp) - gap / cs
+	 *  9*4(%esp) - gap / ip
+	 *  8*4(%esp) - gap / orig_eax
+	 *  7*4(%esp) - gap / gs / function
+	 *  6*4(%esp) - ss
+	 *  5*4(%esp) - sp
+	 *  4*4(%esp) - flags
+	 *  3*4(%esp) - cs
+	 *  2*4(%esp) - ip
+	 *  1*4(%esp) - orig_eax
+	 *  0*4(%esp) - gs / function
+	 */
+
+	pushl	%ss		# ss
+	pushl	%esp		# sp (points at ss)
+	addl	$6*4, (%esp)	# point sp back at the previous context
+	pushl	6*4(%esp)	# flags
+	pushl	6*4(%esp)	# cs
+	pushl	6*4(%esp)	# ip
+	pushl	6*4(%esp)	# orig_eax
+	pushl	6*4(%esp)	# gs / function
+.Lfrom_usermode_no_fixup_\@:
+.endm
+
+.macro IRET_FRAME
+	testl	$CS_FROM_KERNEL, 1*4(%esp)
+	jz	.Lfinished_frame_\@
+
+	/*
+	 * Reconstruct the 3 entry IRET frame right after the (modified)
+	 * regs->sp without lowering %esp in between, such that an NMI in the
+	 * middle doesn't scribble our stack.
+	 */
+	pushl	%eax
+	pushl	%ecx
+	movl	5*4(%esp), %eax		# (modified) regs->sp
+
+	movl	4*4(%esp), %ecx		# flags
+	movl	%ecx, -4(%eax)
+
+	movl	3*4(%esp), %ecx		# cs
+	andl	$0x0000ffff, %ecx
+	movl	%ecx, -8(%eax)
+
+	movl	2*4(%esp), %ecx		# ip
+	movl	%ecx, -12(%eax)
+
+	movl	1*4(%esp), %ecx		# eax
+	movl	%ecx, -16(%eax)
+
+	popl	%ecx
+	lea	-16(%eax), %esp
+	popl	%eax
+.Lfinished_frame_\@:
+.endm
+
 .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
 	cld
 	PUSH_GS
+	FIXUP_FRAME
 	pushl	%fs
 	pushl	%es
 	pushl	%ds
@@ -374,9 +467,6 @@
  * switch to it before we do any copying.
  */

-#define CS_FROM_ENTRY_STACK	(1 << 31)
-#define CS_FROM_USER_CR3	(1 << 30)
-
 .macro SWITCH_TO_KERNEL_STACK

 	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
@@ -390,13 +480,6 @@
 	 * that register for the time this macro runs
 	 */

-	/*
-	 * The high bits of the CS dword (__csh) are used for
-	 * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
-	 * hardware didn't do this for us.
-	 */
-	andl	$(0x0000ffff), PT_CS(%esp)
-
 	/* Are we on the entry stack? Bail out if not! */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
 	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
@@ -1006,6 +1089,7 @@ ENTRY(entry_INT80_32)
 	/* Restore user state */
 	RESTORE_REGS pop=4			# skip orig_eax/error_code
 .Lirq_return:
+	IRET_FRAME
 	/*
 	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
 	 * when returning from IPI handler and when returning from
@@ -1356,6 +1440,7 @@ END(page_fault)
common_exception: /* the function address is in %gs's slot on the stack */ + FIXUP_FRAME pushl %fs pushl %es pushl %ds --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -71,22 +71,6 @@ struct kimage; #define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
/* - * CPU does not save ss and sp on stack if execution is already - * running in kernel mode at the time of NMI occurrence. This code - * fixes it. - */ -static inline void crash_fixup_ss_esp(struct pt_regs *newregs, - struct pt_regs *oldregs) -{ -#ifdef CONFIG_X86_32 - newregs->sp = (unsigned long)&(oldregs->sp); - asm volatile("xorl %%eax, %%eax\n\t" - "movw %%ss, %%ax\n\t" - :"=a"(newregs->ss)); -#endif -} - -/* * This function is responsible for capturing register states if coming * via panic otherwise just fix up the ss and sp if coming via kernel * mode exception. @@ -96,7 +80,6 @@ static inline void crash_setup_regs(stru { if (oldregs) { memcpy(newregs, oldregs, sizeof(*newregs)); - crash_fixup_ss_esp(newregs, oldregs); } else { #ifdef CONFIG_X86_32 asm volatile("movl %%ebx,%0" : "=m"(newregs->bx)); --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -166,14 +166,10 @@ static inline bool user_64bit_mode(struc #define compat_user_stack_pointer() current_pt_regs()->sp #endif
-#ifdef CONFIG_X86_32 -extern unsigned long kernel_stack_pointer(struct pt_regs *regs); -#else static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } -#endif
#define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) @@ -201,14 +197,6 @@ static inline unsigned long regs_get_reg if (unlikely(offset > MAX_REG_OFFSET)) return 0; #ifdef CONFIG_X86_32 - /* - * Traps from the kernel do not save sp and ss. - * Use the helper function to retrieve sp. - */ - if (offset == offsetof(struct pt_regs, sp) && - regs->cs == __KERNEL_CS) - return kernel_stack_pointer(regs); - /* The selector fields are 16-bit. */ if (offset == offsetof(struct pt_regs, cs) || offset == offsetof(struct pt_regs, ss) || @@ -234,8 +222,7 @@ static inline unsigned long regs_get_reg static inline int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { - return ((addr & ~(THREAD_SIZE - 1)) == - (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); + return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1))); }
/** @@ -249,7 +236,7 @@ static inline int regs_within_kernel_sta */ static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) { - unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + unsigned long *addr = (unsigned long *)regs->sp;
addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,7 +78,7 @@ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) - return (unsigned long *)kernel_stack_pointer(regs); + return (unsigned long *)regs->sp;
if (task == current) return __builtin_frame_address(0); --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -72,14 +72,6 @@ static inline void cpu_crash_vmclear_loa
static void kdump_nmi_callback(int cpu, struct pt_regs *regs) { -#ifdef CONFIG_X86_32 - struct pt_regs fixed_regs; - - if (!user_mode(regs)) { - crash_fixup_ss_esp(&fixed_regs, regs); - regs = &fixed_regs; - } -#endif crash_save_cpu(regs, cpu);
/* --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -9,6 +9,7 @@ #include <asm/export.h> #include <asm/ftrace.h> #include <asm/nospec-branch.h> +#include <asm/asm-offsets.h>
#ifdef CC_USING_FENTRY # define function_hook __fentry__ @@ -104,26 +105,38 @@ END(ftrace_caller)
ENTRY(ftrace_regs_caller) /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, ®s->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * regs->ip location, and move flags into the return ip location. + * We're here from an mcount/fentry CALL, and the stack frame looks like: + * + * <previous context> + * RET-IP + * + * The purpose of this function is to call out in an emulated INT3 + * environment with a stack frame like: + * + * <previous context> + * gap / RET-IP + * gap + * gap + * gap + * pt_regs + * + * We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds */ - pushl $__KERNEL_CS - pushl 4(%esp) /* Save the return ip */ - pushl $0 /* Load 0 into orig_ax */ + subl $3*4, %esp # RET-IP + 3 gaps + pushl %ss # ss + pushl %esp # points at ss + addl $5*4, (%esp) # make it point at <previous context> + pushfl # flags + pushl $__KERNEL_CS # cs + pushl 7*4(%esp) # ip <- RET-IP + pushl $0 # orig_eax + pushl %gs pushl %fs pushl %es pushl %ds - pushl %eax - - /* Get flags and place them into the return ip slot */ - pushf - popl %eax - movl %eax, 8*4(%esp)
+ pushl %eax pushl %ebp pushl %edi pushl %esi @@ -131,12 +144,13 @@ ENTRY(ftrace_regs_caller) pushl %ecx pushl %ebx
- movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl PT_EIP(%esp), %eax # 1st argument: IP + subl $MCOUNT_INSN_SIZE, %eax + #ifdef CC_USING_FENTRY - movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */ + movl 21*4(%esp), %edx # 2nd argument: parent ip #else - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl 1*4(%ebp), %edx # 2nd argument: parent ip #endif
#ifdef CONFIG_FRAME_POINTER @@ -144,21 +158,22 @@ ENTRY(ftrace_regs_caller) andl $0x7fffffff, %ebp #endif
- movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ + movl function_trace_op, %ecx # 3rd argument: ftrace_pos + pushl %esp # 4th argument: pt_regs
GLOBAL(ftrace_regs_call) call ftrace_stub
- addl $4, %esp /* Skip pt_regs */ + addl $4, %esp # skip 4th argument
- /* restore flags */ - push 14*4(%esp) - popf - - /* Move return ip back to its original location */ - movl 12*4(%esp), %eax - movl %eax, 14*4(%esp) + /* place IP below the new SP */ + movl PT_OLDESP(%esp), %eax + movl PT_EIP(%esp), %ecx + movl %ecx, -4(%eax) + + /* place EAX below that */ + movl PT_EAX(%esp), %ecx + movl %ecx, -8(%eax)
popl %ebx popl %ecx @@ -166,16 +181,12 @@ GLOBAL(ftrace_regs_call) popl %esi popl %edi popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs
- /* use lea to not affect flags */ - lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */ + lea -8(%eax), %esp + popl %eax
jmp .Lftrace_ret + #else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(function_hook) --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -127,14 +127,6 @@ char *dbg_get_reg(int regno, void *mem,
#ifdef CONFIG_X86_32 switch (regno) { - case GDB_SS: - if (!user_mode(regs)) - *(unsigned long *)mem = __KERNEL_DS; - break; - case GDB_SP: - if (!user_mode(regs)) - *(unsigned long *)mem = kernel_stack_pointer(regs); - break; case GDB_GS: case GDB_FS: *(unsigned long *)mem = 0xFFFF; --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -86,8 +86,8 @@ " popl %edi\n" \ " popl %ebp\n" \ " popl %eax\n" \ - /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ - " addl $24, %esp\n" + /* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\ + " addl $7*4, %esp\n" #endif
/* Ensure if the instruction can be boostable */ --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -69,7 +69,7 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) +#define stack_addr(regs) ((unsigned long *)regs->sp)
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -731,29 +731,27 @@ asm( ".global kretprobe_trampoline\n" ".type kretprobe_trampoline, @function\n" "kretprobe_trampoline:\n" -#ifdef CONFIG_X86_64 /* We don't bother saving the ss register */ +#ifdef CONFIG_X86_64 " pushq %rsp\n" " pushfq\n" SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ - " movq %rax, 152(%rsp)\n" + " movq %rax, 19*8(%rsp)\n" RESTORE_REGS_STRING " popfq\n" #else - " pushf\n" + " pushl %esp\n" + " pushfl\n" SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" - /* Move flags to cs */ - " movl 56(%esp), %edx\n" - " movl %edx, 52(%esp)\n" - /* Replace saved flags with true return address. */ - " movl %eax, 56(%esp)\n" + /* Replace saved sp with true return address. */ + " movl %eax, 15*4(%esp)\n" RESTORE_REGS_STRING - " popf\n" + " popfl\n" #endif " ret\n" ".size kretprobe_trampoline, .-kretprobe_trampoline\n" @@ -794,16 +792,13 @@ static __used void *trampoline_handler(s INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ -#ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; - /* On x86-64, we use pt_regs->sp for return address holder. */ - frame_pointer = ®s->sp; -#else - regs->cs = __KERNEL_CS | get_kernel_rpl(); +#ifdef CONFIG_X86_32 + regs->cs |= get_kernel_rpl(); regs->gs = 0; - /* On x86-32, we use pt_regs->flags for return address holder. */ - frame_pointer = ®s->flags; #endif + /* We use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; regs->ip = trampoline_address; regs->orig_ax = ~0UL;
--- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -115,14 +115,15 @@ asm ( "optprobe_template_call:\n" ASM_NOP5 /* Move flags to rsp */ - " movq 144(%rsp), %rdx\n" - " movq %rdx, 152(%rsp)\n" + " movq 18*8(%rsp), %rdx\n" + " movq %rdx, 19*8(%rsp)\n" RESTORE_REGS_STRING /* Skip flags entry */ " addq $8, %rsp\n" " popfq\n" #else /* CONFIG_X86_32 */ - " pushf\n" + " pushl %esp\n" + " pushfl\n" SAVE_REGS_STRING " movl %esp, %edx\n" ".global optprobe_template_val\n" @@ -131,9 +132,13 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 + /* Move flags into esp */ + " movl 14*4(%esp), %edx\n" + " movl %edx, 15*4(%esp)\n" RESTORE_REGS_STRING - " addl $4, %esp\n" /* skip cs */ - " popf\n" + /* Skip flags entry */ + " addl $4, %esp\n" + " popfl\n" #endif ".global optprobe_template_end\n" "optprobe_template_end:\n" @@ -165,10 +170,9 @@ optimized_callback(struct optimized_kpro } else { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); /* Save skipped registers */ -#ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; -#else - regs->cs = __KERNEL_CS | get_kernel_rpl(); +#ifdef CONFIG_X86_32 + regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -62,27 +62,21 @@ void __show_regs(struct pt_regs *regs, e { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long sp; - unsigned short ss, gs; + unsigned short gs;
- if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss; + if (user_mode(regs)) gs = get_user_gs(regs); - } else { - sp = kernel_stack_pointer(regs); - savesegment(ss, ss); + else savesegment(gs, gs); - }
show_ip(regs, KERN_DEFAULT);
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->si, regs->di, regs->bp, sp); + regs->si, regs->di, regs->bp, regs->sp); printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags); + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags);
if (mode != SHOW_REGS_ALL) return; --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -153,35 +153,6 @@ static inline bool invalid_selector(u16
#define FLAG_MASK FLAG_MASK_32
-/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. - * - * Now, if the stack is empty, '®s->sp' is out of range. In this - * case we try to take the previous stack. To always return a non-null - * stack pointer we fall back to regs as stack if no previous stack - * exists. - * - * This is valid only for kernel mode traps. - */ -unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ - unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); - unsigned long sp = (unsigned long)®s->sp; - u32 *prev_esp; - - if (context == (sp & ~(THREAD_SIZE - 1))) - return sp; - - prev_esp = (u32 *)(context); - if (*prev_esp) - return (unsigned long)*prev_esp; - - return (unsigned long)regs; -} -EXPORT_SYMBOL_GPL(kernel_stack_pointer); - static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -37,8 +37,7 @@ unsigned long profile_pc(struct pt_regs #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = - (unsigned long *)kernel_stack_pointer(regs); + unsigned long *sp = (unsigned long *)regs->sp; /* * Return address is either directly at stack pointer * or above a saved flags. Eflags has bits 22-31 zero, --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -69,15 +69,6 @@ static void unwind_dump(struct unwind_st } }
-static size_t regs_size(struct pt_regs *regs) -{ - /* x86_32 regs from kernel mode are two words shorter: */ - if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs)) - return sizeof(*regs) - 2*sizeof(long); - - return sizeof(*regs); -} - static bool in_entry_code(unsigned long ip) { char *addr = (char *)ip; @@ -197,12 +188,6 @@ static struct pt_regs *decode_frame_poin } #endif
-#ifdef CONFIG_X86_32 -#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) -#else -#define KERNEL_REGS_SIZE (sizeof(struct pt_regs)) -#endif - static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -213,7 +198,7 @@ static bool update_stack_state(struct un size_t len;
if (state->regs) - prev_frame_end = (void *)state->regs + regs_size(state->regs); + prev_frame_end = (void *)state->regs + sizeof(*state->regs); else prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE;
@@ -221,7 +206,7 @@ static bool update_stack_state(struct un regs = decode_frame_pointer(next_bp); if (regs) { frame = (unsigned long *)regs; - len = KERNEL_REGS_SIZE; + len = sizeof(*regs); state->got_irq = true; } else { frame = next_bp; @@ -245,14 +230,6 @@ static bool update_stack_state(struct un frame < prev_frame_end) return false;
- /* - * On 32-bit with user mode regs, make sure the last two regs are safe - * to access: - */ - if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) && - !on_stack(info, frame, len + 2*sizeof(long))) - return false; - /* Move state to the next frame: */ if (regs) { state->regs = regs; @@ -411,10 +388,9 @@ void __unwind_start(struct unwind_state * Pretend that the frame is complete and that BP points to it, but save * the real BP so that we can use it when looking for the next frame. */ - if (regs && regs->ip == 0 && - (unsigned long *)kernel_stack_pointer(regs) >= first_frame) { + if (regs && regs->ip == 0 && (unsigned long *)regs->sp >= first_frame) { state->next_bp = bp; - bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1; + bp = ((unsigned long *)regs->sp) - 1; }
/* Initialize stack info and make sure the frame data is accessible: */ --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -579,7 +579,7 @@ void __unwind_start(struct unwind_state goto done;
state->ip = regs->ip; - state->sp = kernel_stack_pointer(regs); + state->sp = regs->sp; state->bp = regs->bp; state->regs = regs; state->full_regs = true;
On Wed, May 08, 2019 at 09:49:05AM +0200, Peter Zijlstra wrote:
Currently pt_regs on x86_32 has an oddity in that kernel regs (!user_mode(regs)) are short two entries (esp/ss). This means that any code trying to use them (typically: regs->sp) needs to jump through some unfortunate hoops.
Change the entry code to fix this up and create a full pt_regs frame.
This then simplifies:
- ftrace
- kprobes
- stack unwinder
- ptrace
- kdump
- kgdb
Hated-by: Linus Torvalds torvalds@linux-foundation.org
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org

 arch/x86/entry/entry_32.S         | 105 ++++++++++++++++++++++++++++++++++----
 arch/x86/include/asm/kexec.h      |  17 ------
 arch/x86/include/asm/ptrace.h     |  17 ------
 arch/x86/include/asm/stacktrace.h |   2
 arch/x86/kernel/crash.c           |   8 --
 arch/x86/kernel/ftrace_32.S       |  81 ++++++++++++++++-------------
 arch/x86/kernel/kgdb.c            |   8 --
 arch/x86/kernel/kprobes/common.h  |   4 -
 arch/x86/kernel/kprobes/core.c    |  29 ++++------
 arch/x86/kernel/kprobes/opt.c     |  20 ++++---
 arch/x86/kernel/process_32.c      |  16 +----
 arch/x86/kernel/ptrace.c          |  29 ----------
 arch/x86/kernel/time.c            |   3 -
 arch/x86/kernel/unwind_frame.c    |  32 +----------
 arch/x86/kernel/unwind_orc.c      |   2
 15 files changed, 181 insertions(+), 192 deletions(-)
Very nice diffstat. This moves all the pain to the 32-bit entry code where it belongs.
On Wed, May 8, 2019 at 1:12 AM Peter Zijlstra peterz@infradead.org wrote:
Hated-by: Linus Torvalds torvalds@linux-foundation.org
I can live with this cleaned-up version. I'm still not loving it, but at least it now tries very hard to simplify other code. You can remove the hated-by.
Linus