On Fri, May 03, 2019 at 11:57:22AM -0700, Linus Torvalds wrote:
On Fri, May 3, 2019 at 9:21 AM Andy Lutomirski luto@amacapital.net wrote:
So here’s a somewhat nutty suggestion: how about we tweak the 32-bit entry code to emulate the sane 64-bit frame, not just for int3 but always?
What would the code actually end up looking like? I don't necessarily object, since that kernel_stack_pointer() thing certainly looks horrible, but honestly, my suggestion to just pass in the 'struct pt_regs' and let the call emulation fix it up would have also worked, and avoided that bug (and who knows what else might be hiding).
I really think that you're now hitting all the special case magic low-level crap that I wanted to avoid.
This did actually boot on first try; so there must be something horribly wrong...
Now, I know you like that other approach; but I figured I should at least show you what this one looks like. Maybe I've been staring at entry_32.S too much, but I really don't dislike this.
--- arch/x86/entry/entry_32.S | 150 +++++++++++++++++++++++++++++------ arch/x86/entry/entry_64.S | 14 +++- arch/x86/include/asm/ptrace.h | 4 - arch/x86/include/asm/text-patching.h | 20 +++++ arch/x86/kernel/alternative.c | 81 ++++++++++++++++++- arch/x86/kernel/ptrace.c | 29 ------- 6 files changed, 235 insertions(+), 63 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7b23431be5cb..d29cf03219c5 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -67,9 +67,20 @@ # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) -# define resume_kernel restore_all_kernel #endif
+.macro RETINT_PREEMPT +#ifdef CONFIG_PREEMPT + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz .Lend_@ + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz .Lend_@ + call preempt_schedule_irq +.Lend_@: +#endif +.endm + .macro TRACE_IRQS_IRET #ifdef CONFIG_TRACE_IRQFLAGS testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? @@ -203,8 +214,119 @@ .Lend_@: .endm
+#define CS_FROM_ENTRY_STACK (1 << 31) +#define CS_FROM_USER_CR3 (1 << 30) +#define CS_FROM_KERNEL (1 << 29) + +.macro FIXUP_FRAME + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, 3*4(%esp) + jnz .Lfrom_usermode_no_fixup_@ +#endif + testl $SEGMENT_RPL_MASK, 2*4(%esp) + jnz .Lfrom_usermode_no_fixup_@ + + orl $CS_FROM_KERNEL, 2*4(%esp) + + /* + * When we're here from kernel mode; the (exception) stack looks like: + * + * 4*4(%esp) - <previous context> + * 3*4(%esp) - flags + * 2*4(%esp) - cs + * 1*4(%esp) - ip + * 0*4(%esp) - orig_eax + * + * Lets build a 5 entry IRET frame after that, such that struct pt_regs + * is complete and in particular regs->sp is correct. This gives us + * the original 4 enties as gap: + * + * 10*4(%esp) - <previous context> + * 9*4(%esp) - gap / flags + * 8*4(%esp) - gap / cs + * 7*4(%esp) - gap / ip + * 6*4(%esp) - gap / orig_eax + * 5*4(%esp) - ss + * 4*4(%esp) - sp + * 3*4(%esp) - flags + * 2*4(%esp) - cs + * 1*4(%esp) - ip + * 0*4(%esp) - orig_eax + */ + + pushl %ss # ss + pushl %esp # sp (points at ss) + add $5*4, (%esp) # point sp back at the previous context + pushl 5*4(%esp) # flags + pushl 5*4(%esp) # cs + pushl 5*4(%esp) # ip + pushl 5*4(%esp) # orig_eax + +.Lfrom_usermode_no_fixup_@: +.endm + +.macro IRET_FRAME + + /* orig_eax is already POP'ed when we're here */ + + testl $CS_FROM_KERNEL, 1*4(%esp) + jz .Lfinished_frame_@ + + pushl %eax + + lea 10*4(%esp), %eax # address of <previous context> + cmpl %eax, 4*4(%esp) # if ->sp is unmodified + jnz .Lmodified_sp_do_fixup_@ + + /* + * Fast path; regs->sp wasn't modified, reuse the original IRET frame. + */ + pop %eax + add $6*4, %esp + jmp .Lfinished_frame_@; + +.Lmodified_sp_do_fixup_@: + + /* + * Reconstruct the 3 entry IRET frame right after the (modified) + * regs->sp without lowering %esp in between, such that an NMI in the + * middle doesn't scribble our stack. + */ + pushl %ecx + movl 5*4(%esp), %eax # (modified) regs->sp + + movl 4*4(%esp), %ecx # flags + movl %ecx, -4(%eax) + + movl 3*4(%esp), %ecx # cs + andl $0x0000ffff, %ecx + movl %ecx, -8(%eax) + + movl 2*4(%esp), %ecx # ip + movl %ecx, -12(%eax) + + movl 1*4(%esp), %ecx # eax + movl %ecx, -16(%eax) + + popl %ecx + lea -16(%eax), %esp + popl %eax + +.Lfinished_frame_@: +.endm + .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 + cld + /* + * The high bits of the CS dword (__csh) are used for CS_FROM_*. + * Clear them in case hardware didn't do this for us. + */ + andl $(0x0000ffff), 2*4(%esp) + + FIXUP_FRAME + PUSH_GS pushl %fs pushl %es @@ -375,9 +497,6 @@ * switch to it before we do any copying. */
-#define CS_FROM_ENTRY_STACK (1 << 31) -#define CS_FROM_USER_CR3 (1 << 30) - .macro SWITCH_TO_KERNEL_STACK
ALTERNATIVE "", "jmp .Lend_@", X86_FEATURE_XENPV @@ -391,13 +510,6 @@ * that register for the time this macro runs */
- /* - * The high bits of the CS dword (__csh) are used for - * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case - * hardware didn't do this for us. - */ - andl $(0x0000ffff), PT_CS(%esp) - /* Are we on the entry stack? Bail out if not! */ movl PER_CPU_VAR(cpu_entry_area), %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx @@ -755,7 +867,7 @@ END(ret_from_fork) andl $SEGMENT_RPL_MASK, %eax #endif cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace + jb restore_all_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) @@ -765,18 +877,6 @@ ENTRY(resume_userspace) jmp restore_all END(ret_from_exception)
-#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz restore_all_kernel - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all_kernel - call preempt_schedule_irq - jmp restore_all_kernel -END(resume_kernel) -#endif - GLOBAL(__begin_SYSENTER_singlestep_region) /* * All code from here through __end_SYSENTER_singlestep_region is subject @@ -1019,6 +1119,7 @@ ENTRY(entry_INT80_32) /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: + IRET_FRAME /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization * when returning from IPI handler and when returning from @@ -1027,6 +1128,7 @@ ENTRY(entry_INT80_32) INTERRUPT_RETURN
restore_all_kernel: + RETINT_PREEMPT TRACE_IRQS_IRET PARANOID_EXIT_TO_KERNEL_MODE BUG_IF_WRONG_CR3 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 20e45d9b4e15..268cd9affe04 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -878,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@ -898,6 +898,16 @@ ENTRY(\sym) jnz .Lfrom_usermode_switch_stack_@ .endif
+ .if \create_gap == 1 + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_no_gap_@ + .rept 6 + pushq 5*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS offset=8 +.Lfrom_usermode_no_gap_@: + .endif + .if \paranoid call paranoid_entry .else @@ -1129,7 +1139,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ #endif /* CONFIG_HYPERV */
idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET -idtentry int3 do_int3 has_error_code=0 +idtentry int3 do_int3 has_error_code=0 create_gap=1 idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN_PV diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 8a7fc0cca2d1..5ff42dc8b396 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -166,14 +166,10 @@ static inline bool user_64bit_mode(struct pt_regs *regs) #define compat_user_stack_pointer() current_pt_regs()->sp #endif
-#ifdef CONFIG_X86_32 -extern unsigned long kernel_stack_pointer(struct pt_regs *regs); -#else static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } -#endif
#define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index c90678fd391a..6aac6abf931e 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -42,4 +42,24 @@ extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr;
+static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) +{ + regs->sp -= sizeof(unsigned long); + *(unsigned long *)regs->sp = val; +} + +static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + +#define INT3_INSN_SIZE 1 +#define CALL_INSN_SIZE 5 + +static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) +{ + int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); + int3_emulate_jmp(regs, func); +} + #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4db9c0d29bc1..9519bc553d6d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -613,11 +613,83 @@ extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; #endif /* CONFIG_PARAVIRT */
+/* + * Self-test for the INT3 based CALL emulation code. + * + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up + * properly and that there is a stack gap between the INT3 frame and the + * previous context. Without this gap doing a virtual PUSH on the interrupted + * stack would corrupt the INT3 IRET frame. + * + * See entry_{32,64}.S for more details. + */ +static void __init int3_magic(unsigned int *ptr) +{ + *ptr = 1; +} + +extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ + +static int __init +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + if (!regs || user_mode(regs)) + return NOTIFY_DONE; + + if (val != DIE_INT3) + return NOTIFY_DONE; + + if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) + return NOTIFY_DONE; + + int3_emulate_call(regs, (unsigned long)&int3_magic); + return NOTIFY_STOP; +} + +static void __init int3_selftest(void) +{ + static __initdata struct notifier_block int3_exception_nb = { + .notifier_call = int3_exception_notify, + .priority = INT_MAX-1, /* last */ + }; + unsigned int val = 0; + + BUG_ON(register_die_notifier(&int3_exception_nb)); + + /* + * Basically: int3_magic(&val); but really complicated :-) + * + * Stick the address of the INT3 instruction into int3_selftest_ip, + * then trigger the INT3, padded with NOPs to match a CALL instruction + * length. + */ + asm volatile ("1: int3; nop; nop; nop; nop\n\t" + ".pushsection .init.data,"aw"\n\t" + ".align " __ASM_SEL(4, 8) "\n\t" + ".type int3_selftest_ip, @object\n\t" + ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" + "int3_selftest_ip:\n\t" + __ASM_SEL(.long, .quad) " 1b\n\t" + ".popsection\n\t" + : : __ASM_SEL_RAW(a, D) (&val) : "memory"); + + BUG_ON(val != 1); + + unregister_die_notifier(&int3_exception_nb); +} + void __init alternative_instructions(void) { - /* The patching is not fully atomic, so try to avoid local interruptions - that might execute the to be patched code. - Other CPUs are not running. */ + int3_selftest(); + + /* + * The patching is not fully atomic, so try to avoid local + * interruptions that might execute the to be patched code. + * Other CPUs are not running. + */ stop_nmi();
/* @@ -642,10 +714,11 @@ void __init alternative_instructions(void) _text, _etext); }
- if (!uniproc_patched || num_possible_cpus() == 1) + if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); + } #endif
apply_paravirt(__parainstructions, __parainstructions_end); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 4b8ee05dd6ad..d13f892d2c47 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -153,35 +153,6 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
-/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. - * - * Now, if the stack is empty, '®s->sp' is out of range. In this - * case we try to take the previous stack. To always return a non-null - * stack pointer we fall back to regs as stack if no previous stack - * exists. - * - * This is valid only for kernel mode traps. - */ -unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ - unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); - unsigned long sp = (unsigned long)®s->sp; - u32 *prev_esp; - - if (context == (sp & ~(THREAD_SIZE - 1))) - return sp; - - prev_esp = (u32 *)(context); - if (*prev_esp) - return (unsigned long)*prev_esp; - - return (unsigned long)regs; -} -EXPORT_SYMBOL_GPL(kernel_stack_pointer); - static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);