Following kernel crash noticed on x86_64 while running selftests: user_events: ftrace_test running 6.6.0-rc7-next-20231026.
Reported-by: Linux Kernel Functional Testing lkft@linaro.org
kselftest: Running tests in user_events TAP version 13 1..4 # timeout set to 90 # selftests: user_events: ftrace_test [ 2391.606817] general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b8a83: 0000 [#1] PREEMPT SMP PTI [ 2391.617519] CPU: 1 PID: 34662 Comm: ftrace_test Not tainted 6.6.0-rc7-next-20231026 #1 [ 2391.625428] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.7 12/07/2021 [ 2391.632811] RIP: 0010:tracing_update_buffers (kernel/trace/trace.c:6470) [ 2391.637952] Code: 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 55 31 f6 48 89 e5 41 55 41 54 53 48 89 fb 48 c7 c7 40 8c 61 94 e8 92 d3 5a 01 <44> 0f b6 a3 18 1f 00 00 41 80 fc 01 0f 87 c8 dc 4e 01 45 31 ed 41 All code ======== 0: 90 nop 1: 90 nop 2: 90 nop 3: 90 nop 4: 90 nop 5: 90 nop 6: 90 nop 7: 90 nop 8: 90 nop 9: 90 nop a: 90 nop b: 90 nop c: 66 0f 1f 00 nopw (%rax) 10: 55 push %rbp 11: 31 f6 xor %esi,%esi 13: 48 89 e5 mov %rsp,%rbp 16: 41 55 push %r13 18: 41 54 push %r12 1a: 53 push %rbx 1b: 48 89 fb mov %rdi,%rbx 1e: 48 c7 c7 40 8c 61 94 mov $0xffffffff94618c40,%rdi 25: e8 92 d3 5a 01 callq 0x15ad3bc 2a:* 44 0f b6 a3 18 1f 00 movzbl 0x1f18(%rbx),%r12d <-- trapping instruction 31: 00 32: 41 80 fc 01 cmp $0x1,%r12b 36: 0f 87 c8 dc 4e 01 ja 0x14edd04 3c: 45 31 ed xor %r13d,%r13d 3f: 41 rex.B
Code starting with the faulting instruction =========================================== 0: 44 0f b6 a3 18 1f 00 movzbl 0x1f18(%rbx),%r12d 7: 00 8: 41 80 fc 01 cmp $0x1,%r12b c: 0f 87 c8 dc 4e 01 ja 0x14edcda 12: 45 31 ed xor %r13d,%r13d 15: 41 rex.B [ 2391.656696] RSP: 0018:ffffb36e0a477d80 EFLAGS: 00010246 [ 2391.661937] RAX: 0000000000000000 RBX: 6b6b6b6b6b6b6b6b RCX: 0000000080000000 [ 2391.669064] RDX: 0000000000000000 RSI: ffffffff9299b722 RDI: ffffffff9299b722 [ 2391.676195] RBP: ffffb36e0a477d98 R08: 000000000000002f R09: 0000000000000002 [ 2391.683321] R10: ffffb36e0a477d70 R11: 0000000000000000 R12: 0000000000000002 [ 2391.690453] R13: ffffb36e0a477e88 R14: ffff99c5803a2230 R15: ffff99c581c39000 [ 2391.697586] FS: 00007fb4b9681740(0000) GS:ffff99c6efa80000(0000) knlGS:0000000000000000 [ 2391.705670] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2391.711410] CR2: 00007fb4b96ab5e0 CR3: 000000010635c002 CR4: 00000000003706f0 [ 2391.718540] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2391.725665] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2391.732797] Call Trace: [ 2391.735240] <TASK> [ 2391.737339] ? show_regs (arch/x86/kernel/dumpstack.c:479) [ 2391.740744] ? die_addr (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:460) [ 2391.744056] ? exc_general_protection (arch/x86/kernel/traps.c:697 arch/x86/kernel/traps.c:642) [ 2391.748766] ? asm_exc_general_protection (arch/x86/include/asm/idtentry.h:564) [ 2391.753652] ? __mutex_lock (kernel/locking/mutex.c:613 (discriminator 3) kernel/locking/mutex.c:747 (discriminator 3)) [ 2391.757487] ? __mutex_lock (kernel/locking/mutex.c:613 (discriminator 3) kernel/locking/mutex.c:747 (discriminator 3)) [ 2391.761318] ? tracing_update_buffers (kernel/trace/trace.c:6470) [ 2391.765851] event_enable_write (kernel/trace/trace_events.c:1408) [ 2391.769976] vfs_write (fs/read_write.c:582) [ 2391.773296] ? close_fd_get_file (fs/file.c:821) [ 2391.777396] ? preempt_count_sub (kernel/sched/core.c:5857 kernel/sched/core.c:5853 kernel/sched/core.c:5875) [ 2391.781496] ksys_write (fs/read_write.c:638) [ 2391.784918] __x64_sys_write (fs/read_write.c:646) [ 2391.788671] do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:82) [ 2391.792248] ? do_syscall_64 (arch/x86/entry/common.c:101) [ 2391.795995] ? syscall_exit_to_user_mode (kernel/entry/common.c:299) [ 2391.800785] ? do_syscall_64 (arch/x86/entry/common.c:101) [ 2391.804529] ? do_syscall_64 (arch/x86/entry/common.c:101) [ 2391.808275] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) [ 2391.813327] RIP: 0033:0x7fb4b977c140 [ 2391.816920] Code: 40 00 48 8b 15 c1 9c 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 80 3d a1 24 0e 00 00 74 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 48 89 All code ======== 0: 40 00 48 8b add %cl,-0x75(%rax) 4: 15 c1 9c 0d 00 adc $0xd9cc1,%eax 9: f7 d8 neg %eax b: 64 89 02 mov %eax,%fs:(%rdx) e: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax 15: eb b7 jmp 0xffffffffffffffce 17: 0f 1f 00 nopl (%rax) 1a: 80 3d a1 24 0e 00 00 cmpb $0x0,0xe24a1(%rip) # 0xe24c2 21: 74 17 je 0x3a 23: b8 01 00 00 00 mov $0x1,%eax 28: 0f 05 syscall 2a:* 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax <-- trapping instruction 30: 77 58 ja 0x8a 32: c3 retq 33: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 3a: 48 83 ec 28 sub $0x28,%rsp 3e: 48 rex.W 3f: 89 .byte 0x89
Code starting with the faulting instruction =========================================== 0: 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax 6: 77 58 ja 0x60 8: c3 retq 9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 10: 48 83 ec 28 sub $0x28,%rsp 14: 48 rex.W 15: 89 .byte 0x89 [ 2391.835660] RSP: 002b:00007ffc43b05b38 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ 2391.843225] RAX: ffffffffffffffda RBX: 00007ffc43b05d88 RCX: 00007fb4b977c140 [ 2391.850350] RDX: 0000000000000002 RSI: 000056376b59b7d4 RDI: 0000000000000007 [ 2391.857482] RBP: 00007ffc43b05b60 R08: 0000000000000000 R09: 00007fb4b9681740 [ 2391.864615] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 [ 2391.871747] R13: 00007ffc43b05d98 R14: 000056376b59ddc8 R15: 00007fb4b9981020 [ 2391.878907] </TASK> [ 2391.881106] Modules linked in: x86_pkg_temp_thermal fuse configfs [ 2391.887288] ---[ end trace 0000000000000000 ]--- [ 2391.891915] RIP: 0010:tracing_update_buffers (kernel/trace/trace.c:6470) [ 2391.897231] Code: 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 55 31 f6 48 89 e5 41 55 41 54 53 48 89 fb 48 c7 c7 40 8c 61 94 e8 92 d3 5a 01 <44> 0f b6 a3 18 1f 00 00 41 80 fc 01 0f 87 c8 dc 4e 01 45 31 ed 41 All code ======== 0: 90 nop 1: 90 nop 2: 90 nop 3: 90 nop 4: 90 nop 5: 90 nop 6: 90 nop 7: 90 nop 8: 90 nop 9: 90 nop a: 90 nop b: 90 nop c: 66 0f 1f 00 nopw (%rax) 10: 55 push %rbp 11: 31 f6 xor %esi,%esi 13: 48 89 e5 mov %rsp,%rbp 16: 41 55 push %r13 18: 41 54 push %r12 1a: 53 push %rbx 1b: 48 89 fb mov %rdi,%rbx 1e: 48 c7 c7 40 8c 61 94 mov $0xffffffff94618c40,%rdi 25: e8 92 d3 5a 01 callq 0x15ad3bc 2a:* 44 0f b6 a3 18 1f 00 movzbl 0x1f18(%rbx),%r12d <-- trapping instruction 31: 00 32: 41 80 fc 01 cmp $0x1,%r12b 36: 0f 87 c8 dc 4e 01 ja 0x14edd04 3c: 45 31 ed xor %r13d,%r13d 3f: 41 rex.B
Code starting with the faulting instruction =========================================== 0: 44 0f b6 a3 18 1f 00 movzbl 0x1f18(%rbx),%r12d 7: 00 8: 41 80 fc 01 cmp $0x1,%r12b c: 0f 87 c8 dc 4e 01 ja 0x14edcda 12: 45 31 ed xor %r13d,%r13d 15: 41 rex.B [ 2391.916120] RSP: 0018:ffffb36e0a477d80 EFLAGS: 00010246 [ 2391.921569] RAX: 0000000000000000 RBX: 6b6b6b6b6b6b6b6b RCX: 0000000080000000 [ 2391.928872] RDX: 0000000000000000 RSI: ffffffff9299b722 RDI: ffffffff9299b722 [ 2391.936237] RBP: ffffb36e0a477d98 R08: 000000000000002f R09: 0000000000000002 [ 2391.943388] R10: ffffb36e0a477d70 R11: 0000000000000000 R12: 0000000000000002 [ 2391.950527] R13: ffffb36e0a477e88 R14: ffff99c5803a2230 R15: ffff99c581c39000 [ 2391.957670] FS: 00007fb4b9681740(0000) GS:ffff99c6efa80000(0000) knlGS:0000000000000000 [ 2391.965822] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2391.971579] CR2: 00007fb4b96ab5e0 CR3: 000000010635c002 CR4: 00000000003706f0 [ 2391.978721] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2391.985879] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2391.993028] Kernel panic - not syncing: Fatal exception [ 2391.998287] Kernel Offset: 0x10000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 2392.009066] ---[ end Kernel panic - not syncing: Fatal exception ]---
Links: - https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20231026/tes... - https://lkft.validation.linaro.org/scheduler/job/6974179#L5053
metadata: git_ref: master git_repo: https://gitlab.com/Linaro/lkft/mirrors/next/linux-next git_sha: 2ef7141596eed0b4b45ef18b3626f428a6b0a822 git_describe: next-20231026 kernel_version: 6.6.0-rc7 kernel-config: https://storage.tuxsuite.com/public/linaro/lkft/builds/2XHt24sNSdog7DYY3FLKF... artifact-location: https://storage.tuxsuite.com/public/linaro/lkft/builds/2XHt24sNSdog7DYY3FLKF... toolchain: gcc-13
-- Linaro LKFT https://lkft.linaro.org
On Fri, Oct 27, 2023 at 05:38:41PM +0530, Naresh Kamboju wrote:
It's hard to repro this, it does repro after running several times for me.
Steven, this is firing when the self test writes 1 to the enable file for a user_event created event.
Sometimes I get this splat instead of a panic: [ 956.819778] ------------[ cut here ]------------ [ 956.820526] WARNING: CPU: 12 PID: 914 at kernel/trace/trace.c:516 tracing_release_file_tr+0x46/0x50 [ 956.821389] Modules linked in: [ 956.821708] CPU: 12 PID: 914 Comm: ftrace_test Not tainted 6.6.0-rc7-next-20231026 #1 [ 956.822501] RIP: 0010:tracing_release_file_tr+0x46/0x50 [ 956.822902] Code: d1 03 01 8b 83 c0 1e 00 00 85 c0 74 1d 83 e8 01 48 c7 c7 80 5b ef 84 89 83 c0 1e 00 00 e8 b2 b5 03 01 31 c0 5b e9 75 ee 27 01 <0f> 0b eb df 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 [ 956.824261] RSP: 0018:ffffb59d879e3ee8 EFLAGS: 00010246 [ 956.824656] RAX: 0000000000000000 RBX: ffff9e386a82c058 RCX: 0000000000000000 [ 956.825239] RDX: 0000000000000000 RSI: ffffffff84b6b383 RDI: 00000000ffffffff [ 956.825835] RBP: ffff9e3844076200 R08: 0000000000000000 R09: 0000000000000000 [ 956.826425] R10: ffffb59d879e3ee8 R11: ffffffff8328526f R12: 00000000000f801f [ 956.827359] R13: ffff9e3846134020 R14: ffff9e3843fd03a8 R15: 0000000000000000 [ 956.828732] FS: 00007eff4f818740(0000) GS:ffff9e3f26300000(0000) knlGS:0000000000000000 [ 956.829436] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 956.829968] CR2: 00007eff4f42a250 CR3: 00000007d0744004 CR4: 0000000000370eb0 [ 956.830591] Call Trace: [ 956.830804] <TASK> [ 956.831113] ? __warn+0x7f/0x160 [ 956.831469] ? tracing_release_file_tr+0x46/0x50 [ 956.831891] ? report_bug+0x1c3/0x1d0 [ 956.832224] ? handle_bug+0x3c/0x70 [ 956.832544] ? exc_invalid_op+0x14/0x70 [ 956.832861] ? asm_exc_invalid_op+0x16/0x20 [ 956.833190] ? tracing_release_file_tr+0x1f/0x50 [ 956.833727] ? tracing_release_file_tr+0x46/0x50 [ 956.834159] ? tracing_release_file_tr+0x1f/0x50 [ 956.834579] __fput+0xab/0x300 [ 956.834903] __x64_sys_close+0x38/0x80 [ 956.835235] do_syscall_64+0x41/0xf0 [ 956.835554] entry_SYSCALL_64_after_hwframe+0x6c/0x74 [ 956.835974] RIP: 0033:0x7eff4f515157 [ 956.836291] Code: ff e8 0d 16 02 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 83 b8 f7 ff [ 956.837728] RSP: 002b:00007ffc021d8158 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 [ 956.838351] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007eff4f515157 [ 956.838975] RDX: 0000000000000002 RSI: 000055637921d7d4 RDI: 0000000000000005 [ 956.839586] RBP: 00007ffc021d8180 R08: 0000000000000000 R09: 00007eff4f818740 [ 956.840201] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffc021d83b8 [ 956.840810] R13: 000055637921cae7 R14: 000055637921fc38 R15: 00007eff4f946040 [ 956.841562] </TASK> [ 956.841787] irq event stamp: 881 [ 956.842145] hardirqs last enabled at (889): [<ffffffff831d7f49>] console_unlock+0x109/0x130 [ 956.843013] hardirqs last disabled at (896): [<ffffffff831d7f2e>] console_unlock+0xee/0x130 [ 956.843648] softirqs last enabled at (694): [<ffffffff8314ba2a>] irq_exit_rcu+0x8a/0xe0 [ 956.844276] softirqs last disabled at (689): [<ffffffff8314ba2a>] irq_exit_rcu+0x8a/0xe0 [ 956.844893] ---[ end trace 0000000000000000 ]---
Warning is from this code: static void __trace_array_put(struct trace_array *this_tr) { WARN_ON(!this_tr->ref); this_tr->ref--; }
It seems like there might be a timing window or an incorrect call to trace_array_put() somewhere. Do you think this is related to the eventfs work?
Thanks, -Beau
On Fri, 27 Oct 2023 12:20:11 -0700 Beau Belgrave beaub@linux.microsoft.com wrote:
No, I think this is was probably introduced by:
a1f157c7a3bb ("tracing: Expand all ring buffers individually")
Or possibly a mixture of the two changes? But anyway I think I need to look at this one first.
-- Steve
On Fri, 27 Oct 2023 18:36:40 -0400 Steven Rostedt rostedt@goodmis.org wrote:
Not sure if the bug Beau hit is the same as this one, but the one Beau hit I think is fixed by this:
-- Steve
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 4d2da7480e5f..4a54493b8419 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -234,6 +234,10 @@ create_file_dentry(struct eventfs_inode *ei, struct dentry **e_dentry, bool invalidate = false;
mutex_lock(&eventfs_mutex); + if (ei->is_freed) { + mutex_unlock(&eventfs_mutex); + return NULL; + } /* If the e_dentry already has a dentry, use it */ if (*e_dentry) { /* lookup does not need to up the ref count */
On Fri, Oct 27, 2023 at 10:33:44PM -0400, Steven Rostedt wrote:
I applied both [1][2] patches, and I no longer get any panics. However, I still get the splat about the trace_array_put when running user_event's ftrace selftest:
[ 26.665931] ------------[ cut here ]------------ [ 26.666663] WARNING: CPU: 12 PID: 291 at kernel/trace/trace.c:516 tracing_release_file_tr+0x46/0x50 [ 26.667470] Modules linked in: [ 26.667808] CPU: 12 PID: 291 Comm: ftrace_test Not tainted 6.6.0-rc7-next-20231026 #3 [ 26.668665] RIP: 0010:tracing_release_file_tr+0x46/0x50 [ 26.669093] Code: d1 03 01 8b 83 c0 1e 00 00 85 c0 74 1d 83 e8 01 48 c7 c7 80 5b ef bc 89 83 c0 1e 00 00 e8 f2 b5 03 01 31 c0 5b e9 75 ee 27 01 <0f> 0b eb df 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 [ 26.670580] RSP: 0018:ffffb6ef858ffee8 EFLAGS: 00010246 [ 26.671128] RAX: 0000000000000000 RBX: ffff9d7ae2364058 RCX: 0000000000000000 [ 26.671793] RDX: 0000000000000000 RSI: ffffffffbcb6b38b RDI: 00000000ffffffff [ 26.672444] RBP: ffff9d7ac3e72200 R08: 0000000000000000 R09: 0000000000000000 [ 26.673072] R10: ffffb6ef858ffee8 R11: ffffffffbb28526f R12: 00000000000f801f [ 26.673705] R13: ffff9d7b661a2020 R14: ffff9d7ac6057728 R15: 0000000000000000 [ 26.674339] FS: 00007fa852fa6740(0000) GS:ffff9d81a6300000(0000) knlGS:0000000000000000 [ 26.674978] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 26.675506] CR2: 00007fa852c2a250 CR3: 0000000105d92001 CR4: 0000000000370eb0 [ 26.676142] Call Trace: [ 26.676357] <TASK> [ 26.676572] ? __warn+0x7f/0x160 [ 26.677092] ? tracing_release_file_tr+0x46/0x50 [ 26.677540] ? report_bug+0x1c3/0x1d0 [ 26.677871] ? handle_bug+0x3c/0x70 [ 26.678196] ? exc_invalid_op+0x14/0x70 [ 26.678520] ? asm_exc_invalid_op+0x16/0x20 [ 26.678845] ? tracing_release_file_tr+0x1f/0x50 [ 26.679268] ? tracing_release_file_tr+0x46/0x50 [ 26.679691] ? tracing_release_file_tr+0x1f/0x50 [ 26.680105] __fput+0xab/0x300 [ 26.680437] __x64_sys_close+0x38/0x80 [ 26.680757] do_syscall_64+0x41/0xf0 [ 26.681329] entry_SYSCALL_64_after_hwframe+0x6c/0x74 [ 26.681784] RIP: 0033:0x7fa852d15157 [ 26.682126] Code: ff e8 0d 16 02 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 83 b8 f7 ff [ 26.684255] RSP: 002b:00007ffd226914f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 [ 26.684939] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa852d15157 [ 26.685602] RDX: 0000000000000002 RSI: 0000560ad54567d4 RDI: 0000000000000005 [ 26.686257] RBP: 00007ffd22691520 R08: 0000000000000000 R09: 00007fa852fa6740 [ 26.686877] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffd22691758 [ 26.687480] R13: 0000560ad5455ae7 R14: 0000560ad5458c38 R15: 00007fa8530d4040 [ 26.688107] </TASK> [ 26.688315] irq event stamp: 1361 [ 26.688618] hardirqs last enabled at (1369): [<ffffffffbb1d7f49>] console_unlock+0x109/0x130 [ 26.689339] hardirqs last disabled at (1376): [<ffffffffbb1d7f2e>] console_unlock+0xee/0x130 [ 26.690062] softirqs last enabled at (1272): [<ffffffffbb14ba2a>] irq_exit_rcu+0x8a/0xe0 [ 26.690774] softirqs last disabled at (1267): [<ffffffffbb14ba2a>] irq_exit_rcu+0x8a/0xe0 [ 26.691620] ---[ end trace 0000000000000000 ]---
Thanks, -Beau
1. https://lore.kernel.org/linux-trace-kernel/20231030115018.25399dbd@gandalf.l... 2. https://lore.kernel.org/linux-trace-kernel/20231028164650.4f5ea18a@rorschach...
On Mon, 30 Oct 2023 09:31:02 -0700 Beau Belgrave beaub@linux.microsoft.com wrote:
I applied both [1][2] patches, and I no longer get any panics. However,
Great! Can I add "Tested-by" from you on those patches?
Hmm, this doesn't tell me much. Let me go play with the user_event self tests.
Thanks Beau!
-- Steve
On Mon, 30 Oct 2023 12:42:23 -0400 Steven Rostedt rostedt@goodmis.org wrote:
I added a bunch of printk()s and I'm thinking there's a race in user event (or dynamic event) code.
I put a printk in the open, write and release call to record the filp and the file in the open and this is what I hit:
[ 32.603954] open ffff8d05488bf000 file=ffff8d0484f7a688 [ 32.607026] write ffff8d05488bf000 [ 32.608829] update file = ffff8d0484f7a688 [ 32.610100] update tr = ffffffffb2bebda0 [ 32.622203] write ffff8d05488bf000 [ 32.623231] update file = ffff8d0484f7a688 [ 32.624397] update tr = ffffffffb2bebda0 [ 32.625975] call delayed destroy [ 32.627241] open ffff8d048510fc00 file=ffff8d0484f7a688
Another open with a different filp, but has the same file pointer (which is the meta data that matches the eventfs files, but not part of eventfs).
[ 32.628720] release ffff8d048510fc00
Only one release is called and then we call:
[ 32.630785] Remove event call ffff8d04809d7c58
The above "Remove event call" came from user_event_set_call_visible()
if (visible) { printk("show event call %px\n", &user->call); ret = trace_add_event_call(&user->call); } else { printk("Remove event call %px\n", &user->call); ret = trace_remove_event_call(&user->call); }
Where trace_remove_event_call() calls:
probe_remove_event_call() { __trace_remove_event_call() { event_remove() { remove_event_from_tracers() { remove_event_file_dir() { remove_event_file_dir() { kmem_cache_free(file_cachep, file);
That is, call->file is freed at this point. Now any access to the file pointer is going to be garbage, which a write to enable will cause.
So I see it is freed without seeing a release called.
[ 32.632323] Removing __test_event
The above is the dentry being released (unlinked).
[ 32.633618] Removing enable [ 32.634453] Removing user_events [ 32.634948] write ffff8d05488bf000
Another write is happening to the user event file (it may have been unlinked, but the release was never called, so it is still valid.
[ 32.636440] update file = ffff8d0484f7a688 [ 32.637685] update tr = dc64cc323d943921
The above is the file pointer that we freed, and you can see the file->tr is now garbage.
[ 32.638827] general protection fault, probably for non-canonical address 0xdc64cc323d9457f9: 0000 [#1] PREEMPT SMP PTI [ 32.641712] CPU: 4 PID: 911 Comm: ftrace_test Not tainted 6.6.0-rc4-test-00024-gd402dc722a1b-dirty #158 [ 32.644220] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 32.646717] RIP: 0010:tracing_update_buffers+0x19/0x50 [ 32.648133] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 55 31 f6 31 ed 53 48 89 fb 48 c7 c7 e0 bc be b2 e8 d7 b6 cb 00 <80> bb d8 1e 00 00 00 74 15 48 c7 c7 e0 bc be b2 e8 42 9f cb 00 89 [ 32.653125] RSP: 0018:ffffb7b34172be20 EFLAGS: 00010246 [ 32.654573] RAX: 0000000000000000 RBX: dc64cc323d943921 RCX: 0000000000000000 [ 32.656503] RDX: 0000000000000000 RSI: ffffffffb1e09128 RDI: ffffffffb1e09128 [ 32.658445] RBP: 0000000000000000 R08: 000000000000002f R09: 0000000000000000 [ 32.660378] R10: ffffb7b34172be20 R11: 0000000000000001 R12: ffffb7b34172bf08 [ 32.662322] R13: ffff8d0484f7a688 R14: ffffb7b34172bf08 R15: 0000000000000000 [ 32.664249] FS: 00007f1769a10740(0000) GS:ffff8d05f7c00000(0000) knlGS:0000000000000000 [ 32.666438] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 32.668009] CR2: 000055c0cd72fc70 CR3: 000000010e3c6003 CR4: 0000000000170ee0 [ 32.669954] Call Trace: [ 32.670653] <TASK> [ 32.671263] ? die_addr+0x36/0x90 [ 32.672203] ? exc_general_protection+0x236/0x4a0 [ 32.673521] ? asm_exc_general_protection+0x26/0x30 [ 32.674872] ? __mutex_lock+0x1c8/0xb40 [ 32.675942] ? __mutex_lock+0x1c8/0xb40 [ 32.677009] ? tracing_update_buffers+0x19/0x50 [ 32.678259] event_enable_write+0xb4/0x140 [ 32.679311] vfs_write+0xf2/0x530 [ 32.680172] ? find_held_lock+0x2b/0x80 [ 32.681159] ? _raw_spin_unlock+0x2d/0x50 [ 32.682711] ? rcu_is_watching+0x11/0x50 [ 32.684159] ? _raw_spin_unlock+0x2d/0x50 [ 32.685642] ? trace_preempt_on+0x78/0x80 [ 32.687027] ksys_write+0x75/0x100 [ 32.688257] do_syscall_64+0x3f/0xc0 [ 32.689561] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 32.691048] RIP: 0033:0x7f1769b0ab00
Note, I think I can simplify some of the code (but not fix this bug) by letting the dput free the eventfs_inode as well (I think that is required). But the above looks to be a bug in the implementation of user_events.
-- Steve
On Mon, Oct 30, 2023 at 05:31:51PM -0400, Steven Rostedt wrote:
I did as well, however, I don't see how user events would be involved other than allowing a trace_remove_event_call() with open enable fds?
I believe the scenario is open the enable file and keep the fd open.
While the fd is open to the enable file, call trace_remove_event_call().
If trace_remove_event_call() is called for an event with a tr->ref > 0, should it fail or work? (It currently works without issue.)
Should writes to the fd still work after the event it is related to has been removed?
I don't see how user_events could prevent this, it seems trace_remove_event_call() should fail if files for it are still open?
Thanks, -Beau
[ Adding Masami and stable ]
On Tue, 31 Oct 2023 00:27:07 +0000 Beau Belgrave beaub@linux.microsoft.com wrote:
This is a separate issue from eventfs (good, because I think I have solved all the known bugs for that one - phew!).
Anyway, I checkout the code just before adding the eventfs, and did the following:
# echo 'p:sched schedule' > /sys/kernel/tracing/kprobe_events # exec 5>>/sys/kernel/tracing/events/kprobes/sched/enable # > /sys/kernel/tracing/kprobe_events # exec 5>&-
And it worked fine. The above creates a kprobe event, opens the enable file of that event with the bash file descriptor #5, removes the kprobe event, and then closes the file descriptor #5.
But the I applied:
f5ca233e2e66d ("tracing: Increase trace array ref count on enable and filter files")
And do the above commands again and BOOM! it crashes with:
[ 217.879087] BUG: kernel NULL pointer dereference, address: 0000000000000028 [ 217.881121] #PF: supervisor read access in kernel mode [ 217.882532] #PF: error_code(0x0000) - not-present page [ 217.883932] PGD 0 P4D 0 [ 217.884672] Oops: 0000 [#1] PREEMPT SMP PTI [ 217.885821] CPU: 6 PID: 877 Comm: bash Not tainted 6.5.0-rc4-test-00008-g2c6b6b1029d4-dirty #186 [ 217.888178] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 217.890684] RIP: 0010:tracing_release_file_tr+0xc/0x50 [ 217.892097] Code: 5d 41 5c c3 cc cc cc cc 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 53 48 8b 87 80 04 00 00 <48> 8b 58 28 48 85 db 74 2d 31 f6 48 c7 c7 c0 b3 1e 93 e8 2d 48 ca [ 217.897102] RSP: 0018:ffffa5d400587eb0 EFLAGS: 00010282 [ 217.898531] RAX: 0000000000000000 RBX: ffff907d06aa6c00 RCX: 0000000000000000 [ 217.900471] RDX: 0000000000000000 RSI: ffff907d06aa6c00 RDI: ffff907d0bf21bd0 [ 217.902403] RBP: 00000000000d801e R08: 0000000000000001 R09: ffff907d0bf21bd0 [ 217.904350] R10: 0000000000000001 R11: 0000000000000001 R12: ffff907d0bf21bd0 [ 217.906282] R13: ffff907d103708e0 R14: ffff907d0a178c30 R15: 0000000000000000 [ 217.908215] FS: 00007ff49c150740(0000) GS:ffff907e77d00000(0000) knlGS:0000000000000000 [ 217.910405] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 217.911970] CR2: 0000000000000028 CR3: 00000001051ec005 CR4: 0000000000170ee0 [ 217.913924] Call Trace: [ 217.914624] <TASK> [ 217.915232] ? __die+0x23/0x70 [ 217.916105] ? page_fault_oops+0x17d/0x4d0 [ 217.917262] ? exc_page_fault+0x7f/0x200 [ 217.918350] ? asm_exc_page_fault+0x26/0x30 [ 217.919513] ? tracing_release_file_tr+0xc/0x50 [ 217.920780] __fput+0xfb/0x2a0 [ 217.921651] task_work_run+0x5d/0x90 [ 217.922652] exit_to_user_mode_prepare+0x231/0x240 [ 217.923981] syscall_exit_to_user_mode+0x1a/0x50 [ 217.925248] do_syscall_64+0x4b/0xc0 [ 217.926176] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
Look familiar?
It's now midnight and I've been at this all day. I'm going to look more at this tomorrow. It's not going to be easy :-( I'm not sure what exactly to do. We may need to prevent dynamic events from being deleted if there's any of its files opened (enable, format, etc).
That is, if you try to delete the event, it will give you an -EBUSY, just like having them enabled would.
More good news, that commit is in 6.6 *and* marked for stable :-p
-- Steve
On Tue, 31 Oct 2023 00:00:31 -0400 Steven Rostedt rostedt@goodmis.org wrote:
The below patch appears to solve this. Care to test it?
-- Steve
Signed-off-by: Steven Rostedt (Google) rostedt@goodmis.org --- include/linux/trace_events.h | 4 ++++ kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 4 ++++ kernel/trace/trace_events.c | 29 +++++++++++++++++++++++++---- kernel/trace/trace_events_filter.c | 3 +++ 5 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 12207dc6722d..696f8dc4aa53 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -492,6 +492,7 @@ enum { EVENT_FILE_FL_TRIGGER_COND_BIT, EVENT_FILE_FL_PID_FILTER_BIT, EVENT_FILE_FL_WAS_ENABLED_BIT, + EVENT_FILE_FL_FREED_BIT, };
extern struct trace_event_file *trace_get_event_file(const char *instance, @@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...); * TRIGGER_COND - When set, one or more triggers has an associated filter * PID_FILTER - When set, the event is filtered based on pid * WAS_ENABLED - Set when enabled to know to clear trace on module removal + * FREED - File descriptor is freed, all fields should be considered invalid */ enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), @@ -643,6 +645,7 @@ enum { EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT), EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), + EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT), };
struct trace_event_file { @@ -671,6 +674,7 @@ struct trace_event_file { * caching and such. Which is mostly OK ;-) */ unsigned long flags; + atomic_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2539cfc20a97..9aebf904ff97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4978,6 +4978,20 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret;
+ mutex_lock(&event_mutex); + + /* Fail if the file is marked for removal */ + if (file->flags & EVENT_FILE_FL_FREED) { + trace_array_put(file->tr); + ret = -ENODEV; + } else { + event_file_get(file); + } + + mutex_unlock(&event_mutex); + if (ret) + return ret; + filp->private_data = inode->i_private;
return 0; @@ -4988,6 +5002,7 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp) struct trace_event_file *file = inode->i_private;
trace_array_put(file->tr); + event_file_put(file);
return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0e1405abf4f7..4fcd293ecbd4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1669,6 +1669,10 @@ extern void event_trigger_unregister(struct event_command *cmd_ops, char *glob, struct event_trigger_data *trigger_data);
+extern void event_file_get(struct trace_event_file *file); +extern void event_file_put(struct trace_event_file *file); + + /** * struct event_trigger_ops - callbacks for trace event triggers * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 70a029ae3c65..ec86f4894455 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -990,13 +990,33 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) } }
+void event_file_get(struct trace_event_file *file) +{ + atomic_inc(&file->ref); +} + +void event_file_put(struct trace_event_file *file) +{ + if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (file->flags & EVENT_FILE_FL_FREED) + kmem_cache_free(file_cachep, file); + return; + } + + if (atomic_dec_and_test(&file->ref) && + (file->flags & EVENT_FILE_FL_FREED)) { + kmem_cache_free(file_cachep, file); + } +} + static void remove_event_file_dir(struct trace_event_file *file) { eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); - kmem_cache_free(file_cachep, file); + file->flags |= EVENT_FILE_FL_FREED; + event_file_put(file); }
/* @@ -1369,7 +1389,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, flags = file->flags; mutex_unlock(&event_mutex);
- if (!file) + if (!file || flags & EVENT_FILE_FL_FREED) return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED && @@ -1404,7 +1424,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); - if (likely(file)) { + if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { printk("update file = %px\n", file); printk("update tr = %px\n", file->tr); ret = tracing_update_buffers(file->tr); @@ -1686,7 +1706,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex); file = event_file_data(filp); - if (file) + if (file && !(file->flags & EVENT_FILE_FL_FREED)) print_event_filter(file, s); mutex_unlock(&event_mutex);
@@ -2905,6 +2925,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); + event_file_get(file);
return file; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 33264e510d16..0c611b281a5b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) struct event_filter *filter = NULL; int err;
+ if (file->flags & EVENT_FILE_FL_FREED) + return -ENODEV; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable(file); filter = event_filter(file);
On Tue, 31 Oct 2023 10:45:51 -0400 Steven Rostedt rostedt@goodmis.org wrote:
Well, this won't apply because I still had debugging in it when I added these changes.
Here's a better version:
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 12207dc6722d..696f8dc4aa53 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -492,6 +492,7 @@ enum { EVENT_FILE_FL_TRIGGER_COND_BIT, EVENT_FILE_FL_PID_FILTER_BIT, EVENT_FILE_FL_WAS_ENABLED_BIT, + EVENT_FILE_FL_FREED_BIT, };
extern struct trace_event_file *trace_get_event_file(const char *instance, @@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...); * TRIGGER_COND - When set, one or more triggers has an associated filter * PID_FILTER - When set, the event is filtered based on pid * WAS_ENABLED - Set when enabled to know to clear trace on module removal + * FREED - File descriptor is freed, all fields should be considered invalid */ enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), @@ -643,6 +645,7 @@ enum { EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT), EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), + EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT), };
struct trace_event_file { @@ -671,6 +674,7 @@ struct trace_event_file { * caching and such. Which is mostly OK ;-) */ unsigned long flags; + atomic_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2539cfc20a97..9aebf904ff97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4978,6 +4978,20 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret;
+ mutex_lock(&event_mutex); + + /* Fail if the file is marked for removal */ + if (file->flags & EVENT_FILE_FL_FREED) { + trace_array_put(file->tr); + ret = -ENODEV; + } else { + event_file_get(file); + } + + mutex_unlock(&event_mutex); + if (ret) + return ret; + filp->private_data = inode->i_private;
return 0; @@ -4988,6 +5002,7 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp) struct trace_event_file *file = inode->i_private;
trace_array_put(file->tr); + event_file_put(file);
return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0e1405abf4f7..b7f4ea25a194 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1669,6 +1669,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops, char *glob, struct event_trigger_data *trigger_data);
+extern void event_file_get(struct trace_event_file *file); +extern void event_file_put(struct trace_event_file *file); + /** * struct event_trigger_ops - callbacks for trace event triggers * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f9e3e24d8796..f29e815ca5b2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -990,13 +990,35 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) } }
+void event_file_get(struct trace_event_file *file) +{ + atomic_inc(&file->ref); +} + +void event_file_put(struct trace_event_file *file) +{ + if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (file->flags & EVENT_FILE_FL_FREED) + kmem_cache_free(file_cachep, file); + return; + } + + if (atomic_dec_and_test(&file->ref)) { + /* Count should only go to zero when it is freed */ + if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) + return; + kmem_cache_free(file_cachep, file); + } +} + static void remove_event_file_dir(struct trace_event_file *file) { eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); - kmem_cache_free(file_cachep, file); + file->flags |= EVENT_FILE_FL_FREED; + event_file_put(file); }
/* @@ -1369,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, flags = file->flags; mutex_unlock(&event_mutex);
- if (!file) + if (!file || flags & EVENT_FILE_FL_FREED) return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED && @@ -1403,7 +1425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); - if (likely(file)) { + if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { ret = tracing_update_buffers(file->tr); if (ret < 0) { mutex_unlock(&event_mutex); @@ -1683,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex); file = event_file_data(filp); - if (file) + if (file && !(file->flags & EVENT_FILE_FL_FREED)) print_event_filter(file, s); mutex_unlock(&event_mutex);
@@ -2902,6 +2924,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); + event_file_get(file);
return file; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 33264e510d16..0c611b281a5b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) struct event_filter *filter = NULL; int err;
+ if (file->flags & EVENT_FILE_FL_FREED) + return -ENODEV; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable(file); filter = event_filter(file);
On Tue, Oct 31, 2023 at 12:07:02PM -0400, Steven Rostedt wrote:
Thanks!
I ended up testing the official patch [1] instead of this one along with patches [2][3] in the next branch.
My printk's are now good, everything looks back to normal for my repro.
I added my Tested-by onto the official patch thread.
Thanks, -Beau
1. https://lore.kernel.org/linux-trace-kernel/20231031122453.7a48b923@gandalf.l... 2. https://lore.kernel.org/linux-trace-kernel/20231030114047.759c7bdf@gandalf.l... 3. https://lore.kernel.org/linux-trace-kernel/20231028164650.4f5ea18a@rorschach...