Add the support for cpuv4 instructions for ARM32 BPF JIT. 64-bit division was not supported earlier so this series adds 64-bit DIV, SDIV, MOD, SMOD instructions.
This series needs any one of the patches from [1] to support ldsx.
The relevant selftests have passed expect ldsx_insn which needs fentry:
Tested on BeagleBone Black (ARMv7-A):
[root@alarm del]# echo 1 > /proc/sys/net/core/bpf_jit_enable [root@alarm del]# ./test_progs -a verifier_sdiv,verifier_movsx,verifier_ldsx,verifier_gotol,verifier_bswap #337/1 verifier_bswap/BSWAP, 16:OK #337/2 verifier_bswap/BSWAP, 16 @unpriv:OK #337/3 verifier_bswap/BSWAP, 32:OK #337/4 verifier_bswap/BSWAP, 32 @unpriv:OK #337/5 verifier_bswap/BSWAP, 64:OK #337/6 verifier_bswap/BSWAP, 64 @unpriv:OK #337 verifier_bswap:OK #351/1 verifier_gotol/gotol, small_imm:OK #351/2 verifier_gotol/gotol, small_imm @unpriv:OK #351 verifier_gotol:OK #359/1 verifier_ldsx/LDSX, S8:OK #359/2 verifier_ldsx/LDSX, S8 @unpriv:OK #359/3 verifier_ldsx/LDSX, S16:OK #359/4 verifier_ldsx/LDSX, S16 @unpriv:OK #359/5 verifier_ldsx/LDSX, S32:OK #359/6 verifier_ldsx/LDSX, S32 @unpriv:OK #359/7 verifier_ldsx/LDSX, S8 range checking, privileged:OK #359/8 verifier_ldsx/LDSX, S16 range checking:OK #359/9 verifier_ldsx/LDSX, S16 range checking @unpriv:OK #359/10 verifier_ldsx/LDSX, S32 range checking:OK #359/11 verifier_ldsx/LDSX, S32 range checking @unpriv:OK #359 verifier_ldsx:OK #370/1 verifier_movsx/MOV32SX, S8:OK #370/2 verifier_movsx/MOV32SX, S8 @unpriv:OK #370/3 verifier_movsx/MOV32SX, S16:OK #370/4 verifier_movsx/MOV32SX, S16 @unpriv:OK #370/5 verifier_movsx/MOV64SX, S8:OK #370/6 verifier_movsx/MOV64SX, S8 @unpriv:OK #370/7 verifier_movsx/MOV64SX, S16:OK #370/8 verifier_movsx/MOV64SX, S16 @unpriv:OK #370/9 verifier_movsx/MOV64SX, S32:OK #370/10 verifier_movsx/MOV64SX, S32 @unpriv:OK #370/11 verifier_movsx/MOV32SX, S8, range_check:OK #370/12 verifier_movsx/MOV32SX, S8, range_check @unpriv:OK #370/13 verifier_movsx/MOV32SX, S16, range_check:OK #370/14 verifier_movsx/MOV32SX, S16, range_check @unpriv:OK #370/15 verifier_movsx/MOV32SX, S16, range_check 2:OK #370/16 verifier_movsx/MOV32SX, S16, range_check 2 @unpriv:OK #370/17 verifier_movsx/MOV64SX, S8, range_check:OK #370/18 verifier_movsx/MOV64SX, S8, range_check @unpriv:OK #370/19 verifier_movsx/MOV64SX, S16, range_check:OK #370/20 verifier_movsx/MOV64SX, S16, range_check @unpriv:OK #370/21 verifier_movsx/MOV64SX, S32, range_check:OK #370/22 verifier_movsx/MOV64SX, S32, range_check @unpriv:OK #370/23 verifier_movsx/MOV64SX, S16, R10 Sign Extension:OK #370/24 verifier_movsx/MOV64SX, S16, R10 Sign Extension @unpriv:OK #370 verifier_movsx:OK #382/1 verifier_sdiv/SDIV32, non-zero imm divisor, check 1:OK #382/2 verifier_sdiv/SDIV32, non-zero imm divisor, check 1 @unpriv:OK #382/3 verifier_sdiv/SDIV32, non-zero imm divisor, check 2:OK #382/4 verifier_sdiv/SDIV32, non-zero imm divisor, check 2 @unpriv:OK #382/5 verifier_sdiv/SDIV32, non-zero imm divisor, check 3:OK #382/6 verifier_sdiv/SDIV32, non-zero imm divisor, check 3 @unpriv:OK #382/7 verifier_sdiv/SDIV32, non-zero imm divisor, check 4:OK #382/8 verifier_sdiv/SDIV32, non-zero imm divisor, check 4 @unpriv:OK #382/9 verifier_sdiv/SDIV32, non-zero imm divisor, check 5:OK #382/10 verifier_sdiv/SDIV32, non-zero imm divisor, check 5 @unpriv:OK #382/11 verifier_sdiv/SDIV32, non-zero imm divisor, check 6:OK #382/12 verifier_sdiv/SDIV32, non-zero imm divisor, check 6 @unpriv:OK #382/13 verifier_sdiv/SDIV32, non-zero imm divisor, check 7:OK #382/14 verifier_sdiv/SDIV32, non-zero imm divisor, check 7 @unpriv:OK #382/15 verifier_sdiv/SDIV32, non-zero imm divisor, check 8:OK #382/16 verifier_sdiv/SDIV32, non-zero imm divisor, check 8 @unpriv:OK #382/17 verifier_sdiv/SDIV32, non-zero reg divisor, check 1:OK #382/18 verifier_sdiv/SDIV32, non-zero reg divisor, check 1 @unpriv:OK #382/19 verifier_sdiv/SDIV32, non-zero reg divisor, check 2:OK #382/20 verifier_sdiv/SDIV32, non-zero reg divisor, check 2 @unpriv:OK #382/21 verifier_sdiv/SDIV32, non-zero reg divisor, check 3:OK #382/22 verifier_sdiv/SDIV32, non-zero reg divisor, check 3 @unpriv:OK #382/23 verifier_sdiv/SDIV32, non-zero reg divisor, check 4:OK #382/24 verifier_sdiv/SDIV32, non-zero reg divisor, check 4 @unpriv:OK #382/25 verifier_sdiv/SDIV32, non-zero reg divisor, check 5:OK #382/26 verifier_sdiv/SDIV32, non-zero reg divisor, check 5 @unpriv:OK #382/27 verifier_sdiv/SDIV32, non-zero reg divisor, check 6:OK #382/28 verifier_sdiv/SDIV32, non-zero reg divisor, check 6 @unpriv:OK #382/29 verifier_sdiv/SDIV32, non-zero reg divisor, check 7:OK #382/30 verifier_sdiv/SDIV32, non-zero reg divisor, check 7 @unpriv:OK #382/31 verifier_sdiv/SDIV32, non-zero reg divisor, check 8:OK #382/32 verifier_sdiv/SDIV32, non-zero reg divisor, check 8 @unpriv:OK #382/33 verifier_sdiv/SDIV64, non-zero imm divisor, check 1:OK #382/34 verifier_sdiv/SDIV64, non-zero imm divisor, check 1 @unpriv:OK #382/35 verifier_sdiv/SDIV64, non-zero imm divisor, check 2:OK #382/36 verifier_sdiv/SDIV64, non-zero imm divisor, check 2 @unpriv:OK #382/37 verifier_sdiv/SDIV64, non-zero imm divisor, check 3:OK #382/38 verifier_sdiv/SDIV64, non-zero imm divisor, check 3 @unpriv:OK #382/39 verifier_sdiv/SDIV64, non-zero imm divisor, check 4:OK #382/40 verifier_sdiv/SDIV64, non-zero imm divisor, check 4 @unpriv:OK #382/41 verifier_sdiv/SDIV64, non-zero imm divisor, check 5:OK #382/42 verifier_sdiv/SDIV64, non-zero imm divisor, check 5 @unpriv:OK #382/43 verifier_sdiv/SDIV64, non-zero imm divisor, check 6:OK #382/44 verifier_sdiv/SDIV64, non-zero imm divisor, check 6 @unpriv:OK #382/45 verifier_sdiv/SDIV64, non-zero reg divisor, check 1:OK #382/46 verifier_sdiv/SDIV64, non-zero reg divisor, check 1 @unpriv:OK #382/47 verifier_sdiv/SDIV64, non-zero reg divisor, check 2:OK #382/48 verifier_sdiv/SDIV64, non-zero reg divisor, check 2 @unpriv:OK #382/49 verifier_sdiv/SDIV64, non-zero reg divisor, check 3:OK #382/50 verifier_sdiv/SDIV64, non-zero reg divisor, check 3 @unpriv:OK #382/51 verifier_sdiv/SDIV64, non-zero reg divisor, check 4:OK #382/52 verifier_sdiv/SDIV64, non-zero reg divisor, check 4 @unpriv:OK #382/53 verifier_sdiv/SDIV64, non-zero reg divisor, check 5:OK #382/54 verifier_sdiv/SDIV64, non-zero reg divisor, check 5 @unpriv:OK #382/55 verifier_sdiv/SDIV64, non-zero reg divisor, check 6:OK #382/56 verifier_sdiv/SDIV64, non-zero reg divisor, check 6 @unpriv:OK #382/57 verifier_sdiv/SMOD32, non-zero imm divisor, check 1:OK #382/58 verifier_sdiv/SMOD32, non-zero imm divisor, check 1 @unpriv:OK #382/59 verifier_sdiv/SMOD32, non-zero imm divisor, check 2:OK #382/60 verifier_sdiv/SMOD32, non-zero imm divisor, check 2 @unpriv:OK #382/61 verifier_sdiv/SMOD32, non-zero imm divisor, check 3:OK #382/62 verifier_sdiv/SMOD32, non-zero imm divisor, check 3 @unpriv:OK #382/63 verifier_sdiv/SMOD32, non-zero imm divisor, check 4:OK #382/64 verifier_sdiv/SMOD32, non-zero imm divisor, check 4 @unpriv:OK #382/65 verifier_sdiv/SMOD32, non-zero imm divisor, check 5:OK #382/66 verifier_sdiv/SMOD32, non-zero imm divisor, check 5 @unpriv:OK #382/67 verifier_sdiv/SMOD32, non-zero imm divisor, check 6:OK #382/68 verifier_sdiv/SMOD32, non-zero imm divisor, check 6 @unpriv:OK #382/69 verifier_sdiv/SMOD32, non-zero reg divisor, check 1:OK #382/70 verifier_sdiv/SMOD32, non-zero reg divisor, check 1 @unpriv:OK #382/71 verifier_sdiv/SMOD32, non-zero reg divisor, check 2:OK #382/72 verifier_sdiv/SMOD32, non-zero reg divisor, check 2 @unpriv:OK #382/73 verifier_sdiv/SMOD32, non-zero reg divisor, check 3:OK #382/74 verifier_sdiv/SMOD32, non-zero reg divisor, check 3 @unpriv:OK #382/75 verifier_sdiv/SMOD32, non-zero reg divisor, check 4:OK #382/76 verifier_sdiv/SMOD32, non-zero reg divisor, check 4 @unpriv:OK #382/77 verifier_sdiv/SMOD32, non-zero reg divisor, check 5:OK #382/78 verifier_sdiv/SMOD32, non-zero reg divisor, check 5 @unpriv:OK #382/79 verifier_sdiv/SMOD32, non-zero reg divisor, check 6:OK #382/80 verifier_sdiv/SMOD32, non-zero reg divisor, check 6 @unpriv:OK #382/81 verifier_sdiv/SMOD64, non-zero imm divisor, check 1:OK #382/82 verifier_sdiv/SMOD64, non-zero imm divisor, check 1 @unpriv:OK #382/83 verifier_sdiv/SMOD64, non-zero imm divisor, check 2:OK #382/84 verifier_sdiv/SMOD64, non-zero imm divisor, check 2 @unpriv:OK #382/85 verifier_sdiv/SMOD64, non-zero imm divisor, check 3:OK #382/86 verifier_sdiv/SMOD64, non-zero imm divisor, check 3 @unpriv:OK #382/87 verifier_sdiv/SMOD64, non-zero imm divisor, check 4:OK #382/88 verifier_sdiv/SMOD64, non-zero imm divisor, check 4 @unpriv:OK #382/89 verifier_sdiv/SMOD64, non-zero imm divisor, check 5:OK #382/90 verifier_sdiv/SMOD64, non-zero imm divisor, check 5 @unpriv:OK #382/91 verifier_sdiv/SMOD64, non-zero imm divisor, check 6:OK #382/92 verifier_sdiv/SMOD64, non-zero imm divisor, check 6 @unpriv:OK #382/93 verifier_sdiv/SMOD64, non-zero imm divisor, check 7:OK #382/94 verifier_sdiv/SMOD64, non-zero imm divisor, check 7 @unpriv:OK #382/95 verifier_sdiv/SMOD64, non-zero imm divisor, check 8:OK #382/96 verifier_sdiv/SMOD64, non-zero imm divisor, check 8 @unpriv:OK #382/97 verifier_sdiv/SMOD64, non-zero reg divisor, check 1:OK #382/98 verifier_sdiv/SMOD64, non-zero reg divisor, check 1 @unpriv:OK #382/99 verifier_sdiv/SMOD64, non-zero reg divisor, check 2:OK #382/100 verifier_sdiv/SMOD64, non-zero reg divisor, check 2 @unpriv:OK #382/101 verifier_sdiv/SMOD64, non-zero reg divisor, check 3:OK #382/102 verifier_sdiv/SMOD64, non-zero reg divisor, check 3 @unpriv:OK #382/103 verifier_sdiv/SMOD64, non-zero reg divisor, check 4:OK #382/104 verifier_sdiv/SMOD64, non-zero reg divisor, check 4 @unpriv:OK #382/105 verifier_sdiv/SMOD64, non-zero reg divisor, check 5:OK #382/106 verifier_sdiv/SMOD64, non-zero reg divisor, check 5 @unpriv:OK #382/107 verifier_sdiv/SMOD64, non-zero reg divisor, check 6:OK #382/108 verifier_sdiv/SMOD64, non-zero reg divisor, check 6 @unpriv:OK #382/109 verifier_sdiv/SMOD64, non-zero reg divisor, check 7:OK #382/110 verifier_sdiv/SMOD64, non-zero reg divisor, check 7 @unpriv:OK #382/111 verifier_sdiv/SMOD64, non-zero reg divisor, check 8:OK #382/112 verifier_sdiv/SMOD64, non-zero reg divisor, check 8 @unpriv:OK #382/113 verifier_sdiv/SDIV32, zero divisor:OK #382/114 verifier_sdiv/SDIV32, zero divisor @unpriv:OK #382/115 verifier_sdiv/SDIV64, zero divisor:OK #382/116 verifier_sdiv/SDIV64, zero divisor @unpriv:OK #382/117 verifier_sdiv/SMOD32, zero divisor:OK #382/118 verifier_sdiv/SMOD32, zero divisor @unpriv:OK #382/119 verifier_sdiv/SMOD64, zero divisor:OK #382/120 verifier_sdiv/SMOD64, zero divisor @unpriv:OK #382 verifier_sdiv:OK Summary: 5/163 PASSED, 0 SKIPPED, 0 FAILED
As the selftests don't compile for 32-bit architectures without modifications I have added new tests to lib/test_bpf.c for cpuv4 insns:
test_bpf: Summary: 1052 PASSED, 0 FAILED, [891/1040 JIT'ed] test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed] test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED
[1] https://lore.kernel.org/all/mb61p5y4u3ptd.fsf@amazon.com/
Puranjay Mohan (8): arm32, bpf: add support for 32-bit offset jmp instruction arm32, bpf: add support for sign-extension load instruction arm32, bpf: add support for sign-extension mov instruction arm32, bpf: add support for unconditional bswap instruction arm32, bpf: add support for 32-bit signed division arm32, bpf: add support for 64 bit division instruction selftest, bpf: enable cpu v4 tests for arm32 bpf/tests: add tests for cpuv4 instructions
arch/arm/net/bpf_jit_32.c | 250 +++++++++++- arch/arm/net/bpf_jit_32.h | 4 + include/linux/filter.h | 50 ++- lib/test_bpf.c | 371 ++++++++++++++++++ .../selftests/bpf/progs/verifier_bswap.c | 3 +- .../selftests/bpf/progs/verifier_gotol.c | 3 +- .../selftests/bpf/progs/verifier_ldsx.c | 3 +- .../selftests/bpf/progs/verifier_movsx.c | 3 +- .../selftests/bpf/progs/verifier_sdiv.c | 3 +- 9 files changed, 665 insertions(+), 25 deletions(-)
The cpuv4 adds unconditional jump with 32-bit offset where the immediate field of the instruction is to be used to calculate the jump offset.
BPF_JA | BPF_K | BPF_JMP32 => gotol +imm => PC += imm.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com --- arch/arm/net/bpf_jit_32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 6a1c9fca5260..b26579da770e 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1761,10 +1761,15 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; /* JMP OFF */ case BPF_JMP | BPF_JA: + case BPF_JMP32 | BPF_JA: { - if (off == 0) + if (BPF_CLASS(code) == BPF_JMP32 && imm != 0) + jmp_offset = bpf2a32_offset(i + imm, i, ctx); + else if (BPF_CLASS(code) == BPF_JMP && off != 0) + jmp_offset = bpf2a32_offset(i + off, i, ctx); + else break; - jmp_offset = bpf2a32_offset(i+off, i, ctx); + check_imm24(jmp_offset); emit(ARM_B(jmp_offset), ctx); break;
On Tue, Sep 05, 2023 at 09:06:14PM +0000, Puranjay Mohan wrote:
The cpuv4 adds unconditional jump with 32-bit offset where the immediate field of the instruction is to be used to calculate the jump offset.
BPF_JA | BPF_K | BPF_JMP32 => gotol +imm => PC += imm.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com
arch/arm/net/bpf_jit_32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 6a1c9fca5260..b26579da770e 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1761,10 +1761,15 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; /* JMP OFF */ case BPF_JMP | BPF_JA:
- case BPF_JMP32 | BPF_JA: {
if (off == 0)
if (BPF_CLASS(code) == BPF_JMP32 && imm != 0)
jmp_offset = bpf2a32_offset(i + imm, i, ctx);
else if (BPF_CLASS(code) == BPF_JMP && off != 0)
Please tidy up the coding style - one space between "&&" and "off"
Thanks.
The cpuv4 added the support of an instruction that is similar to load but also sign-extends the result after the load.
BPF_MEMSX | <size> | BPF_LDX means dst = *(signed size *) (src + offset) here <size> can be one of BPF_B, BPF_H, BPF_W.
ARM32 has instructions to load a byte or a half word with sign extension into a 32bit register. As the JIT uses two 32 bit registers to simulate a 64-bit BPF register, an extra instruction is emitted to sign-extent the result up to the second register.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com --- arch/arm/net/bpf_jit_32.c | 69 ++++++++++++++++++++++++++++++++++++++- arch/arm/net/bpf_jit_32.h | 2 ++ 2 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index b26579da770e..f7c162479cf2 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -333,6 +333,9 @@ static u32 arm_bpf_ldst_imm8(u32 op, u8 rt, u8 rn, s16 imm8) #define ARM_LDRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRD_I, rt, rn, off) #define ARM_LDRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRH_I, rt, rn, off)
+#define ARM_LDRSH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRSH_I, rt, rn, off) +#define ARM_LDRSB_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRSB_I, rt, rn, off) + #define ARM_STR_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STR_I, rt, rn, off) #define ARM_STRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STRB_I, rt, rn, off) #define ARM_STRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_STRD_I, rt, rn, off) @@ -1026,6 +1029,24 @@ static bool is_ldst_imm(s16 off, const u8 size) return -off_max <= off && off <= off_max; }
+static bool is_ldst_imm8(s16 off, const u8 size) +{ + s16 off_max = 0; + + switch (size) { + case BPF_B: + off_max = 0xff; + break; + case BPF_W: + off_max = 0xfff; + break; + case BPF_H: + off_max = 0xff; + break; + } + return -off_max <= off && off <= off_max; +} + /* *(size *)(dst + off) = src */ static inline void emit_str_r(const s8 dst, const s8 src[], s16 off, struct jit_ctx *ctx, const u8 sz){ @@ -1105,6 +1126,45 @@ static inline void emit_ldx_r(const s8 dst[], const s8 src, arm_bpf_put_reg64(dst, rd, ctx); }
+/* dst = *(signed size*)(src + off) */ +static inline void emit_ldsx_r(const s8 dst[], const s8 src, + s16 off, struct jit_ctx *ctx, const u8 sz){ + const s8 *tmp = bpf2a32[TMP_REG_1]; + const s8 *rd = is_stacked(dst_lo) ? tmp : dst; + s8 rm = src; + + if (!is_ldst_imm8(off, sz)) { + emit_a32_mov_i(tmp[0], off, ctx); + emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); + rm = tmp[0]; + off = 0; + } else if (rd[1] == rm) { + emit(ARM_MOV_R(tmp[0], rm), ctx); + rm = tmp[0]; + } + switch (sz) { + case BPF_B: + /* Load a Byte with sign extension*/ + emit(ARM_LDRSB_I(rd[1], rm, off), ctx); + /* Carry the sign extension to upper 32 bits */ + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); + break; + case BPF_H: + /* Load a HalfWord with sign extension*/ + emit(ARM_LDRSH_I(rd[1], rm, off), ctx); + /* Carry the sign extension to upper 32 bits */ + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); + break; + case BPF_W: + /* Load a Word*/ + emit(ARM_LDR_I(rd[1], rm, off), ctx); + /* Carry the sign extension to upper 32 bits */ + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); + break; + } + arm_bpf_put_reg64(dst, rd, ctx); +} + /* Arithmatic Operation */ static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm, const u8 rn, struct jit_ctx *ctx, u8 op, @@ -1603,8 +1663,15 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) case BPF_LDX | BPF_MEM | BPF_H: case BPF_LDX | BPF_MEM | BPF_B: case BPF_LDX | BPF_MEM | BPF_DW: + /* LDSX: dst = *(signed size *)(src + off) */ + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); - emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); + if (BPF_MODE(insn->code) == BPF_MEMSX) + emit_ldsx_r(dst, rn, off, ctx, BPF_SIZE(code)); + else + emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); break; /* speculation barrier */ case BPF_ST | BPF_NOSPEC: diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h index e0b593a1498d..79c7373fadce 100644 --- a/arch/arm/net/bpf_jit_32.h +++ b/arch/arm/net/bpf_jit_32.h @@ -79,9 +79,11 @@ #define ARM_INST_LDST__IMM12 0x00000fff #define ARM_INST_LDRB_I 0x05500000 #define ARM_INST_LDRB_R 0x07d00000 +#define ARM_INST_LDRSB_I 0x015000d0 #define ARM_INST_LDRD_I 0x014000d0 #define ARM_INST_LDRH_I 0x015000b0 #define ARM_INST_LDRH_R 0x019000b0 +#define ARM_INST_LDRSH_I 0x015000f0 #define ARM_INST_LDR_I 0x05100000 #define ARM_INST_LDR_R 0x07900000
On Tue, Sep 05, 2023 at 09:06:15PM +0000, Puranjay Mohan wrote:
The cpuv4 added the support of an instruction that is similar to load but also sign-extends the result after the load.
BPF_MEMSX | <size> | BPF_LDX means dst = *(signed size *) (src + offset) here <size> can be one of BPF_B, BPF_H, BPF_W.
ARM32 has instructions to load a byte or a half word with sign extension into a 32bit register. As the JIT uses two 32 bit registers to simulate a 64-bit BPF register, an extra instruction is emitted to sign-extent the result up to the second register.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com
arch/arm/net/bpf_jit_32.c | 69 ++++++++++++++++++++++++++++++++++++++- arch/arm/net/bpf_jit_32.h | 2 ++ 2 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index b26579da770e..f7c162479cf2 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -333,6 +333,9 @@ static u32 arm_bpf_ldst_imm8(u32 op, u8 rt, u8 rn, s16 imm8) #define ARM_LDRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRD_I, rt, rn, off) #define ARM_LDRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRH_I, rt, rn, off) +#define ARM_LDRSH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRSH_I, rt, rn, off) +#define ARM_LDRSB_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRSB_I, rt, rn, off)
#define ARM_STR_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STR_I, rt, rn, off) #define ARM_STRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STRB_I, rt, rn, off) #define ARM_STRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_STRD_I, rt, rn, off) @@ -1026,6 +1029,24 @@ static bool is_ldst_imm(s16 off, const u8 size) return -off_max <= off && off <= off_max; } +static bool is_ldst_imm8(s16 off, const u8 size) +{
- s16 off_max = 0;
- switch (size) {
- case BPF_B:
off_max = 0xff;
break;
- case BPF_W:
off_max = 0xfff;
break;
- case BPF_H:
off_max = 0xff;
break;
- }
- return -off_max <= off && off <= off_max;
+}
/* *(size *)(dst + off) = src */ static inline void emit_str_r(const s8 dst, const s8 src[], s16 off, struct jit_ctx *ctx, const u8 sz){ @@ -1105,6 +1126,45 @@ static inline void emit_ldx_r(const s8 dst[], const s8 src, arm_bpf_put_reg64(dst, rd, ctx); } +/* dst = *(signed size*)(src + off) */ +static inline void emit_ldsx_r(const s8 dst[], const s8 src,
s16 off, struct jit_ctx *ctx, const u8 sz){
- const s8 *tmp = bpf2a32[TMP_REG_1];
- const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
- s8 rm = src;
- if (!is_ldst_imm8(off, sz)) {
emit_a32_mov_i(tmp[0], off, ctx);
emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx);
Hmm. This looks inefficient when "off" is able to fit in an immediate. Please try:
int add_off;
if (!is_ldst_imm8(off, sz)) { add_off = imm8m(off); if (add_off > 0) { emit(ARM_ADD_I(tmp[0], src, add_off), ctx); rm = tmp[0]; } else { emit_a32_mov_i(tmp[0], off, ctx); emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); rm = tmp[0]; } off = 0;
- } else if (rd[1] == rm) {
emit(ARM_MOV_R(tmp[0], rm), ctx);
rm = tmp[0];
Why do you need this? rd and rm can be the same for LDRS[BH].
- }
- switch (sz) {
- case BPF_B:
/* Load a Byte with sign extension*/
emit(ARM_LDRSB_I(rd[1], rm, off), ctx);
/* Carry the sign extension to upper 32 bits */
emit(ARM_ASR_I(rd[0], rd[1], 31), ctx);
break;
- case BPF_H:
/* Load a HalfWord with sign extension*/
emit(ARM_LDRSH_I(rd[1], rm, off), ctx);
/* Carry the sign extension to upper 32 bits */
emit(ARM_ASR_I(rd[0], rd[1], 31), ctx);
break;
- case BPF_W:
/* Load a Word*/
emit(ARM_LDR_I(rd[1], rm, off), ctx);
/* Carry the sign extension to upper 32 bits */
emit(ARM_ASR_I(rd[0], rd[1], 31), ctx);
The last instruction extending to the upper 32 bits is the same in each of these cases, so is there any reason not to do it outside the switch statement?
On Tue, Sep 05 2023, Russell King (Oracle) wrote:
[...]
+/* dst = *(signed size*)(src + off) */ +static inline void emit_ldsx_r(const s8 dst[], const s8 src,
s16 off, struct jit_ctx *ctx, const u8 sz){
- const s8 *tmp = bpf2a32[TMP_REG_1];
- const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
- s8 rm = src;
- if (!is_ldst_imm8(off, sz)) {
emit_a32_mov_i(tmp[0], off, ctx);
emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx);
Hmm. This looks inefficient when "off" is able to fit in an immediate. Please try:
int add_off;
if (!is_ldst_imm8(off, sz)) { add_off = imm8m(off); if (add_off > 0) { emit(ARM_ADD_I(tmp[0], src, add_off), ctx); rm = tmp[0]; } else { emit_a32_mov_i(tmp[0], off, ctx); emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); rm = tmp[0]; } off = 0;
- } else if (rd[1] == rm) {
emit(ARM_MOV_R(tmp[0], rm), ctx);
rm = tmp[0];
Why do you need this? rd and rm can be the same for LDRS[BH].
I agree that this is not required, will remove in the next version. Will also use the suggested optimization for immediate.
- }
- switch (sz) {
- case BPF_B:
/* Load a Byte with sign extension*/
emit(ARM_LDRSB_I(rd[1], rm, off), ctx);
/* Carry the sign extension to upper 32 bits */
emit(ARM_ASR_I(rd[0], rd[1], 31), ctx);
break;
- case BPF_H:
/* Load a HalfWord with sign extension*/
emit(ARM_LDRSH_I(rd[1], rm, off), ctx);
/* Carry the sign extension to upper 32 bits */
emit(ARM_ASR_I(rd[0], rd[1], 31), ctx);
break;
- case BPF_W:
/* Load a Word*/
emit(ARM_LDR_I(rd[1], rm, off), ctx);
/* Carry the sign extension to upper 32 bits */
emit(ARM_ASR_I(rd[0], rd[1], 31), ctx);
The last instruction extending to the upper 32 bits is the same in each of these cases, so is there any reason not to do it outside the switch statement?
Will move it outside in the next version.
Thanks, Puranjay
The cpuv4 added a new BPF_MOVSX instruction that sign extends the src before moving it to the destination.
BPF_ALU | BPF_MOVSX sign extends 8-bit and 16-bit operands into 32-bit operands, and zeroes the remaining upper 32 bits.
BPF_ALU64 | BPF_MOVSX sign extends 8-bit, 16-bit, and 32-bit operands into 64-bit operands.
The offset field of the instruction is used to tell the number of bit to use for sign-extension. BPF_MOV and BPF_MOVSX have the same code but the former sets offset to 0 and the later one sets the offset to 8, 16 or 32
The behaviour of this instruction is dst = (s8,s16,s32)src
On ARM32 the implementation uses LSH and ARSH to extend the 8/16 bits to a 32-bit register and then it is sign extended to the upper 32-bit register using ARSH. For 32-bit we just move it to the destination register and use ARSH to extend it to the upper 32-bit register.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com --- arch/arm/net/bpf_jit_32.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index f7c162479cf2..0a30188de660 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -747,12 +747,16 @@ static inline void emit_a32_alu_r64(const bool is64, const s8 dst[], }
/* dst = src (4 bytes)*/ -static inline void emit_a32_mov_r(const s8 dst, const s8 src, +static inline void emit_a32_mov_r(const s8 dst, const s8 src, const u8 off, struct jit_ctx *ctx) { const s8 *tmp = bpf2a32[TMP_REG_1]; s8 rt;
rt = arm_bpf_get_reg32(src, tmp[0], ctx); + if (off && off != 32) { + emit(ARM_LSL_I(rt, rt, 32 - off), ctx); + emit(ARM_ASR_I(rt, rt, 32 - off), ctx); + } arm_bpf_put_reg32(dst, rt, ctx); }
@@ -761,15 +765,15 @@ static inline void emit_a32_mov_r64(const bool is64, const s8 dst[], const s8 src[], struct jit_ctx *ctx) { if (!is64) { - emit_a32_mov_r(dst_lo, src_lo, ctx); + emit_a32_mov_r(dst_lo, src_lo, 0, ctx); if (!ctx->prog->aux->verifier_zext) /* Zero out high 4 bytes */ emit_a32_mov_i(dst_hi, 0, ctx); } else if (__LINUX_ARM_ARCH__ < 6 && ctx->cpu_architecture < CPU_ARCH_ARMv5TE) { /* complete 8 byte move */ - emit_a32_mov_r(dst_lo, src_lo, ctx); - emit_a32_mov_r(dst_hi, src_hi, ctx); + emit_a32_mov_r(dst_lo, src_lo, 0, ctx); + emit_a32_mov_r(dst_hi, src_hi, 0, ctx); } else if (is_stacked(src_lo) && is_stacked(dst_lo)) { const u8 *tmp = bpf2a32[TMP_REG_1];
@@ -785,6 +789,24 @@ static inline void emit_a32_mov_r64(const bool is64, const s8 dst[], } }
+/* dst = (signed)src */ +static inline void emit_a32_movsx_r64(const bool is64, const u8 off, const s8 dst[], const s8 src[], + struct jit_ctx *ctx) { + const s8 *tmp = bpf2a32[TMP_REG_1]; + const s8 *rt; + + rt = arm_bpf_get_reg64(dst, tmp, ctx); + + emit_a32_mov_r(dst_lo, src_lo, off, ctx); + if (!is64) { + if (!ctx->prog->aux->verifier_zext) + /* Zero out high 4 bytes */ + emit_a32_mov_i(dst_hi, 0, ctx); + } else { + emit(ARM_ASR_I(rt[0], rt[1], 31), ctx); + } +} + /* Shift operations */ static inline void emit_a32_alu_i(const s8 dst, const u32 val, struct jit_ctx *ctx, const u8 op) { @@ -1445,7 +1467,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit_a32_mov_i(dst_hi, 0, ctx); break; } - emit_a32_mov_r64(is64, dst, src, ctx); + if (insn->off) + emit_a32_movsx_r64(is64, insn->off, dst, src, ctx); + else + emit_a32_mov_r64(is64, dst, src, ctx); break; case BPF_K: /* Sign-extend immediate value to destination reg */
The cpuv4 added a new unconditional bswap instruction with following behaviour:
BPF_ALU64 | BPF_TO_LE | BPF_END with imm = 16/32/64 means: dst = bswap16(dst) dst = bswap32(dst) dst = bswap64(dst)
As we already support converting to big-endian from little-endian we can use the same for unconditional bswap. Since ARM32 is always little-endian, just treat the unconditional scenario the same as big-endian conversion.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com --- arch/arm/net/bpf_jit_32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 0a30188de660..09496203f13e 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1632,8 +1632,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) /* dst = htobe(dst) */ case BPF_ALU | BPF_END | BPF_FROM_LE: case BPF_ALU | BPF_END | BPF_FROM_BE: + /* dst = bswap(dst) */ + case BPF_ALU64 | BPF_END | BPF_TO_LE: rd = arm_bpf_get_reg64(dst, tmp, ctx); - if (BPF_SRC(code) == BPF_FROM_LE) + if (BPF_SRC(code) == BPF_FROM_LE && BPF_CLASS(code) != BPF_ALU64) goto emit_bswap_uxt; switch (imm) { case 16:
On Tue, Sep 05, 2023 at 09:06:17PM +0000, Puranjay Mohan wrote:
The cpuv4 added a new unconditional bswap instruction with following behaviour:
BPF_ALU64 | BPF_TO_LE | BPF_END with imm = 16/32/64 means: dst = bswap16(dst) dst = bswap32(dst) dst = bswap64(dst)
As we already support converting to big-endian from little-endian we can use the same for unconditional bswap. Since ARM32 is always little-endian, just treat the unconditional scenario
This is not true. Arm32 BPF is disabled for BE32 but not for BE8. It's entirely possible to build a kernel using BE8 for ARMv7 and have the BPF JIT enabled:
select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
So it's not true to say "Since ARM32 is always little-endian".
On Tue, Sep 05 2023, Russell King (Oracle) wrote:
On Tue, Sep 05, 2023 at 09:06:17PM +0000, Puranjay Mohan wrote:
The cpuv4 added a new unconditional bswap instruction with following behaviour:
BPF_ALU64 | BPF_TO_LE | BPF_END with imm = 16/32/64 means: dst = bswap16(dst) dst = bswap32(dst) dst = bswap64(dst)
As we already support converting to big-endian from little-endian we can use the same for unconditional bswap. Since ARM32 is always little-endian, just treat the unconditional scenario
This is not true. Arm32 BPF is disabled for BE32 but not for BE8. It's entirely possible to build a kernel using BE8 for ARMv7 and have the BPF JIT enabled:
select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
So it's not true to say "Since ARM32 is always little-endian".
Sure, I will change the commit message in next version.
Thanks, Puranjay
The cpuv4 added a new BPF_SDIV instruction that does signed division. The encoding is similar to BPF_DIV but BPF_SDIV sets offset=1.
ARM32 already supports 32-bit BPF_DIV which can be easily extended to support BPF_SDIV as ARM32 has the SDIV instruction. When the CPU is not ARM-v7, we implement that SDIV/SMOD with the function call similar to the implementation of DIV/MOD.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com --- arch/arm/net/bpf_jit_32.c | 26 ++++++++++++++++++++------ arch/arm/net/bpf_jit_32.h | 2 ++ 2 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 09496203f13e..f580ecf75710 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -228,6 +228,16 @@ static u32 jit_mod32(u32 dividend, u32 divisor) return dividend % divisor; }
+static s32 jit_sdiv32(s32 dividend, s32 divisor) +{ + return dividend / divisor; +} + +static s32 jit_smod32(s32 dividend, s32 divisor) +{ + return dividend % divisor; +} + static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) { inst |= (cond << 28); @@ -477,7 +487,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) return to - from - 2; }
-static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) +static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op, u8 sign) { const int exclude_mask = BIT(ARM_R0) | BIT(ARM_R1); const s8 *tmp = bpf2a32[TMP_REG_1]; @@ -485,9 +495,10 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) #if __LINUX_ARM_ARCH__ == 7 if (elf_hwcap & HWCAP_IDIVA) { if (op == BPF_DIV) - emit(ARM_UDIV(rd, rm, rn), ctx); + sign ? emit(ARM_SDIV(rd, rm, rn), ctx) : emit(ARM_UDIV(rd, rm, rn), ctx); else { - emit(ARM_UDIV(ARM_IP, rm, rn), ctx); + sign ? emit(ARM_SDIV(ARM_IP, rm, rn), ctx) : + emit(ARM_UDIV(ARM_IP, rm, rn), ctx); emit(ARM_MLS(rd, rn, ARM_IP, rm), ctx); } return; @@ -515,8 +526,11 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) emit(ARM_PUSH(CALLER_MASK & ~exclude_mask), ctx);
/* Call appropriate function */ - emit_mov_i(ARM_IP, op == BPF_DIV ? - (u32)jit_udiv32 : (u32)jit_mod32, ctx); + if (sign) + emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_sdiv32 : (u32)jit_smod32, ctx); + else + emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv32 : (u32)jit_mod32, ctx); + emit_blx_r(ARM_IP, ctx);
/* Restore caller-saved registers from stack */ @@ -1546,7 +1560,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) rt = src_lo; break; } - emit_udivmod(rd_lo, rd_lo, rt, ctx, BPF_OP(code)); + emit_udivmod(rd_lo, rd_lo, rt, ctx, BPF_OP(code), off); arm_bpf_put_reg32(dst_lo, rd_lo, ctx); if (!ctx->prog->aux->verifier_zext) emit_a32_mov_i(dst_hi, 0, ctx); diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h index 79c7373fadce..438f0e1f91a0 100644 --- a/arch/arm/net/bpf_jit_32.h +++ b/arch/arm/net/bpf_jit_32.h @@ -139,6 +139,7 @@ #define ARM_INST_TST_I 0x03100000
#define ARM_INST_UDIV 0x0730f010 +#define ARM_INST_SDIV 0x0710f010
#define ARM_INST_UMULL 0x00800090
@@ -267,6 +268,7 @@ #define ARM_TST_I(rn, imm) _AL3_I(ARM_INST_TST, 0, rn, imm)
#define ARM_UDIV(rd, rn, rm) (ARM_INST_UDIV | (rd) << 16 | (rn) | (rm) << 8) +#define ARM_SDIV(rd, rn, rm) (ARM_INST_SDIV | (rd) << 16 | (rn) | (rm) << 8)
#define ARM_UMULL(rd_lo, rd_hi, rn, rm) (ARM_INST_UMULL | (rd_hi) << 16 \ | (rd_lo) << 12 | (rm) << 8 | rn)
On Tue, Sep 05, 2023 at 09:06:18PM +0000, Puranjay Mohan wrote:
The cpuv4 added a new BPF_SDIV instruction that does signed division. The encoding is similar to BPF_DIV but BPF_SDIV sets offset=1.
ARM32 already supports 32-bit BPF_DIV which can be easily extended to support BPF_SDIV as ARM32 has the SDIV instruction. When the CPU is not ARM-v7, we implement that SDIV/SMOD with the function call similar to the implementation of DIV/MOD.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com
arch/arm/net/bpf_jit_32.c | 26 ++++++++++++++++++++------ arch/arm/net/bpf_jit_32.h | 2 ++ 2 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 09496203f13e..f580ecf75710 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -228,6 +228,16 @@ static u32 jit_mod32(u32 dividend, u32 divisor) return dividend % divisor; } +static s32 jit_sdiv32(s32 dividend, s32 divisor) +{
- return dividend / divisor;
+}
+static s32 jit_smod32(s32 dividend, s32 divisor) +{
- return dividend % divisor;
+}
static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) { inst |= (cond << 28); @@ -477,7 +487,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) return to - from - 2; } -static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) +static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op, u8 sign) { const int exclude_mask = BIT(ARM_R0) | BIT(ARM_R1); const s8 *tmp = bpf2a32[TMP_REG_1]; @@ -485,9 +495,10 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) #if __LINUX_ARM_ARCH__ == 7 if (elf_hwcap & HWCAP_IDIVA) { if (op == BPF_DIV)
emit(ARM_UDIV(rd, rm, rn), ctx);
sign ? emit(ARM_SDIV(rd, rm, rn), ctx) : emit(ARM_UDIV(rd, rm, rn), ctx);
Oh no, let's not go using the ternary operator like that. If we want to use the ternary operator, then:
emit(sign ? ARM_SDIV(rd, rm, rn) : ARM_UDIV(rd, rm, rn), ctx);
would be _much_ better, since what is actually conditional is the value passed to emit().
If we want to avoid the ternary operator altogether, then obviously if() emit() else emit(), but I'd prefer my suggestion above.
/* Call appropriate function */
- emit_mov_i(ARM_IP, op == BPF_DIV ?
(u32)jit_udiv32 : (u32)jit_mod32, ctx);
- if (sign)
emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_sdiv32 : (u32)jit_smod32, ctx);
- else
emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv32 : (u32)jit_mod32, ctx);
u32 dst;
if (sign) { if (op == BPF_DIV) dst = (u32)jit_sdiv32; else dst = (u32)jit_smod32; } else { if (op == BPF_DIV) dst = (u32)jit_udiv32; else dst = (u32)hit_mod32; }
emit_mov_i(ARM_IP, dst, dtx);
emit_blx_r(ARM_IP, ctx); /* Restore caller-saved registers from stack */
ARM32 doesn't have instructions to do 64-bit/64-bit divisions. So, to implement the following instructions: BPF_ALU64 | BPF_DIV BPF_ALU64 | BPF_MOD BPF_ALU64 | BPF_SDIV BPF_ALU64 | BPF_SMOD
We implement the above instructions by doing function calls to div64_u64() and div64_u64_rem() for unsigned division/mod and calls to div64_s64() for signed division/mod.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com --- arch/arm/net/bpf_jit_32.c | 107 +++++++++++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index f580ecf75710..761056db964f 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -2,6 +2,7 @@ /* * Just-In-Time compiler for eBPF filters on 32bit ARM * + * Copyright (c) 2023 Puranjay Mohan puranjay12@gmail.com * Copyright (c) 2017 Shubham Bansal illusionist.neo@gmail.com * Copyright (c) 2011 Mircea Gherzan mgherzan@gmail.com */ @@ -15,6 +16,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/if_vlan.h> +#include <linux/math64.h>
#include <asm/cacheflush.h> #include <asm/hwcap.h> @@ -238,6 +240,34 @@ static s32 jit_smod32(s32 dividend, s32 divisor) return dividend % divisor; }
+/* Wrappers for 64-bit div/mod */ +static u64 jit_udiv64(u64 dividend, u64 divisor) +{ + return div64_u64(dividend, divisor); +} + +static u64 jit_mod64(u64 dividend, u64 divisor) +{ + u64 rem; + + div64_u64_rem(dividend, divisor, &rem); + return rem; +} + +static s64 jit_sdiv64(s64 dividend, s64 divisor) +{ + return div64_s64(dividend, divisor); +} + +static s64 jit_smod64(s64 dividend, s64 divisor) +{ + u64 q; + + q = div64_s64(dividend, divisor); + + return dividend - q * divisor; +} + static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) { inst |= (cond << 28); @@ -547,6 +577,69 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op, emit(ARM_MOV_R(ARM_R0, tmp[1]), ctx); }
+static inline void emit_udivmod64(const s8 *rd, const s8 *rm, const s8 *rn, struct jit_ctx *ctx, + u8 op, u8 sign) +{ + /* Push caller-saved registers on stack */ + emit(ARM_PUSH(CALLER_MASK), ctx); + + /* + * As we are implementing 64-bit div/mod as function calls, We need to put the dividend in + * R0-R1 and the divisor in R2-R3. As we have already pushed these registers on the stack, + * we can recover them later after returning from the function call. + */ + if (rm[1] != ARM_R0 || rn[1] != ARM_R2) { + /* + * Move Rm to {R1, R0} if it is not already there. + */ + if (rm[1] != ARM_R0) { + if (rn[1] == ARM_R0) + emit(ARM_PUSH(BIT(ARM_R0) | BIT(ARM_R1)), ctx); + emit(ARM_MOV_R(ARM_R1, rm[0]), ctx); + emit(ARM_MOV_R(ARM_R0, rm[1]), ctx); + if (rn[1] == ARM_R0) { + emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); + goto cont; + } + } + /* + * Move Rn to {R3, R2} if it is not already there. + */ + if (rn[1] != ARM_R2) { + emit(ARM_MOV_R(ARM_R3, rn[0]), ctx); + emit(ARM_MOV_R(ARM_R2, rn[1]), ctx); + } + } + +cont: + + /* Call appropriate function */ + if (sign) + emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_sdiv64 : (u32)jit_smod64, ctx); + else + emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv64 : (u32)jit_mod64, ctx); + + emit_blx_r(ARM_IP, ctx); + + /* Save return value */ + if (rd[1] != ARM_R0) { + emit(ARM_MOV_R(rd[0], ARM_R1), ctx); + emit(ARM_MOV_R(rd[1], ARM_R0), ctx); + } + + /* Recover {R1, R0} from stack if it is not Rd */ + if (rd[1] != ARM_R0) + emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); + else + emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); + + /* Recover {R3, R2} from stack if it is not Rd */ + if (rd[1] != ARM_R2) + emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); + else + emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); +} + /* Is the translated BPF register on stack? */ static bool is_stacked(s8 reg) { @@ -1569,7 +1662,19 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) case BPF_ALU64 | BPF_DIV | BPF_X: case BPF_ALU64 | BPF_MOD | BPF_K: case BPF_ALU64 | BPF_MOD | BPF_X: - goto notyet; + rd = arm_bpf_get_reg64(dst, tmp2, ctx); + switch (BPF_SRC(code)) { + case BPF_X: + rs = arm_bpf_get_reg64(src, tmp, ctx); + break; + case BPF_K: + rs = tmp; + emit_a32_mov_se_i64(is64, rs, imm, ctx); + break; + } + emit_udivmod64(rd, rd, rs, ctx, BPF_OP(code), off); + arm_bpf_put_reg64(dst, rd, ctx); + break; /* dst = dst << imm */ /* dst = dst >> imm */ /* dst = dst >> imm (signed) */
On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote:
+cont:
- /* Call appropriate function */
- if (sign)
emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_sdiv64 : (u32)jit_smod64, ctx);
- else
emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv64 : (u32)jit_mod64, ctx);
Same comment as the previous patch here.
- emit_blx_r(ARM_IP, ctx);
- /* Save return value */
- if (rd[1] != ARM_R0) {
emit(ARM_MOV_R(rd[0], ARM_R1), ctx);
emit(ARM_MOV_R(rd[1], ARM_R0), ctx);
- }
- /* Recover {R1, R0} from stack if it is not Rd */
- if (rd[1] != ARM_R0)
emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx);
- else
emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx);
- /* Recover {R3, R2} from stack if it is not Rd */
- if (rd[1] != ARM_R2)
emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx);
- else
emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx);
if (rd[1] != ARM_R0) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); } else if (rd[1] != ARM_R2) { emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else { emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); }
Hmm?
On Tue, Sep 05 2023, Russell King (Oracle) wrote:
On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote:
+cont:
- /* Call appropriate function */
- if (sign)
emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_sdiv64 : (u32)jit_smod64, ctx);
- else
emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv64 : (u32)jit_mod64, ctx);
Same comment as the previous patch here.
Will fix both in next version.
- emit_blx_r(ARM_IP, ctx);
- /* Save return value */
- if (rd[1] != ARM_R0) {
emit(ARM_MOV_R(rd[0], ARM_R1), ctx);
emit(ARM_MOV_R(rd[1], ARM_R0), ctx);
- }
- /* Recover {R1, R0} from stack if it is not Rd */
- if (rd[1] != ARM_R0)
emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx);
- else
emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx);
- /* Recover {R3, R2} from stack if it is not Rd */
- if (rd[1] != ARM_R2)
emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx);
- else
emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx);
if (rd[1] != ARM_R0) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); } else if (rd[1] != ARM_R2) { emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else { emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); }
Hmm?
Actually, there can also be a situation where rd[1] != ARM_R0 && rd[1] != ARM_R2, so should I do it like:
if (rd[1] != ARM_R0 && rd[1] != ARM_R2) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else if (rd[1] != ARM_R0) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); } else if (rd[1] != ARM_R2) { emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else { emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); }
Thanks, Puranjay
On Wed, Sep 06, 2023 at 09:29:19AM +0000, Puranjay Mohan wrote:
On Tue, Sep 05 2023, Russell King (Oracle) wrote:
On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote:
Actually, there can also be a situation where rd[1] != ARM_R0 && rd[1] != ARM_R2, so should I do it like:
if (rd[1] != ARM_R0 && rd[1] != ARM_R2) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else if (rd[1] != ARM_R0) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); } else if (rd[1] != ARM_R2) { emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else { emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); }
Are you sure all four states are possible?
On Wed, Sep 06 2023, Russell King (Oracle) wrote:
On Wed, Sep 06, 2023 at 09:29:19AM +0000, Puranjay Mohan wrote:
On Tue, Sep 05 2023, Russell King (Oracle) wrote:
On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote:
Actually, there can also be a situation where rd[1] != ARM_R0 && rd[1] != ARM_R2, so should I do it like:
if (rd[1] != ARM_R0 && rd[1] != ARM_R2) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else if (rd[1] != ARM_R0) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); } else if (rd[1] != ARM_R2) { emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else { emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); }
Are you sure all four states are possible?
ohh!
I just realized that the last else will never run. rd[1] can never be equal to both ARM_R0 and ARM_R2. Will fix it in V3 as I already sent out the V2.
I need to learn to leave patches on the list for few days before re-spinning.
Thanks, Puranjay
On Wed, Sep 06, 2023 at 07:19:50PM +0000, Puranjay Mohan wrote:
On Wed, Sep 06 2023, Russell King (Oracle) wrote:
On Wed, Sep 06, 2023 at 09:29:19AM +0000, Puranjay Mohan wrote:
On Tue, Sep 05 2023, Russell King (Oracle) wrote:
On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote:
Actually, there can also be a situation where rd[1] != ARM_R0 && rd[1] != ARM_R2, so should I do it like:
if (rd[1] != ARM_R0 && rd[1] != ARM_R2) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else if (rd[1] != ARM_R0) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); } else if (rd[1] != ARM_R2) { emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else { emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); }
Are you sure all four states are possible?
ohh!
I just realized that the last else will never run. rd[1] can never be equal to both ARM_R0 and ARM_R2. Will fix it in V3 as I already sent out the V2.
I need to learn to leave patches on the list for few days before re-spinning.
The last comment on that is you can pop r0-r3 in one go, rather than using two instructions.
Now that all the cpuv4 instructions are supported by the arm32 JIT, enable the selftests for arm32.
Signed-off-by: Puranjay Mohan puranjay12@gmail.com --- tools/testing/selftests/bpf/progs/verifier_bswap.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_gotol.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_ldsx.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_movsx.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_sdiv.c | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/verifier_bswap.c b/tools/testing/selftests/bpf/progs/verifier_bswap.c index 8893094725f0..d98485cd3fd8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bswap.c +++ b/tools/testing/selftests/bpf/progs/verifier_bswap.c @@ -5,7 +5,8 @@ #include "bpf_misc.h"
#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) && __clang_major__ >= 18 + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || defined(__TARGET_ARCH_arm)) && \ + __clang_major__ >= 18
SEC("socket") __description("BSWAP, 16") diff --git a/tools/testing/selftests/bpf/progs/verifier_gotol.c b/tools/testing/selftests/bpf/progs/verifier_gotol.c index 2dae5322a18e..f1d76e874c67 100644 --- a/tools/testing/selftests/bpf/progs/verifier_gotol.c +++ b/tools/testing/selftests/bpf/progs/verifier_gotol.c @@ -5,7 +5,8 @@ #include "bpf_misc.h"
#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) && __clang_major__ >= 18 + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || defined(__TARGET_ARCH_arm)) && \ + __clang_major__ >= 18
SEC("socket") __description("gotol, small_imm") diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index 0c638f45aaf1..c712dffb90db 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -5,7 +5,8 @@ #include "bpf_misc.h"
#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) && __clang_major__ >= 18 + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || defined(__TARGET_ARCH_arm)) && \ + __clang_major__ >= 18
SEC("socket") __description("LDSX, S8") diff --git a/tools/testing/selftests/bpf/progs/verifier_movsx.c b/tools/testing/selftests/bpf/progs/verifier_movsx.c index 3c8ac2c57b1b..24599b269d43 100644 --- a/tools/testing/selftests/bpf/progs/verifier_movsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_movsx.c @@ -5,7 +5,8 @@ #include "bpf_misc.h"
#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) && __clang_major__ >= 18 + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || defined(__TARGET_ARCH_arm)) && \ + __clang_major__ >= 18
SEC("socket") __description("MOV32SX, S8") diff --git a/tools/testing/selftests/bpf/progs/verifier_sdiv.c b/tools/testing/selftests/bpf/progs/verifier_sdiv.c index 0990f8825675..31a0555c429c 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sdiv.c +++ b/tools/testing/selftests/bpf/progs/verifier_sdiv.c @@ -5,7 +5,8 @@ #include "bpf_misc.h"
#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) && __clang_major__ >= 18 + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || defined(__TARGET_ARCH_arm)) && \ + __clang_major__ >= 18
SEC("socket") __description("SDIV32, non-zero imm divisor, check 1")
The BPF JITs now support cpuv4 instructions. Add tests for these new instructions to the test suite:
1. Sign extended Load 2. Sign extended Mov 3. Unconditional byte swap 4. Unconditional jump with 32-bit offset 5. Signed division and modulo
Signed-off-by: Puranjay Mohan puranjay12@gmail.com --- include/linux/filter.h | 50 +++++- lib/test_bpf.c | 371 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 417 insertions(+), 4 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h index 761af6b3cf2b..0138832ad571 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -117,21 +117,25 @@ struct ctl_table_header;
/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
-#define BPF_ALU64_IMM(OP, DST, IMM) \ +#define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ - .off = 0, \ + .off = OFF, \ .imm = IMM }) +#define BPF_ALU64_IMM(OP, DST, IMM) \ + BPF_ALU64_IMM_OFF(OP, DST, IMM, 0)
-#define BPF_ALU32_IMM(OP, DST, IMM) \ +#define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ - .off = 0, \ + .off = OFF, \ .imm = IMM }) +#define BPF_ALU32_IMM(OP, DST, IMM) \ + BPF_ALU32_IMM_OFF(OP, DST, IMM, 0)
/* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */
@@ -143,6 +147,16 @@ struct ctl_table_header; .off = 0, \ .imm = LEN })
+/* Byte Swap, bswap16/32/64 */ + +#define BPF_BSWAP(DST, LEN) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE), \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = LEN }) + /* Short form of mov, dst_reg = src_reg */
#define BPF_MOV64_REG(DST, SRC) \ @@ -179,6 +193,24 @@ struct ctl_table_header; .off = 0, \ .imm = IMM })
+/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ + +#define BPF_MOVSX64_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_MOVSX32_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Special form of mov32, used for doing explicit zero extension on dst. */ #define BPF_ZEXT_REG(DST) \ ((struct bpf_insn) { \ @@ -263,6 +295,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = OFF, \ .imm = 0 })
+/* Memory load, dst_reg = *(signed size *) (src_reg + off16) */ + +#define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Memory store, *(uint *) (dst_reg + off16) = src_reg */
#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ diff --git a/lib/test_bpf.c b/lib/test_bpf.c index ecde4216201e..7916503e6a6a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -5111,6 +5111,104 @@ static struct bpf_test tests[] = { { }, { { 0, 0xffffffff } } }, + /* MOVSX32 */ + { + "ALU_MOVSX | BPF_B", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x00000000ffffffefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX32_REG(R1, R3, 8), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + { + "ALU_MOVSX | BPF_H", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x00000000ffffbeefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX32_REG(R1, R3, 16), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + { + "ALU_MOVSX | BPF_W", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x00000000deadbeefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX32_REG(R1, R3, 32), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + /* MOVSX64 REG */ + { + "ALU64_MOVSX | BPF_B", + .u.insns_int = { + BPF_LD_IMM64(R2, 0xffffffffffffffefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX64_REG(R1, R3, 8), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + { + "ALU64_MOVSX | BPF_H", + .u.insns_int = { + BPF_LD_IMM64(R2, 0xffffffffffffbeefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX64_REG(R1, R3, 16), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + { + "ALU64_MOVSX | BPF_W", + .u.insns_int = { + BPF_LD_IMM64(R2, 0xffffffffdeadbeefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX64_REG(R1, R3, 32), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, /* BPF_ALU | BPF_ADD | BPF_X */ { "ALU_ADD_X: 1 + 2 = 3", @@ -6105,6 +6203,106 @@ static struct bpf_test tests[] = { { }, { { 0, 2 } }, }, + /* BPF_ALU | BPF_DIV | BPF_X off=1 (SDIV) */ + { + "ALU_SDIV_X: -6 / 2 = -3", + .u.insns_int = { + BPF_LD_IMM64(R0, -6), + BPF_ALU32_IMM(BPF_MOV, R1, 2), + BPF_ALU32_REG_OFF(BPF_DIV, R0, R1, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -3 } }, + }, + /* BPF_ALU | BPF_DIV | BPF_K off=1 (SDIV) */ + { + "ALU_SDIV_K: -6 / 2 = -3", + .u.insns_int = { + BPF_LD_IMM64(R0, -6), + BPF_ALU32_IMM_OFF(BPF_DIV, R0, 2, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -3 } }, + }, + /* BPF_ALU64 | BPF_DIV | BPF_X off=1 (SDIV64) */ + { + "ALU64_SDIV_X: -6 / 2 = -3", + .u.insns_int = { + BPF_LD_IMM64(R0, -6), + BPF_ALU32_IMM(BPF_MOV, R1, 2), + BPF_ALU64_REG_OFF(BPF_DIV, R0, R1, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -3 } }, + }, + /* BPF_ALU64 | BPF_DIV | BPF_K off=1 (SDIV64) */ + { + "ALU64_SDIV_K: -6 / 2 = -3", + .u.insns_int = { + BPF_LD_IMM64(R0, -6), + BPF_ALU64_IMM_OFF(BPF_DIV, R0, 2, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -3 } }, + }, + /* BPF_ALU | BPF_MOD | BPF_X off=1 (SMOD) */ + { + "ALU_SMOD_X: -7 % 2 = -1", + .u.insns_int = { + BPF_LD_IMM64(R0, -7), + BPF_ALU32_IMM(BPF_MOV, R1, 2), + BPF_ALU32_REG_OFF(BPF_MOD, R0, R1, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -1 } }, + }, + /* BPF_ALU | BPF_MOD | BPF_K off=1 (SMOD) */ + { + "ALU_SMOD_K: -7 % 2 = -1", + .u.insns_int = { + BPF_LD_IMM64(R0, -7), + BPF_ALU32_IMM_OFF(BPF_MOD, R0, 2, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -1 } }, + }, + /* BPF_ALU64 | BPF_MOD | BPF_X off=1 (SMOD64) */ + { + "ALU64_SMOD_X: -7 % 2 = -1", + .u.insns_int = { + BPF_LD_IMM64(R0, -7), + BPF_ALU32_IMM(BPF_MOV, R1, 2), + BPF_ALU64_REG_OFF(BPF_MOD, R0, R1, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -1 } }, + }, + /* BPF_ALU64 | BPF_MOD | BPF_K off=1 (SMOD64) */ + { + "ALU64_SMOD_X: -7 % 2 = -1", + .u.insns_int = { + BPF_LD_IMM64(R0, -7), + BPF_ALU64_IMM_OFF(BPF_MOD, R0, 2, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -1 } }, + }, /* BPF_ALU | BPF_AND | BPF_X */ { "ALU_AND_X: 3 & 2 = 2", @@ -7837,6 +8035,104 @@ static struct bpf_test tests[] = { { }, { { 0, (u32) (cpu_to_le64(0xfedcba9876543210ULL) >> 32) } }, }, + /* BSWAP */ + { + "BSWAP 16: 0x0123456789abcdef -> 0xefcd", + .u.insns_int = { + BPF_LD_IMM64(R0, 0x0123456789abcdefLL), + BPF_BSWAP(R0, 16), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xefcd } }, + }, + { + "BSWAP 32: 0x0123456789abcdef -> 0xefcdab89", + .u.insns_int = { + BPF_LD_IMM64(R0, 0x0123456789abcdefLL), + BPF_BSWAP(R0, 32), + BPF_ALU64_REG(BPF_MOV, R1, R0), + BPF_ALU64_IMM(BPF_RSH, R1, 32), + BPF_ALU32_REG(BPF_ADD, R0, R1), /* R1 = 0 */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xefcdab89 } }, + }, + { + "BSWAP 64: 0x0123456789abcdef -> 0x67452301", + .u.insns_int = { + BPF_LD_IMM64(R0, 0x0123456789abcdefLL), + BPF_BSWAP(R0, 64), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x67452301 } }, + }, + { + "BSWAP 64: 0x0123456789abcdef >> 32 -> 0xefcdab89", + .u.insns_int = { + BPF_LD_IMM64(R0, 0x0123456789abcdefLL), + BPF_BSWAP(R0, 64), + BPF_ALU64_IMM(BPF_RSH, R0, 32), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xefcdab89 } }, + }, + /* BSWAP, reversed */ + { + "BSWAP 16: 0xfedcba9876543210 -> 0x1032", + .u.insns_int = { + BPF_LD_IMM64(R0, 0xfedcba9876543210ULL), + BPF_BSWAP(R0, 16), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1032 } }, + }, + { + "BSWAP 32: 0xfedcba9876543210 -> 0x10325476", + .u.insns_int = { + BPF_LD_IMM64(R0, 0xfedcba9876543210ULL), + BPF_BSWAP(R0, 32), + BPF_ALU64_REG(BPF_MOV, R1, R0), + BPF_ALU64_IMM(BPF_RSH, R1, 32), + BPF_ALU32_REG(BPF_ADD, R0, R1), /* R1 = 0 */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x10325476 } }, + }, + { + "BSWAP 64: 0xfedcba9876543210 -> 0x98badcfe", + .u.insns_int = { + BPF_LD_IMM64(R0, 0xfedcba9876543210ULL), + BPF_BSWAP(R0, 64), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x98badcfe } }, + }, + { + "BSWAP 64: 0xfedcba9876543210 >> 32 -> 0x10325476", + .u.insns_int = { + BPF_LD_IMM64(R0, 0xfedcba9876543210ULL), + BPF_BSWAP(R0, 64), + BPF_ALU64_IMM(BPF_RSH, R0, 32), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x10325476 } }, + }, /* BPF_LDX_MEM B/H/W/DW */ { "BPF_LDX_MEM | BPF_B, base", @@ -8228,6 +8524,67 @@ static struct bpf_test tests[] = { { { 32, 0 } }, .stack_depth = 0, }, + /* BPF_LDX_MEMSX B/H/W */ + { + "BPF_LDX_MEMSX | BPF_B", + .u.insns_int = { + BPF_LD_IMM64(R1, 0xdead0000000000f0ULL), + BPF_LD_IMM64(R2, 0xfffffffffffffff0ULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), +#ifdef __BIG_ENDIAN + BPF_LDX_MEMSX(BPF_B, R0, R10, -1), +#else + BPF_LDX_MEMSX(BPF_B, R0, R10, -8), +#endif + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEMSX | BPF_H", + .u.insns_int = { + BPF_LD_IMM64(R1, 0xdead00000000f123ULL), + BPF_LD_IMM64(R2, 0xfffffffffffff123ULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), +#ifdef __BIG_ENDIAN + BPF_LDX_MEMSX(BPF_H, R0, R10, -2), +#else + BPF_LDX_MEMSX(BPF_H, R0, R10, -8), +#endif + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEMSX | BPF_W", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x00000000deadbeefULL), + BPF_LD_IMM64(R2, 0xffffffffdeadbeefULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), +#ifdef __BIG_ENDIAN + BPF_LDX_MEMSX(BPF_W, R0, R10, -4), +#else + BPF_LDX_MEMSX(BPF_W, R0, R10, -8), +#endif + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, /* BPF_STX_MEM B/H/W/DW */ { "BPF_STX_MEM | BPF_B", @@ -9474,6 +9831,20 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP32 | BPF_JA */ + { + "JMP32_JA: Unconditional jump: if (true) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_JMP32_IMM(BPF_JA, 0, 1, 0), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSLT | BPF_K */ { "JMP_JSLT_K: Signed jump: if (-2 < -1) return 1",
linux-kselftest-mirror@lists.linaro.org