Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm32, bpf: Reimplement sign-extension mov instruction

The current implementation of the mov instruction with sign extension has the
following problems:

1. It clobbers the source register if it is not stacked because it
sign extends the source and then moves it to the destination.
2. If the dst_reg is stacked, the current code doesn't write the value
back in case of 64-bit mov.
3. There is room for improvement by emitting fewer instructions.

The steps for fixing this and the instructions emitted by the JIT are explained
below with examples in all combinations:

Case A: offset == 32:
=====================

Case A.1: src and dst are stacked registers:
--------------------------------------------
1. Load src_lo into tmp_lo
2. Store tmp_lo into dst_lo
3. Sign extend tmp_lo into tmp_hi
4. Store tmp_hi to dst_hi

Example: r3 = (s32)r3
r3 is a stacked register

ldr r6, [r11, #-16] // Load r3_lo into tmp_lo
// str to dst_lo is not emitted because src_lo == dst_lo
asr r7, r6, #31 // Sign extend tmp_lo into tmp_hi
str r7, [r11, #-12] // Store tmp_hi into r3_hi

Case A.2: src is stacked but dst is not:
----------------------------------------
1. Load src_lo into dst_lo
2. Sign extend dst_lo into dst_hi

Example: r6 = (s32)r3
r6 maps to {ARM_R5, ARM_R4} and r3 is stacked

ldr r4, [r11, #-16] // Load r3_lo into r6_lo
asr r5, r4, #31 // Sign extend r6_lo into r6_hi

Case A.3: src is not stacked but dst is stacked:
------------------------------------------------
1. Store src_lo into dst_lo
2. Sign extend src_lo into tmp_hi
3. Store tmp_hi to dst_hi

Example: r3 = (s32)r6
r3 is stacked and r6 maps to {ARM_R5, ARM_R4}

str r4, [r11, #-16] // Store r6_lo to r3_lo
asr r7, r4, #31 // Sign extend r6_lo into tmp_hi
str r7, [r11, #-12] // Store tmp_hi to dest_hi

Case A.4: Both src and dst are not stacked:
-------------------------------------------
1. Mov src_lo into dst_lo
2. Sign extend src_lo into dst_hi

Example: (bf) r6 = (s32)r6
r6 maps to {ARM_R5, ARM_R4}

// Mov not emitted because dst == src
asr r5, r4, #31 // Sign extend r6_lo into r6_hi

Case B: offset != 32:
=====================

Case B.1: src and dst are stacked registers:
--------------------------------------------
1. Load src_lo into tmp_lo
2. Sign extend tmp_lo according to offset.
3. Store tmp_lo into dst_lo
4. Sign extend tmp_lo into tmp_hi
5. Store tmp_hi to dst_hi

Example: r9 = (s8)r3
r9 and r3 are both stacked registers

ldr r6, [r11, #-16] // Load r3_lo into tmp_lo
lsl r6, r6, #24 // Sign extend tmp_lo
asr r6, r6, #24 // ..
str r6, [r11, #-56] // Store tmp_lo to r9_lo
asr r7, r6, #31 // Sign extend tmp_lo to tmp_hi
str r7, [r11, #-52] // Store tmp_hi to r9_hi

Case B.2: src is stacked but dst is not:
----------------------------------------
1. Load src_lo into dst_lo
2. Sign extend dst_lo according to offset.
3. Sign extend tmp_lo into dst_hi

Example: r6 = (s8)r3
r6 maps to {ARM_R5, ARM_R4} and r3 is stacked

ldr r4, [r11, #-16] // Load r3_lo to r6_lo
lsl r4, r4, #24 // Sign extend r6_lo
asr r4, r4, #24 // ..
asr r5, r4, #31 // Sign extend r6_lo into r6_hi

Case B.3: src is not stacked but dst is stacked:
------------------------------------------------
1. Sign extend src_lo into tmp_lo according to offset.
2. Store tmp_lo into dst_lo.
3. Sign extend src_lo into tmp_hi.
4. Store tmp_hi to dst_hi.

Example: r3 = (s8)r1
r3 is stacked and r1 maps to {ARM_R3, ARM_R2}

lsl r6, r2, #24 // Sign extend r1_lo to tmp_lo
asr r6, r6, #24 // ..
str r6, [r11, #-16] // Store tmp_lo to r3_lo
asr r7, r6, #31 // Sign extend tmp_lo to tmp_hi
str r7, [r11, #-12] // Store tmp_hi to r3_hi

Case B.4: Both src and dst are not stacked:
-------------------------------------------
1. Sign extend src_lo into dst_lo according to offset.
2. Sign extend dst_lo into dst_hi.

Example: r6 = (s8)r1
r6 maps to {ARM_R5, ARM_R4} and r1 maps to {ARM_R3, ARM_R2}

lsl r4, r2, #24 // Sign extend r1_lo to r6_lo
asr r4, r4, #24 // ..
asr r5, r4, #31 // Sign extend r6_lo to r6_hi

Fixes: fc832653fa0d ("arm32, bpf: add support for sign-extension mov instruction")
Reported-by: syzbot+186522670e6722692d86@syzkaller.appspotmail.com
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Closes: https://lore.kernel.org/all/000000000000e9a8d80615163f2a@google.com
Link: https://lore.kernel.org/bpf/20240419182832.27707-1-puranjay@kernel.org

authored by

Puranjay Mohan and committed by
Daniel Borkmann
c6f48506 10541b37

+43 -13
+43 -13
arch/arm/net/bpf_jit_32.c
··· 871 871 } 872 872 873 873 /* dst = src (4 bytes)*/ 874 - static inline void emit_a32_mov_r(const s8 dst, const s8 src, const u8 off, 875 - struct jit_ctx *ctx) { 874 + static inline void emit_a32_mov_r(const s8 dst, const s8 src, struct jit_ctx *ctx) { 876 875 const s8 *tmp = bpf2a32[TMP_REG_1]; 877 876 s8 rt; 878 877 879 878 rt = arm_bpf_get_reg32(src, tmp[0], ctx); 880 - if (off && off != 32) { 881 - emit(ARM_LSL_I(rt, rt, 32 - off), ctx); 882 - emit(ARM_ASR_I(rt, rt, 32 - off), ctx); 883 - } 884 879 arm_bpf_put_reg32(dst, rt, ctx); 885 880 } 886 881 ··· 884 889 const s8 src[], 885 890 struct jit_ctx *ctx) { 886 891 if (!is64) { 887 - emit_a32_mov_r(dst_lo, src_lo, 0, ctx); 892 + emit_a32_mov_r(dst_lo, src_lo, ctx); 888 893 if (!ctx->prog->aux->verifier_zext) 889 894 /* Zero out high 4 bytes */ 890 895 emit_a32_mov_i(dst_hi, 0, ctx); 891 896 } else if (__LINUX_ARM_ARCH__ < 6 && 892 897 ctx->cpu_architecture < CPU_ARCH_ARMv5TE) { 893 898 /* complete 8 byte move */ 894 - emit_a32_mov_r(dst_lo, src_lo, 0, ctx); 895 - emit_a32_mov_r(dst_hi, src_hi, 0, ctx); 899 + emit_a32_mov_r(dst_lo, src_lo, ctx); 900 + emit_a32_mov_r(dst_hi, src_hi, ctx); 896 901 } else if (is_stacked(src_lo) && is_stacked(dst_lo)) { 897 902 const u8 *tmp = bpf2a32[TMP_REG_1]; 898 903 ··· 912 917 static inline void emit_a32_movsx_r64(const bool is64, const u8 off, const s8 dst[], const s8 src[], 913 918 struct jit_ctx *ctx) { 914 919 const s8 *tmp = bpf2a32[TMP_REG_1]; 915 - const s8 *rt; 920 + s8 rs; 921 + s8 rd; 916 922 917 - rt = arm_bpf_get_reg64(dst, tmp, ctx); 923 + if (is_stacked(dst_lo)) 924 + rd = tmp[1]; 925 + else 926 + rd = dst_lo; 927 + rs = arm_bpf_get_reg32(src_lo, rd, ctx); 928 + /* rs may be one of src[1], dst[1], or tmp[1] */ 918 929 919 - emit_a32_mov_r(dst_lo, src_lo, off, ctx); 930 + /* Sign extend rs if needed. If off == 32, lower 32-bits of src are moved to dst and sign 931 + * extension only happens in the upper 64 bits. 932 + */ 933 + if (off != 32) { 934 + /* Sign extend rs into rd */ 935 + emit(ARM_LSL_I(rd, rs, 32 - off), ctx); 936 + emit(ARM_ASR_I(rd, rd, 32 - off), ctx); 937 + } else { 938 + rd = rs; 939 + } 940 + 941 + /* Write rd to dst_lo 942 + * 943 + * Optimization: 944 + * Assume: 945 + * 1. dst == src and stacked. 946 + * 2. off == 32 947 + * 948 + * In this case src_lo was loaded into rd(tmp[1]) but rd was not sign extended as off==32. 949 + * So, we don't need to write rd back to dst_lo as they have the same value. 950 + * This saves us one str instruction. 951 + */ 952 + if (dst_lo != src_lo || off != 32) 953 + arm_bpf_put_reg32(dst_lo, rd, ctx); 954 + 920 955 if (!is64) { 921 956 if (!ctx->prog->aux->verifier_zext) 922 957 /* Zero out high 4 bytes */ 923 958 emit_a32_mov_i(dst_hi, 0, ctx); 924 959 } else { 925 - emit(ARM_ASR_I(rt[0], rt[1], 31), ctx); 960 + if (is_stacked(dst_hi)) { 961 + emit(ARM_ASR_I(tmp[0], rd, 31), ctx); 962 + arm_bpf_put_reg32(dst_hi, tmp[0], ctx); 963 + } else { 964 + emit(ARM_ASR_I(dst_hi, rd, 31), ctx); 965 + } 926 966 } 927 967 } 928 968