Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf, x86: allow function arguments up to 12 for TRACING

Currently, a BPF program of type BPF_PROG_TYPE_TRACING can only be attached
to kernel functions that take at most 6 arguments, not counting struct
arguments larger than 8 bytes. This is not friendly at all, as many
functions take more than 6 arguments.

In the current kernel version, the distribution of function argument
counts is as follows:

argument count | function count
7 | 704
8 | 270
9 | 84
10 | 47
11 | 47
12 | 27
13 | 22
14 | 5
15 | 0
16 | 1

Therefore, let's improve this by increasing the number of function
arguments allowed in arch_prepare_bpf_trampoline(); for now, only on x86_64.

When we don't need to call the original function, i.e. without
BPF_TRAMP_F_CALL_ORIG, we only need to copy the function arguments that
are stored in the caller's frame into the current frame. The 7th and later
arguments are stored at "$rbp + 0x18", and they are copied to the stack
area following the area where the register values are saved.

With BPF_TRAMP_F_CALL_ORIG, we need to prepare the on-stack arguments
before calling the original function, which means we need to allocate an
extra "8 * (arg_count - 6)" bytes at the top of the stack. Note that no
data may be pushed onto the stack before calling the original function.
So the 'rbx' value is stored at a stack position above the area where the
stack arguments for BPF_TRAMP_F_CALL_ORIG are stored.

According to Yonghong's research, the members of a struct are passed
either all in registers or all on the stack. Meanwhile, the compiler
passes an argument in registers if the remaining registers can hold it.
Therefore, we need to save the arguments in order; otherwise, the args can
end up out of order. For example:

  struct foo_struct {
      long a;
      int b;
  };
  int foo(char, char, char, char, char, struct foo_struct,
          char);

Here arg1-5 and arg7 are passed in registers, and arg6 is passed on the
stack. Therefore, we should save/restore the arguments in the same order as
the declaration of foo(). The args used as ctx on the stack will look like this:

reg_arg6 -- copy from regs
stack_arg2 -- copy from stack
stack_arg1
reg_arg5 -- copy from regs
reg_arg4
reg_arg3
reg_arg2
reg_arg1

We use EMIT3_off32() or EMIT4() for "lea" and "sub". The range of the
imm in "lea" and "sub" is [-128, 127] if EMIT4() is used. Therefore,
we use EMIT3_off32() instead if the imm is out of that range.

It works well for the FENTRY/FEXIT/MODIFY_RETURN.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20230713040738.1789742-3-imagedong@tencent.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Menglong Dong and committed by
Alexei Starovoitov
473e3150 02a6dfa8

+209 -26
+209 -26
arch/x86/net/bpf_jit_comp.c
··· 1857 1857 return proglen; 1858 1858 } 1859 1859 1860 - static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, 1861 - int stack_size) 1860 + static void clean_stack_garbage(const struct btf_func_model *m, 1861 + u8 **pprog, int nr_stack_slots, 1862 + int stack_size) 1862 1863 { 1863 - int i; 1864 + int arg_size, off; 1865 + u8 *prog; 1866 + 1867 + /* Generally speaking, the compiler will pass the arguments 1868 + * on-stack with "push" instruction, which will take 8-byte 1869 + * on the stack. In this case, there won't be garbage values 1870 + * while we copy the arguments from origin stack frame to current 1871 + * in BPF_DW. 1872 + * 1873 + * However, sometimes the compiler will only allocate 4-byte on 1874 + * the stack for the arguments. For now, this case will only 1875 + * happen if there is only one argument on-stack and its size 1876 + * not more than 4 byte. In this case, there will be garbage 1877 + * values on the upper 4-byte where we store the argument on 1878 + * current stack frame. 1879 + * 1880 + * arguments on origin stack: 1881 + * 1882 + * stack_arg_1(4-byte) xxx(4-byte) 1883 + * 1884 + * what we copy: 1885 + * 1886 + * stack_arg_1(8-byte): stack_arg_1(origin) xxx 1887 + * 1888 + * and the xxx is the garbage values which we should clean here. 
1889 + */ 1890 + if (nr_stack_slots != 1) 1891 + return; 1892 + 1893 + /* the size of the last argument */ 1894 + arg_size = m->arg_size[m->nr_args - 1]; 1895 + if (arg_size <= 4) { 1896 + off = -(stack_size - 4); 1897 + prog = *pprog; 1898 + /* mov DWORD PTR [rbp + off], 0 */ 1899 + if (!is_imm8(off)) 1900 + EMIT2_off32(0xC7, 0x85, off); 1901 + else 1902 + EMIT3(0xC7, 0x45, off); 1903 + EMIT(0, 4); 1904 + *pprog = prog; 1905 + } 1906 + } 1907 + 1908 + /* get the count of the regs that are used to pass arguments */ 1909 + static int get_nr_used_regs(const struct btf_func_model *m) 1910 + { 1911 + int i, arg_regs, nr_used_regs = 0; 1912 + 1913 + for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) { 1914 + arg_regs = (m->arg_size[i] + 7) / 8; 1915 + if (nr_used_regs + arg_regs <= 6) 1916 + nr_used_regs += arg_regs; 1917 + 1918 + if (nr_used_regs >= 6) 1919 + break; 1920 + } 1921 + 1922 + return nr_used_regs; 1923 + } 1924 + 1925 + static void save_args(const struct btf_func_model *m, u8 **prog, 1926 + int stack_size, bool for_call_origin) 1927 + { 1928 + int arg_regs, first_off, nr_regs = 0, nr_stack_slots = 0; 1929 + int i, j; 1864 1930 1865 1931 /* Store function arguments to stack. 1866 1932 * For a function that accepts two pointers the sequence will be: 1867 1933 * mov QWORD PTR [rbp-0x10],rdi 1868 1934 * mov QWORD PTR [rbp-0x8],rsi 1869 1935 */ 1870 - for (i = 0; i < min(nr_regs, 6); i++) 1871 - emit_stx(prog, BPF_DW, BPF_REG_FP, 1872 - i == 5 ? X86_REG_R9 : BPF_REG_1 + i, 1873 - -(stack_size - i * 8)); 1936 + for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) { 1937 + arg_regs = (m->arg_size[i] + 7) / 8; 1938 + 1939 + /* According to the research of Yonghong, struct members 1940 + * should be all in register or all on the stack. 1941 + * Meanwhile, the compiler will pass the argument on regs 1942 + * if the remaining regs can hold the argument. 1943 + * 1944 + * Disorder of the args can happen. 
For example: 1945 + * 1946 + * struct foo_struct { 1947 + * long a; 1948 + * int b; 1949 + * }; 1950 + * int foo(char, char, char, char, char, struct foo_struct, 1951 + * char); 1952 + * 1953 + * the arg1-5,arg7 will be passed by regs, and arg6 will 1954 + * by stack. 1955 + */ 1956 + if (nr_regs + arg_regs > 6) { 1957 + /* copy function arguments from origin stack frame 1958 + * into current stack frame. 1959 + * 1960 + * The starting address of the arguments on-stack 1961 + * is: 1962 + * rbp + 8(push rbp) + 1963 + * 8(return addr of origin call) + 1964 + * 8(return addr of the caller) 1965 + * which means: rbp + 24 1966 + */ 1967 + for (j = 0; j < arg_regs; j++) { 1968 + emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 1969 + nr_stack_slots * 8 + 0x18); 1970 + emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0, 1971 + -stack_size); 1972 + 1973 + if (!nr_stack_slots) 1974 + first_off = stack_size; 1975 + stack_size -= 8; 1976 + nr_stack_slots++; 1977 + } 1978 + } else { 1979 + /* Only copy the arguments on-stack to current 1980 + * 'stack_size' and ignore the regs, used to 1981 + * prepare the arguments on-stack for orign call. 1982 + */ 1983 + if (for_call_origin) { 1984 + nr_regs += arg_regs; 1985 + continue; 1986 + } 1987 + 1988 + /* copy the arguments from regs into stack */ 1989 + for (j = 0; j < arg_regs; j++) { 1990 + emit_stx(prog, BPF_DW, BPF_REG_FP, 1991 + nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs, 1992 + -stack_size); 1993 + stack_size -= 8; 1994 + nr_regs++; 1995 + } 1996 + } 1997 + } 1998 + 1999 + clean_stack_garbage(m, prog, nr_stack_slots, first_off); 1874 2000 } 1875 2001 1876 - static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, 2002 + static void restore_regs(const struct btf_func_model *m, u8 **prog, 1877 2003 int stack_size) 1878 2004 { 1879 - int i; 2005 + int i, j, arg_regs, nr_regs = 0; 1880 2006 1881 2007 /* Restore function arguments from stack. 
1882 2008 * For a function that accepts two pointers the sequence will be: 1883 2009 * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] 1884 2010 * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] 2011 + * 2012 + * The logic here is similar to what we do in save_args() 1885 2013 */ 1886 - for (i = 0; i < min(nr_regs, 6); i++) 1887 - emit_ldx(prog, BPF_DW, 1888 - i == 5 ? X86_REG_R9 : BPF_REG_1 + i, 1889 - BPF_REG_FP, 1890 - -(stack_size - i * 8)); 2014 + for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) { 2015 + arg_regs = (m->arg_size[i] + 7) / 8; 2016 + if (nr_regs + arg_regs <= 6) { 2017 + for (j = 0; j < arg_regs; j++) { 2018 + emit_ldx(prog, BPF_DW, 2019 + nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs, 2020 + BPF_REG_FP, 2021 + -stack_size); 2022 + stack_size -= 8; 2023 + nr_regs++; 2024 + } 2025 + } else { 2026 + stack_size -= 8 * arg_regs; 2027 + } 2028 + 2029 + if (nr_regs >= 6) 2030 + break; 2031 + } 1891 2032 } 1892 2033 1893 2034 static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, ··· 2056 1915 /* arg1: mov rdi, progs[i] */ 2057 1916 emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); 2058 1917 /* arg2: lea rsi, [rbp - ctx_cookie_off] */ 2059 - EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); 1918 + if (!is_imm8(-run_ctx_off)) 1919 + EMIT3_off32(0x48, 0x8D, 0xB5, -run_ctx_off); 1920 + else 1921 + EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); 2060 1922 2061 1923 if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog)) 2062 1924 return -EINVAL; ··· 2075 1931 emit_nops(&prog, 2); 2076 1932 2077 1933 /* arg1: lea rdi, [rbp - stack_size] */ 2078 - EMIT4(0x48, 0x8D, 0x7D, -stack_size); 1934 + if (!is_imm8(-stack_size)) 1935 + EMIT3_off32(0x48, 0x8D, 0xBD, -stack_size); 1936 + else 1937 + EMIT4(0x48, 0x8D, 0x7D, -stack_size); 2079 1938 /* arg2: progs[i]->insnsi for interpreter */ 2080 1939 if (!p->jited) 2081 1940 emit_mov_imm64(&prog, BPF_REG_2, ··· 2108 1961 /* arg2: mov rsi, rbx <- start time in nsec */ 
2109 1962 emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); 2110 1963 /* arg3: lea rdx, [rbp - run_ctx_off] */ 2111 - EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); 1964 + if (!is_imm8(-run_ctx_off)) 1965 + EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off); 1966 + else 1967 + EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); 2112 1968 if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog)) 2113 1969 return -EINVAL; 2114 1970 ··· 2263 2113 void *func_addr) 2264 2114 { 2265 2115 int i, ret, nr_regs = m->nr_args, stack_size = 0; 2266 - int regs_off, nregs_off, ip_off, run_ctx_off; 2116 + int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; 2267 2117 struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; 2268 2118 struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; 2269 2119 struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; ··· 2277 2127 if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) 2278 2128 nr_regs += (m->arg_size[i] + 7) / 8 - 1; 2279 2129 2280 - /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ 2281 - if (nr_regs > 6) 2130 + /* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6 2131 + * are passed through regs, the remains are through stack. 2132 + */ 2133 + if (nr_regs > MAX_BPF_FUNC_ARGS) 2282 2134 return -ENOTSUPP; 2283 2135 2284 2136 /* Generated trampoline stack layout: ··· 2299 2147 * 2300 2148 * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag 2301 2149 * 2150 + * RBP - rbx_off [ rbx value ] always 2151 + * 2302 2152 * RBP - run_ctx_off [ bpf_tramp_run_ctx ] 2153 + * 2154 + * [ stack_argN ] BPF_TRAMP_F_CALL_ORIG 2155 + * [ ... 
] 2156 + * [ stack_arg2 ] 2157 + * RBP - arg_stack_off [ stack_arg1 ] 2303 2158 */ 2304 2159 2305 2160 /* room for return value of orig_call or fentry prog */ ··· 2326 2167 2327 2168 ip_off = stack_size; 2328 2169 2170 + stack_size += 8; 2171 + rbx_off = stack_size; 2172 + 2329 2173 stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7; 2330 2174 run_ctx_off = stack_size; 2175 + 2176 + if (nr_regs > 6 && (flags & BPF_TRAMP_F_CALL_ORIG)) { 2177 + /* the space that used to pass arguments on-stack */ 2178 + stack_size += (nr_regs - get_nr_used_regs(m)) * 8; 2179 + /* make sure the stack pointer is 16-byte aligned if we 2180 + * need pass arguments on stack, which means 2181 + * [stack_size + 8(rbp) + 8(rip) + 8(origin rip)] 2182 + * should be 16-byte aligned. Following code depend on 2183 + * that stack_size is already 8-byte aligned. 2184 + */ 2185 + stack_size += (stack_size % 16) ? 0 : 8; 2186 + } 2187 + 2188 + arg_stack_off = stack_size; 2331 2189 2332 2190 if (flags & BPF_TRAMP_F_SKIP_FRAME) { 2333 2191 /* skip patched call instruction and point orig_call to actual ··· 2365 2189 x86_call_depth_emit_accounting(&prog, NULL); 2366 2190 EMIT1(0x55); /* push rbp */ 2367 2191 EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ 2368 - EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ 2369 - EMIT1(0x53); /* push rbx */ 2192 + if (!is_imm8(stack_size)) 2193 + /* sub rsp, stack_size */ 2194 + EMIT3_off32(0x48, 0x81, 0xEC, stack_size); 2195 + else 2196 + /* sub rsp, stack_size */ 2197 + EMIT4(0x48, 0x83, 0xEC, stack_size); 2198 + /* mov QWORD PTR [rbp - rbx_off], rbx */ 2199 + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off); 2370 2200 2371 2201 /* Store number of argument registers of the traced function: 2372 2202 * mov rax, nr_regs ··· 2390 2208 emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); 2391 2209 } 2392 2210 2393 - save_regs(m, &prog, nr_regs, regs_off); 2211 + save_args(m, &prog, regs_off, false); 2394 2212 2395 2213 if (flags & 
BPF_TRAMP_F_CALL_ORIG) { 2396 2214 /* arg1: mov rdi, im */ ··· 2420 2238 } 2421 2239 2422 2240 if (flags & BPF_TRAMP_F_CALL_ORIG) { 2423 - restore_regs(m, &prog, nr_regs, regs_off); 2241 + restore_regs(m, &prog, regs_off); 2242 + save_args(m, &prog, arg_stack_off, true); 2424 2243 2425 2244 if (flags & BPF_TRAMP_F_ORIG_STACK) { 2426 2245 emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); ··· 2462 2279 } 2463 2280 2464 2281 if (flags & BPF_TRAMP_F_RESTORE_REGS) 2465 - restore_regs(m, &prog, nr_regs, regs_off); 2282 + restore_regs(m, &prog, regs_off); 2466 2283 2467 2284 /* This needs to be done regardless. If there were fmod_ret programs, 2468 2285 * the return value is only updated on the stack and still needs to be ··· 2481 2298 if (save_ret) 2482 2299 emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); 2483 2300 2484 - EMIT1(0x5B); /* pop rbx */ 2301 + emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off); 2485 2302 EMIT1(0xC9); /* leave */ 2486 2303 if (flags & BPF_TRAMP_F_SKIP_FRAME) 2487 2304 /* skip our return address and return to parent */