Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: sk_msg program helper bpf_msg_push_data

This allows the user to push data into a msg using sk_msg program types.
The format is as follows:

bpf_msg_push_data(msg, offset, len, flags)

This will insert 'len' bytes at offset 'offset'. For example, to
prepend 10 bytes at the front of the message the user can call:

bpf_msg_push_data(msg, 0, 10, 0);

This will invalidate the data bounds, so the BPF user will have to recheck
the data bounds after calling this. After the call the msg size will have
been updated and the user is free to write into the added bytes. We allow
any offset/len as long as it is within the (data, data_end) range.
However, a copy will be required if the ring is full, and it's possible
for the helper to fail with ENOMEM or EINVAL errors, which need to be
handled by the BPF program.

This can be used similarly to XDP metadata to pass data between the sk_msg
layer and lower layers.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

authored by

John Fastabend and committed by
Daniel Borkmann
6fff607e 5032d079

+158 -1
+5
include/linux/skmsg.h
··· 207 207 return &msg->sg.data[which]; 208 208 } 209 209 210 + static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) 211 + { 212 + return msg->sg.data[which]; 213 + } 214 + 210 215 static inline struct page *sk_msg_page(struct sk_msg *msg, int which) 211 216 { 212 217 return sg_page(sk_msg_elem(msg, which));
+19 -1
include/uapi/linux/bpf.h
··· 2240 2240 * pointer that was returned from bpf_sk_lookup_xxx\ (). 2241 2241 * Return 2242 2242 * 0 on success, or a negative error in case of failure. 2243 + * 2244 + * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) 2245 + * Description 2246 + * For socket policies, insert *len* bytes into msg at offset 2247 + * *start*. 2248 + * 2249 + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a 2250 + * *msg* it may want to insert metadata or options into the msg. 2251 + * This can later be read and used by any of the lower layer BPF 2252 + * hooks. 2253 + * 2254 + * This helper may fail if under memory pressure (a malloc 2255 + * fails) in these cases BPF programs will get an appropriate 2256 + * error and BPF programs will need to handle them. 2257 + * 2258 + * Return 2259 + * 0 on success, or a negative error in case of failure. 2243 2260 */ 2244 2261 #define __BPF_FUNC_MAPPER(FN) \ 2245 2262 FN(unspec), \ ··· 2348 2331 FN(sk_release), \ 2349 2332 FN(map_push_elem), \ 2350 2333 FN(map_pop_elem), \ 2351 - FN(map_peek_elem), 2334 + FN(map_peek_elem), \ 2335 + FN(msg_push_data), 2352 2336 2353 2337 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2354 2338 * function eBPF program intends to call
+134
net/core/filter.c
··· 2297 2297 .arg4_type = ARG_ANYTHING, 2298 2298 }; 2299 2299 2300 + BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, 2301 + u32, len, u64, flags) 2302 + { 2303 + struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; 2304 + u32 new, i = 0, l, space, copy = 0, offset = 0; 2305 + u8 *raw, *to, *from; 2306 + struct page *page; 2307 + 2308 + if (unlikely(flags)) 2309 + return -EINVAL; 2310 + 2311 + /* First find the starting scatterlist element */ 2312 + i = msg->sg.start; 2313 + do { 2314 + l = sk_msg_elem(msg, i)->length; 2315 + 2316 + if (start < offset + l) 2317 + break; 2318 + offset += l; 2319 + sk_msg_iter_var_next(i); 2320 + } while (i != msg->sg.end); 2321 + 2322 + if (start >= offset + l) 2323 + return -EINVAL; 2324 + 2325 + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); 2326 + 2327 + /* If no space available will fallback to copy, we need at 2328 + * least one scatterlist elem available to push data into 2329 + * when start aligns to the beginning of an element or two 2330 + * when it falls inside an element. We handle the start equals 2331 + * offset case because its the common case for inserting a 2332 + * header. 
2333 + */ 2334 + if (!space || (space == 1 && start != offset)) 2335 + copy = msg->sg.data[i].length; 2336 + 2337 + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, 2338 + get_order(copy + len)); 2339 + if (unlikely(!page)) 2340 + return -ENOMEM; 2341 + 2342 + if (copy) { 2343 + int front, back; 2344 + 2345 + raw = page_address(page); 2346 + 2347 + psge = sk_msg_elem(msg, i); 2348 + front = start - offset; 2349 + back = psge->length - front; 2350 + from = sg_virt(psge); 2351 + 2352 + if (front) 2353 + memcpy(raw, from, front); 2354 + 2355 + if (back) { 2356 + from += front; 2357 + to = raw + front + len; 2358 + 2359 + memcpy(to, from, back); 2360 + } 2361 + 2362 + put_page(sg_page(psge)); 2363 + } else if (start - offset) { 2364 + psge = sk_msg_elem(msg, i); 2365 + rsge = sk_msg_elem_cpy(msg, i); 2366 + 2367 + psge->length = start - offset; 2368 + rsge.length -= psge->length; 2369 + rsge.offset += start; 2370 + 2371 + sk_msg_iter_var_next(i); 2372 + sg_unmark_end(psge); 2373 + sk_msg_iter_next(msg, end); 2374 + } 2375 + 2376 + /* Slot(s) to place newly allocated data */ 2377 + new = i; 2378 + 2379 + /* Shift one or two slots as needed */ 2380 + if (!copy) { 2381 + sge = sk_msg_elem_cpy(msg, i); 2382 + 2383 + sk_msg_iter_var_next(i); 2384 + sg_unmark_end(&sge); 2385 + sk_msg_iter_next(msg, end); 2386 + 2387 + nsge = sk_msg_elem_cpy(msg, i); 2388 + if (rsge.length) { 2389 + sk_msg_iter_var_next(i); 2390 + nnsge = sk_msg_elem_cpy(msg, i); 2391 + } 2392 + 2393 + while (i != msg->sg.end) { 2394 + msg->sg.data[i] = sge; 2395 + sge = nsge; 2396 + sk_msg_iter_var_next(i); 2397 + if (rsge.length) { 2398 + nsge = nnsge; 2399 + nnsge = sk_msg_elem_cpy(msg, i); 2400 + } else { 2401 + nsge = sk_msg_elem_cpy(msg, i); 2402 + } 2403 + } 2404 + } 2405 + 2406 + /* Place newly allocated data buffer */ 2407 + sk_mem_charge(msg->sk, len); 2408 + msg->sg.size += len; 2409 + msg->sg.copy[new] = false; 2410 + sg_set_page(&msg->sg.data[new], page, len + copy, 0); 2411 + if 
(rsge.length) { 2412 + get_page(sg_page(&rsge)); 2413 + sk_msg_iter_var_next(new); 2414 + msg->sg.data[new] = rsge; 2415 + } 2416 + 2417 + sk_msg_compute_data_pointers(msg); 2418 + return 0; 2419 + } 2420 + 2421 + static const struct bpf_func_proto bpf_msg_push_data_proto = { 2422 + .func = bpf_msg_push_data, 2423 + .gpl_only = false, 2424 + .ret_type = RET_INTEGER, 2425 + .arg1_type = ARG_PTR_TO_CTX, 2426 + .arg2_type = ARG_ANYTHING, 2427 + .arg3_type = ARG_ANYTHING, 2428 + .arg4_type = ARG_ANYTHING, 2429 + }; 2430 + 2300 2431 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) 2301 2432 { 2302 2433 return task_get_classid(skb); ··· 4985 4854 func == bpf_xdp_adjust_head || 4986 4855 func == bpf_xdp_adjust_meta || 4987 4856 func == bpf_msg_pull_data || 4857 + func == bpf_msg_push_data || 4988 4858 func == bpf_xdp_adjust_tail || 4989 4859 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 4990 4860 func == bpf_lwt_seg6_store_bytes || ··· 5262 5130 return &bpf_msg_cork_bytes_proto; 5263 5131 case BPF_FUNC_msg_pull_data: 5264 5132 return &bpf_msg_pull_data_proto; 5133 + case BPF_FUNC_msg_push_data: 5134 + return &bpf_msg_push_data_proto; 5265 5135 case BPF_FUNC_get_local_storage: 5266 5136 return &bpf_get_local_storage_proto; 5267 5137 default: