Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: sk_msg program helper bpf_sk_msg_pull_data

Currently, when a BPF sk_msg program runs, it can only
parse the data already covered by the (start,end) pointers.
For sendmsg hooks this is likely the first
scatterlist element. For sendpage this will be the range
(0,0) because the data is shared with userspace and by
default we want to avoid allowing userspace to modify
data while (or after) the BPF verdict is being decided.

To support pulling in additional bytes for parsing, use
a new helper bpf_msg_pull_data(msg, start, end, flags) which
works similarly to the cls tc logic. This helper will attempt
to point the data start pointer at 'start' bytes offset
into msg and the data end pointer at 'end' bytes offset into
msg.

After basic sanity checks to ensure 'start' < 'end' and
'end' <= msg_length, there are a few cases we need to
handle.

First, the sendmsg hook has already copied the data from
userspace and has exclusive access to it. Therefore, it
is not always necessary to copy the data, although a copy
may still be required. After finding the scatterlist element with
the 'start' offset byte in it there are two cases. In the first,
the range (start,end) is entirely contained in the sg element
and is already linear. All that is needed is to update the
data pointers; no allocate/copy is needed. In the other case
(start, end) crosses sg element boundaries. In this
case we allocate a block of at least 'end - start' bytes
(whole sg elements are copied) and copy the data to linearize it.

Next, the sendpage hook has not copied any data in its initial
state, so the data pointers are (0,0). In this case we
handle it similarly to the above sendmsg case, except the
allocation/copy must always happen. Then when sending
the data we have possibly three memory regions that
need to be sent, (0, start - 1), (start, end), and
(end + 1, msg_length). This is required to ensure any
writes by the BPF program are correctly transmitted.

Lastly this operation will invalidate any previous
data checks so BPF programs will have to revalidate
pointers after making this BPF call.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

authored by

John Fastabend and committed by
Daniel Borkmann
015632bb 91843d54

+136 -2
+2 -1
include/uapi/linux/bpf.h
··· 793 793 FN(sock_ops_cb_flags_set), \ 794 794 FN(msg_redirect_map), \ 795 795 FN(msg_apply_bytes), \ 796 - FN(msg_cork_bytes), 796 + FN(msg_cork_bytes), \ 797 + FN(msg_pull_data), 797 798 798 799 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 799 800 * function eBPF program intends to call
+134 -1
net/core/filter.c
··· 1956 1956 .arg2_type = ARG_ANYTHING, 1957 1957 }; 1958 1958 1959 + BPF_CALL_4(bpf_msg_pull_data, 1960 + struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) 1961 + { 1962 + unsigned int len = 0, offset = 0, copy = 0; 1963 + struct scatterlist *sg = msg->sg_data; 1964 + int first_sg, last_sg, i, shift; 1965 + unsigned char *p, *to, *from; 1966 + int bytes = end - start; 1967 + struct page *page; 1968 + 1969 + if (unlikely(flags || end <= start)) 1970 + return -EINVAL; 1971 + 1972 + /* First find the starting scatterlist element */ 1973 + i = msg->sg_start; 1974 + do { 1975 + len = sg[i].length; 1976 + offset += len; 1977 + if (start < offset + len) 1978 + break; 1979 + i++; 1980 + if (i == MAX_SKB_FRAGS) 1981 + i = 0; 1982 + } while (i != msg->sg_end); 1983 + 1984 + if (unlikely(start >= offset + len)) 1985 + return -EINVAL; 1986 + 1987 + if (!msg->sg_copy[i] && bytes <= len) 1988 + goto out; 1989 + 1990 + first_sg = i; 1991 + 1992 + /* At this point we need to linearize multiple scatterlist 1993 + * elements or a single shared page. Either way we need to 1994 + * copy into a linear buffer exclusively owned by BPF. Then 1995 + * place the buffer in the scatterlist and fixup the original 1996 + * entries by removing the entries now in the linear buffer 1997 + * and shifting the remaining entries. For now we do not try 1998 + * to copy partial entries to avoid complexity of running out 1999 + * of sg_entry slots. The downside is reading a single byte 2000 + * will copy the entire sg entry. 
2001 + */ 2002 + do { 2003 + copy += sg[i].length; 2004 + i++; 2005 + if (i == MAX_SKB_FRAGS) 2006 + i = 0; 2007 + if (bytes < copy) 2008 + break; 2009 + } while (i != msg->sg_end); 2010 + last_sg = i; 2011 + 2012 + if (unlikely(copy < end - start)) 2013 + return -EINVAL; 2014 + 2015 + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); 2016 + if (unlikely(!page)) 2017 + return -ENOMEM; 2018 + p = page_address(page); 2019 + offset = 0; 2020 + 2021 + i = first_sg; 2022 + do { 2023 + from = sg_virt(&sg[i]); 2024 + len = sg[i].length; 2025 + to = p + offset; 2026 + 2027 + memcpy(to, from, len); 2028 + offset += len; 2029 + sg[i].length = 0; 2030 + put_page(sg_page(&sg[i])); 2031 + 2032 + i++; 2033 + if (i == MAX_SKB_FRAGS) 2034 + i = 0; 2035 + } while (i != last_sg); 2036 + 2037 + sg[first_sg].length = copy; 2038 + sg_set_page(&sg[first_sg], page, copy, 0); 2039 + 2040 + /* To repair sg ring we need to shift entries. If we only 2041 + * had a single entry though we can just replace it and 2042 + * be done. Otherwise walk the ring and shift the entries. 
2043 + */ 2044 + shift = last_sg - first_sg - 1; 2045 + if (!shift) 2046 + goto out; 2047 + 2048 + i = first_sg + 1; 2049 + do { 2050 + int move_from; 2051 + 2052 + if (i + shift >= MAX_SKB_FRAGS) 2053 + move_from = i + shift - MAX_SKB_FRAGS; 2054 + else 2055 + move_from = i + shift; 2056 + 2057 + if (move_from == msg->sg_end) 2058 + break; 2059 + 2060 + sg[i] = sg[move_from]; 2061 + sg[move_from].length = 0; 2062 + sg[move_from].page_link = 0; 2063 + sg[move_from].offset = 0; 2064 + 2065 + i++; 2066 + if (i == MAX_SKB_FRAGS) 2067 + i = 0; 2068 + } while (1); 2069 + msg->sg_end -= shift; 2070 + if (msg->sg_end < 0) 2071 + msg->sg_end += MAX_SKB_FRAGS; 2072 + out: 2073 + msg->data = sg_virt(&sg[i]) + start - offset; 2074 + msg->data_end = msg->data + bytes; 2075 + 2076 + return 0; 2077 + } 2078 + 2079 + static const struct bpf_func_proto bpf_msg_pull_data_proto = { 2080 + .func = bpf_msg_pull_data, 2081 + .gpl_only = false, 2082 + .ret_type = RET_INTEGER, 2083 + .arg1_type = ARG_PTR_TO_CTX, 2084 + .arg2_type = ARG_ANYTHING, 2085 + .arg3_type = ARG_ANYTHING, 2086 + .arg4_type = ARG_ANYTHING, 2087 + }; 2088 + 1959 2089 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) 1960 2090 { 1961 2091 return task_get_classid(skb); ··· 3027 2897 func == bpf_l3_csum_replace || 3028 2898 func == bpf_l4_csum_replace || 3029 2899 func == bpf_xdp_adjust_head || 3030 - func == bpf_xdp_adjust_meta) 2900 + func == bpf_xdp_adjust_meta || 2901 + func == bpf_msg_pull_data) 3031 2902 return true; 3032 2903 3033 2904 return false; ··· 3797 3666 return &bpf_msg_apply_bytes_proto; 3798 3667 case BPF_FUNC_msg_cork_bytes: 3799 3668 return &bpf_msg_cork_bytes_proto; 3669 + case BPF_FUNC_msg_pull_data: 3670 + return &bpf_msg_pull_data_proto; 3800 3671 default: 3801 3672 return bpf_base_func_proto(func_id); 3802 3673 }