Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: add support for skbs with unreadable frags

For device memory TCP, we expect the skb headers to be available in host
memory for access, and we expect the skb frags to be in device memory
and inaccessible to the host. We expect there to be no mixing and
matching of device memory frags (inaccessible) with host memory frags
(accessible) in the same skb.

Add an skb->unreadable flag which indicates whether the frags in this skb
are device memory frags or not.

__skb_fill_netmem_desc() now checks frags added to skbs for net_iov,
and sets skb->unreadable accordingly.

Add checks through the network stack to avoid accessing the frags of
devmem skbs and avoid coalescing devmem skbs with non devmem skbs.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-9-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Mina Almasry and committed by Jakub Kicinski
(commit 65249feb, parent 9f6b619e).

+89 -11
+17 -2
include/linux/skbuff.h
··· 827 827 * @csum_level: indicates the number of consecutive checksums found in 828 828 * the packet minus one that have been verified as 829 829 * CHECKSUM_UNNECESSARY (max 3) 830 + * @unreadable: indicates that at least 1 of the fragments in this skb is 831 + * unreadable. 830 832 * @dst_pending_confirm: need to confirm neighbour 831 833 * @decrypted: Decrypted SKB 832 834 * @slow_gro: state present at GRO time, slower prepare step required ··· 1010 1008 #if IS_ENABLED(CONFIG_IP_SCTP) 1011 1009 __u8 csum_not_inet:1; 1012 1010 #endif 1013 - 1011 + __u8 unreadable:1; 1014 1012 #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) 1015 1013 __u16 tc_index; /* traffic control index */ 1016 1014 #endif ··· 1826 1824 __skb_zcopy_downgrade_managed(skb); 1827 1825 } 1828 1826 1827 + /* Return true if frags in this skb are readable by the host. */ 1828 + static inline bool skb_frags_readable(const struct sk_buff *skb) 1829 + { 1830 + return !skb->unreadable; 1831 + } 1832 + 1829 1833 static inline void skb_mark_not_on_list(struct sk_buff *skb) 1830 1834 { 1831 1835 skb->next = NULL; ··· 2548 2540 static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i, 2549 2541 netmem_ref netmem, int off, int size) 2550 2542 { 2551 - struct page *page = netmem_to_page(netmem); 2543 + struct page *page; 2552 2544 2553 2545 __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size); 2546 + 2547 + if (netmem_is_net_iov(netmem)) { 2548 + skb->unreadable = true; 2549 + return; 2550 + } 2551 + 2552 + page = netmem_to_page(netmem); 2554 2553 2555 2554 /* Propagate page pfmemalloc to the skb if we can. The problem is 2556 2555 * that not all callers have unique ownership of the page but rely
+2 -1
include/net/tcp.h
··· 1069 1069 /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */ 1070 1070 return likely(tcp_skb_can_collapse_to(to) && 1071 1071 mptcp_skb_can_collapse(to, from) && 1072 - skb_pure_zcopy_same(to, from)); 1072 + skb_pure_zcopy_same(to, from) && 1073 + skb_frags_readable(to) == skb_frags_readable(from)); 1073 1074 } 1074 1075 1075 1076 static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to,
+6
net/core/datagram.c
··· 407 407 return 0; 408 408 } 409 409 410 + if (!skb_frags_readable(skb)) 411 + goto short_copy; 412 + 410 413 /* Copy paged appendix. Hmm... why does this look so complicated? */ 411 414 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 412 415 int end; ··· 625 622 struct iov_iter *from, size_t length) 626 623 { 627 624 int frag = skb_shinfo(skb)->nr_frags; 625 + 626 + if (!skb_frags_readable(skb)) 627 + return -EFAULT; 628 628 629 629 while (length && iov_iter_count(from)) { 630 630 struct page *head, *last_head = NULL;
+4
net/core/dev.c
··· 3312 3312 return -EINVAL; 3313 3313 } 3314 3314 3315 + if (!skb_frags_readable(skb)) { 3316 + return -EFAULT; 3317 + } 3318 + 3315 3319 /* Before computing a checksum, we should make sure no frag could 3316 3320 * be modified by an external entity : checksum could be wrong. 3317 3321 */
+41 -2
net/core/skbuff.c
··· 1972 1972 if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) 1973 1973 return -EINVAL; 1974 1974 1975 + if (!skb_frags_readable(skb)) 1976 + return -EFAULT; 1977 + 1975 1978 if (!num_frags) 1976 1979 goto release; 1977 1980 ··· 2147 2144 struct sk_buff *n; 2148 2145 unsigned int size; 2149 2146 int headerlen; 2147 + 2148 + if (!skb_frags_readable(skb)) 2149 + return NULL; 2150 2150 2151 2151 if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) 2152 2152 return NULL; ··· 2488 2482 int head_copy_len, head_copy_off; 2489 2483 struct sk_buff *n; 2490 2484 int oldheadroom; 2485 + 2486 + if (!skb_frags_readable(skb)) 2487 + return NULL; 2491 2488 2492 2489 if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) 2493 2490 return NULL; ··· 2836 2827 */ 2837 2828 int i, k, eat = (skb->tail + delta) - skb->end; 2838 2829 2830 + if (!skb_frags_readable(skb)) 2831 + return NULL; 2832 + 2839 2833 if (eat > 0 || skb_cloned(skb)) { 2840 2834 if (pskb_expand_head(skb, 0, eat > 0 ? 
eat + 128 : 0, 2841 2835 GFP_ATOMIC)) ··· 2991 2979 offset += copy; 2992 2980 to += copy; 2993 2981 } 2982 + 2983 + if (!skb_frags_readable(skb)) 2984 + goto fault; 2994 2985 2995 2986 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2996 2987 int end; ··· 3183 3168 /* 3184 3169 * then map the fragments 3185 3170 */ 3171 + if (!skb_frags_readable(skb)) 3172 + return false; 3173 + 3186 3174 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 3187 3175 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 3188 3176 ··· 3409 3391 from += copy; 3410 3392 } 3411 3393 3394 + if (!skb_frags_readable(skb)) 3395 + goto fault; 3396 + 3412 3397 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3413 3398 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3414 3399 int end; ··· 3490 3469 offset += copy; 3491 3470 pos = copy; 3492 3471 } 3472 + 3473 + if (WARN_ON_ONCE(!skb_frags_readable(skb))) 3474 + return 0; 3493 3475 3494 3476 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3495 3477 int end; ··· 3593 3569 to += copy; 3594 3570 pos = copy; 3595 3571 } 3572 + 3573 + if (!skb_frags_readable(skb)) 3574 + return 0; 3596 3575 3597 3576 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3598 3577 int end; ··· 4088 4061 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 4089 4062 4090 4063 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 4064 + skb1->unreadable = skb->unreadable; 4091 4065 skb_shinfo(skb)->nr_frags = 0; 4092 4066 skb1->data_len = skb->data_len; 4093 4067 skb1->len += skb1->data_len; ··· 4136 4108 pos += size; 4137 4109 } 4138 4110 skb_shinfo(skb1)->nr_frags = k; 4111 + 4112 + skb1->unreadable = skb->unreadable; 4139 4113 } 4140 4114 4141 4115 /** ··· 4374 4344 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 4375 4345 return block_limit - abs_offset; 4376 4346 } 4347 + 4348 + if (!skb_frags_readable(st->cur_skb)) 4349 + return 0; 4377 4350 4378 4351 if (st->frag_idx == 0 && !st->frag_data) 4379 4352 st->stepped_offset += 
skb_headlen(st->cur_skb); ··· 6025 5992 if (to->pp_recycle != from->pp_recycle) 6026 5993 return false; 6027 5994 6028 - if (len <= skb_tailroom(to)) { 5995 + if (skb_frags_readable(from) != skb_frags_readable(to)) 5996 + return false; 5997 + 5998 + if (len <= skb_tailroom(to) && skb_frags_readable(from)) { 6029 5999 if (len) 6030 6000 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 6031 6001 *delta_truesize = 0; ··· 6204 6168 { 6205 6169 if (!pskb_may_pull(skb, write_len)) 6206 6170 return -ENOMEM; 6171 + 6172 + if (!skb_frags_readable(skb)) 6173 + return -EFAULT; 6207 6174 6208 6175 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 6209 6176 return 0; ··· 6887 6848 { 6888 6849 if (skb->data_len) { 6889 6850 if (skb->data_len > skb->end - skb->tail || 6890 - skb_cloned(skb)) 6851 + skb_cloned(skb) || !skb_frags_readable(skb)) 6891 6852 return; 6892 6853 6893 6854 /* Nice, we can free page frag(s) right now */
+3
net/ipv4/tcp.c
··· 2160 2160 skb = tcp_recv_skb(sk, seq, &offset); 2161 2161 } 2162 2162 2163 + if (!skb_frags_readable(skb)) 2164 + break; 2165 + 2163 2166 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2164 2167 tcp_update_recv_tstamps(skb, tss); 2165 2168 zc->msg_flags |= TCP_CMSG_TS;
+10 -3
net/ipv4/tcp_input.c
··· 5391 5391 for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { 5392 5392 n = tcp_skb_next(skb, list); 5393 5393 5394 + if (!skb_frags_readable(skb)) 5395 + goto skip_this; 5396 + 5394 5397 /* No new bits? It is possible on ofo queue. */ 5395 5398 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 5396 5399 skb = tcp_collapse_one(sk, skb, list, root); ··· 5414 5411 break; 5415 5412 } 5416 5413 5417 - if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) && 5414 + if (n && n != tail && skb_frags_readable(n) && 5415 + tcp_skb_can_collapse_rx(skb, n) && 5418 5416 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { 5419 5417 end_of_skbs = false; 5420 5418 break; 5421 5419 } 5422 5420 5421 + skip_this: 5423 5422 /* Decided to skip this, advance start seq. */ 5424 5423 start = TCP_SKB_CB(skb)->end_seq; 5425 5424 } 5426 5425 if (end_of_skbs || 5427 - (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 5426 + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || 5427 + !skb_frags_readable(skb)) 5428 5428 return; 5429 5429 5430 5430 __skb_queue_head_init(&tmp); ··· 5469 5463 if (!skb || 5470 5464 skb == tail || 5471 5465 !tcp_skb_can_collapse_rx(nskb, skb) || 5472 - (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 5466 + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || 5467 + !skb_frags_readable(skb)) 5473 5468 goto end; 5474 5469 } 5475 5470 }
+4 -1
net/ipv4/tcp_output.c
··· 2344 2344 2345 2345 if (unlikely(TCP_SKB_CB(skb)->eor) || 2346 2346 tcp_has_tx_tstamp(skb) || 2347 - !skb_pure_zcopy_same(skb, next)) 2347 + !skb_pure_zcopy_same(skb, next) || 2348 + skb_frags_readable(skb) != skb_frags_readable(next)) 2348 2349 return false; 2349 2350 2350 2351 len -= skb->len; ··· 3264 3263 if (tcp_skb_pcount(skb) > 1) 3265 3264 return false; 3266 3265 if (skb_cloned(skb)) 3266 + return false; 3267 + if (!skb_frags_readable(skb)) 3267 3268 return false; 3268 3269 /* Some heuristics for collapsing over SACK'd could be invented */ 3269 3270 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+2 -2
net/packet/af_packet.c
··· 2216 2216 } 2217 2217 } 2218 2218 2219 - snaplen = skb->len; 2219 + snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); 2220 2220 2221 2221 res = run_filter(skb, sk, snaplen); 2222 2222 if (!res) ··· 2336 2336 } 2337 2337 } 2338 2338 2339 - snaplen = skb->len; 2339 + snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); 2340 2340 2341 2341 res = run_filter(skb, sk, snaplen); 2342 2342 if (!res)