Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'remove-sk-skb-caches'

Paolo Abeni says:

====================
net: remove sk skb caches

Eric noted we would be better off reverting the sk
skb caches.

MPTCP relies on such a feature, so we need a
little refactor of the MPTCP tx path before the mentioned
revert.

The first patch exposes additional TCP helpers. The 2nd patch
changes the MPTCP code to do locally the whole skb allocation
and updating, so it does not rely anymore on core TCP helpers
for that nor the sk skb cache.

As a side effect, we can make the tcp_build_frag helper static.

Finally, we can pull Eric's revert.

RFC -> v1:
- drop driver specific patch - no more needed after helper rename
- rename skb_entail -> tcp_skb_entail (Eric)
- preserve the tcp_build_frag helper, just make it static (Eric)
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+85 -149
-8
Documentation/networking/ip-sysctl.rst
··· 989 989 in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) 990 990 Default: 1000 991 991 992 - tcp_rx_skb_cache - BOOLEAN 993 - Controls a per TCP socket cache of one skb, that might help 994 - performance of some workloads. This might be dangerous 995 - on systems with a lot of TCP sockets, since it increases 996 - memory usage. 997 - 998 - Default: 0 (disabled) 999 - 1000 992 UDP variables 1001 993 ============= 1002 994
-19
include/net/sock.h
··· 262 262 * @sk_dst_cache: destination cache 263 263 * @sk_dst_pending_confirm: need to confirm neighbour 264 264 * @sk_policy: flow policy 265 - * @sk_rx_skb_cache: cache copy of recently accessed RX skb 266 265 * @sk_receive_queue: incoming packets 267 266 * @sk_wmem_alloc: transmit queue bytes committed 268 267 * @sk_tsq_flags: TCP Small Queues flags ··· 327 328 * @sk_peek_off: current peek_offset value 328 329 * @sk_send_head: front of stuff to transmit 329 330 * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] 330 - * @sk_tx_skb_cache: cache copy of recently accessed TX skb 331 331 * @sk_security: used by security modules 332 332 * @sk_mark: generic packet mark 333 333 * @sk_cgrp_data: cgroup data for this cgroup ··· 391 393 atomic_t sk_drops; 392 394 int sk_rcvlowat; 393 395 struct sk_buff_head sk_error_queue; 394 - struct sk_buff *sk_rx_skb_cache; 395 396 struct sk_buff_head sk_receive_queue; 396 397 /* 397 398 * The backlog queue is special, it is always used with ··· 439 442 struct sk_buff *sk_send_head; 440 443 struct rb_root tcp_rtx_queue; 441 444 }; 442 - struct sk_buff *sk_tx_skb_cache; 443 445 struct sk_buff_head sk_write_queue; 444 446 __s32 sk_peek_off; 445 447 int sk_write_pending; ··· 1551 1555 __sk_mem_reclaim(sk, 1 << 20); 1552 1556 } 1553 1557 1554 - DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); 1555 1558 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) 1556 1559 { 1557 1560 sk_wmem_queued_add(sk, -skb->truesize); 1558 1561 sk_mem_uncharge(sk, skb->truesize); 1559 - if (static_branch_unlikely(&tcp_tx_skb_cache_key) && 1560 - !sk->sk_tx_skb_cache && !skb_cloned(skb)) { 1561 - skb_ext_reset(skb); 1562 - skb_zcopy_clear(skb, true); 1563 - sk->sk_tx_skb_cache = skb; 1564 - return; 1565 - } 1566 1562 __kfree_skb(skb); 1567 1563 } 1568 1564 ··· 2563 2575 &skb_shinfo(skb)->tskey); 2564 2576 } 2565 2577 2566 - DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); 2567 2578 /** 2568 2579 * sk_eat_skb - Release a 
skb if it is no longer needed 2569 2580 * @sk: socket to eat this skb from ··· 2574 2587 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) 2575 2588 { 2576 2589 __skb_unlink(skb, &sk->sk_receive_queue); 2577 - if (static_branch_unlikely(&tcp_rx_skb_cache_key) && 2578 - !sk->sk_rx_skb_cache) { 2579 - sk->sk_rx_skb_cache = skb; 2580 - skb_orphan(skb); 2581 - return; 2582 - } 2583 2590 __kfree_skb(skb); 2584 2591 } 2585 2592
+2 -2
include/net/tcp.h
··· 330 330 int flags); 331 331 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, 332 332 size_t size, int flags); 333 - struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, 334 - struct page *page, int offset, size_t *size); 335 333 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, 336 334 size_t size, int flags); 337 335 int tcp_send_mss(struct sock *sk, int *size_goal, int flags); ··· 579 581 #endif 580 582 /* tcp_output.c */ 581 583 584 + void tcp_skb_entail(struct sock *sk, struct sk_buff *skb); 585 + void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb); 582 586 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, 583 587 int nonagle); 584 588 int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
-4
net/ipv4/af_inet.c
··· 133 133 struct inet_sock *inet = inet_sk(sk); 134 134 135 135 __skb_queue_purge(&sk->sk_receive_queue); 136 - if (sk->sk_rx_skb_cache) { 137 - __kfree_skb(sk->sk_rx_skb_cache); 138 - sk->sk_rx_skb_cache = NULL; 139 - } 140 136 __skb_queue_purge(&sk->sk_error_queue); 141 137 142 138 sk_mem_reclaim(sk);
-12
net/ipv4/sysctl_net_ipv4.c
··· 585 585 .extra1 = &sysctl_fib_sync_mem_min, 586 586 .extra2 = &sysctl_fib_sync_mem_max, 587 587 }, 588 - { 589 - .procname = "tcp_rx_skb_cache", 590 - .data = &tcp_rx_skb_cache_key.key, 591 - .mode = 0644, 592 - .proc_handler = proc_do_static_key, 593 - }, 594 - { 595 - .procname = "tcp_tx_skb_cache", 596 - .data = &tcp_tx_skb_cache_key.key, 597 - .mode = 0644, 598 - .proc_handler = proc_do_static_key, 599 - }, 600 588 { } 601 589 }; 602 590
+6 -32
net/ipv4/tcp.c
··· 325 325 unsigned long tcp_memory_pressure __read_mostly; 326 326 EXPORT_SYMBOL_GPL(tcp_memory_pressure); 327 327 328 - DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); 329 - EXPORT_SYMBOL(tcp_rx_skb_cache_key); 330 - 331 - DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); 332 - 333 328 void tcp_enter_memory_pressure(struct sock *sk) 334 329 { 335 330 unsigned long val; ··· 642 647 } 643 648 EXPORT_SYMBOL(tcp_ioctl); 644 649 645 - static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 650 + void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 646 651 { 647 652 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 648 653 tp->pushed_seq = tp->write_seq; ··· 653 658 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 654 659 } 655 660 656 - static void skb_entail(struct sock *sk, struct sk_buff *skb) 661 + void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) 657 662 { 658 663 struct tcp_sock *tp = tcp_sk(sk); 659 664 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); ··· 861 866 { 862 867 struct sk_buff *skb; 863 868 864 - if (likely(!size)) { 865 - skb = sk->sk_tx_skb_cache; 866 - if (skb) { 867 - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 868 - sk->sk_tx_skb_cache = NULL; 869 - pskb_trim(skb, 0); 870 - INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); 871 - skb_shinfo(skb)->tx_flags = 0; 872 - memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); 873 - return skb; 874 - } 875 - } 876 869 /* The TCP header must be at least 32-bit aligned. 
*/ 877 870 size = ALIGN(size, 4); 878 871 ··· 946 963 } 947 964 } 948 965 949 - struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, 950 - struct page *page, int offset, size_t *size) 966 + static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, 967 + struct page *page, int offset, size_t *size) 951 968 { 952 969 struct sk_buff *skb = tcp_write_queue_tail(sk); 953 970 struct tcp_sock *tp = tcp_sk(sk); ··· 968 985 #ifdef CONFIG_TLS_DEVICE 969 986 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); 970 987 #endif 971 - skb_entail(sk, skb); 988 + tcp_skb_entail(sk, skb); 972 989 copy = size_goal; 973 990 } 974 991 ··· 1297 1314 process_backlog++; 1298 1315 skb->ip_summed = CHECKSUM_PARTIAL; 1299 1316 1300 - skb_entail(sk, skb); 1317 + tcp_skb_entail(sk, skb); 1301 1318 copy = size_goal; 1302 1319 1303 1320 /* All packets are restored as if they have ··· 2903 2920 sk_wmem_free_skb(sk, skb); 2904 2921 } 2905 2922 tcp_rtx_queue_purge(sk); 2906 - skb = sk->sk_tx_skb_cache; 2907 - if (skb) { 2908 - __kfree_skb(skb); 2909 - sk->sk_tx_skb_cache = NULL; 2910 - } 2911 2923 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); 2912 2924 sk_mem_reclaim(sk); 2913 2925 tcp_clear_all_retrans_hints(tcp_sk(sk)); ··· 2939 2961 2940 2962 tcp_clear_xmit_timers(sk); 2941 2963 __skb_queue_purge(&sk->sk_receive_queue); 2942 - if (sk->sk_rx_skb_cache) { 2943 - __kfree_skb(sk->sk_rx_skb_cache); 2944 - sk->sk_rx_skb_cache = NULL; 2945 - } 2946 2964 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); 2947 2965 tp->urg_data = 0; 2948 2966 tcp_write_queue_purge(sk);
-6
net/ipv4/tcp_ipv4.c
··· 1941 1941 int tcp_v4_rcv(struct sk_buff *skb) 1942 1942 { 1943 1943 struct net *net = dev_net(skb->dev); 1944 - struct sk_buff *skb_to_free; 1945 1944 int sdif = inet_sdif(skb); 1946 1945 int dif = inet_iif(skb); 1947 1946 const struct iphdr *iph; ··· 2081 2082 tcp_segs_in(tcp_sk(sk), skb); 2082 2083 ret = 0; 2083 2084 if (!sock_owned_by_user(sk)) { 2084 - skb_to_free = sk->sk_rx_skb_cache; 2085 - sk->sk_rx_skb_cache = NULL; 2086 2085 ret = tcp_v4_do_rcv(sk, skb); 2087 2086 } else { 2088 2087 if (tcp_add_backlog(sk, skb)) 2089 2088 goto discard_and_relse; 2090 - skb_to_free = NULL; 2091 2089 } 2092 2090 bh_unlock_sock(sk); 2093 - if (skb_to_free) 2094 - __kfree_skb(skb_to_free); 2095 2091 2096 2092 put_and_return: 2097 2093 if (refcounted)
-6
net/ipv6/tcp_ipv6.c
··· 1618 1618 1619 1619 INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) 1620 1620 { 1621 - struct sk_buff *skb_to_free; 1622 1621 int sdif = inet6_sdif(skb); 1623 1622 int dif = inet6_iif(skb); 1624 1623 const struct tcphdr *th; ··· 1753 1754 tcp_segs_in(tcp_sk(sk), skb); 1754 1755 ret = 0; 1755 1756 if (!sock_owned_by_user(sk)) { 1756 - skb_to_free = sk->sk_rx_skb_cache; 1757 - sk->sk_rx_skb_cache = NULL; 1758 1757 ret = tcp_v6_do_rcv(sk, skb); 1759 1758 } else { 1760 1759 if (tcp_add_backlog(sk, skb)) 1761 1760 goto discard_and_relse; 1762 - skb_to_free = NULL; 1763 1761 } 1764 1762 bh_unlock_sock(sk); 1765 - if (skb_to_free) 1766 - __kfree_skb(skb_to_free); 1767 1763 put_and_return: 1768 1764 if (refcounted) 1769 1765 sock_put(sk);
+77 -60
net/mptcp/protocol.c
··· 1224 1224 if (likely(__mptcp_add_ext(skb, gfp))) { 1225 1225 skb_reserve(skb, MAX_TCP_HEADER); 1226 1226 skb->reserved_tailroom = skb->end - skb->tail; 1227 + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); 1227 1228 return skb; 1228 1229 } 1229 1230 __kfree_skb(skb); ··· 1234 1233 return NULL; 1235 1234 } 1236 1235 1237 - static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) 1236 + static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) 1238 1237 { 1239 1238 struct sk_buff *skb; 1240 1239 1241 - if (ssk->sk_tx_skb_cache) { 1242 - skb = ssk->sk_tx_skb_cache; 1243 - if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && 1244 - !__mptcp_add_ext(skb, gfp))) 1245 - return false; 1246 - return true; 1247 - } 1248 - 1249 1240 skb = __mptcp_do_alloc_tx_skb(sk, gfp); 1250 1241 if (!skb) 1251 - return false; 1242 + return NULL; 1252 1243 1253 1244 if (likely(sk_wmem_schedule(ssk, skb->truesize))) { 1254 - ssk->sk_tx_skb_cache = skb; 1255 - return true; 1245 + tcp_skb_entail(ssk, skb); 1246 + return skb; 1256 1247 } 1257 1248 kfree_skb(skb); 1258 - return false; 1249 + return NULL; 1259 1250 } 1260 1251 1261 - static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) 1252 + static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) 1262 1253 { 1263 1254 gfp_t gfp = data_lock_held ? 
GFP_ATOMIC : sk->sk_allocation; 1264 1255 ··· 1280 1287 struct mptcp_sendmsg_info *info) 1281 1288 { 1282 1289 u64 data_seq = dfrag->data_seq + info->sent; 1290 + int offset = dfrag->offset + info->sent; 1283 1291 struct mptcp_sock *msk = mptcp_sk(sk); 1284 1292 bool zero_window_probe = false; 1285 1293 struct mptcp_ext *mpext = NULL; 1286 - struct sk_buff *skb, *tail; 1287 - bool must_collapse = false; 1288 - int size_bias = 0; 1289 - int avail_size; 1290 - size_t ret = 0; 1294 + bool can_coalesce = false; 1295 + bool reuse_skb = true; 1296 + struct sk_buff *skb; 1297 + size_t copy; 1298 + int i; 1291 1299 1292 1300 pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", 1293 1301 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); 1294 1302 1303 + if (WARN_ON_ONCE(info->sent > info->limit || 1304 + info->limit > dfrag->data_len)) 1305 + return 0; 1306 + 1295 1307 /* compute send limit */ 1296 1308 info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); 1297 - avail_size = info->size_goal; 1309 + copy = info->size_goal; 1310 + 1298 1311 skb = tcp_write_queue_tail(ssk); 1299 - if (skb) { 1312 + if (skb && copy > skb->len) { 1300 1313 /* Limit the write to the size available in the 1301 1314 * current skb, if any, so that we create at most a new skb. 
1302 1315 * Explicitly tells TCP internals to avoid collapsing on later ··· 1315 1316 goto alloc_skb; 1316 1317 } 1317 1318 1318 - must_collapse = (info->size_goal > skb->len) && 1319 - (skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags); 1320 - if (must_collapse) { 1321 - size_bias = skb->len; 1322 - avail_size = info->size_goal - skb->len; 1319 + i = skb_shinfo(skb)->nr_frags; 1320 + can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); 1321 + if (!can_coalesce && i >= sysctl_max_skb_frags) { 1322 + tcp_mark_push(tcp_sk(ssk), skb); 1323 + goto alloc_skb; 1323 1324 } 1324 - } 1325 1325 1326 + copy -= skb->len; 1327 + } else { 1326 1328 alloc_skb: 1327 - if (!must_collapse && 1328 - !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held)) 1329 - return 0; 1329 + skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); 1330 + if (!skb) 1331 + return -ENOMEM; 1332 + 1333 + i = skb_shinfo(skb)->nr_frags; 1334 + reuse_skb = false; 1335 + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); 1336 + } 1330 1337 1331 1338 /* Zero window and all data acked? Probe. 
*/ 1332 - avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); 1333 - if (avail_size == 0) { 1339 + copy = mptcp_check_allowed_size(msk, data_seq, copy); 1340 + if (copy == 0) { 1334 1341 u64 snd_una = READ_ONCE(msk->snd_una); 1335 1342 1336 - if (skb || snd_una != msk->snd_nxt) 1343 + if (snd_una != msk->snd_nxt) { 1344 + tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk)); 1337 1345 return 0; 1346 + } 1347 + 1338 1348 zero_window_probe = true; 1339 1349 data_seq = snd_una - 1; 1340 - avail_size = 1; 1350 + copy = 1; 1351 + 1352 + /* all mptcp-level data is acked, no skbs should be present into the 1353 + * ssk write queue 1354 + */ 1355 + WARN_ON_ONCE(reuse_skb); 1341 1356 } 1342 1357 1343 - if (WARN_ON_ONCE(info->sent > info->limit || 1344 - info->limit > dfrag->data_len)) 1345 - return 0; 1346 - 1347 - ret = info->limit - info->sent; 1348 - tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, 1349 - dfrag->page, dfrag->offset + info->sent, &ret); 1350 - if (!tail) { 1351 - tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); 1358 + copy = min_t(size_t, copy, info->limit - info->sent); 1359 + if (!sk_wmem_schedule(ssk, copy)) { 1360 + tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk)); 1352 1361 return -ENOMEM; 1353 1362 } 1354 1363 1355 - /* if the tail skb is still the cached one, collapsing really happened. 
1356 - */ 1357 - if (skb == tail) { 1358 - TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; 1359 - mpext->data_len += ret; 1360 - WARN_ON_ONCE(zero_window_probe); 1361 - goto out; 1364 + if (can_coalesce) { 1365 + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1366 + } else { 1367 + get_page(dfrag->page); 1368 + skb_fill_page_desc(skb, i, dfrag->page, offset, copy); 1362 1369 } 1363 1370 1364 - mpext = skb_ext_find(tail, SKB_EXT_MPTCP); 1365 - if (WARN_ON_ONCE(!mpext)) { 1366 - /* should never reach here, stream corrupted */ 1367 - return -EINVAL; 1371 + skb->len += copy; 1372 + skb->data_len += copy; 1373 + skb->truesize += copy; 1374 + sk_wmem_queued_add(ssk, copy); 1375 + sk_mem_charge(ssk, copy); 1376 + skb->ip_summed = CHECKSUM_PARTIAL; 1377 + WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); 1378 + TCP_SKB_CB(skb)->end_seq += copy; 1379 + tcp_skb_pcount_set(skb, 0); 1380 + 1381 + /* on skb reuse we just need to update the DSS len */ 1382 + if (reuse_skb) { 1383 + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 1384 + mpext->data_len += copy; 1385 + WARN_ON_ONCE(zero_window_probe); 1386 + goto out; 1368 1387 } 1369 1388 1370 1389 memset(mpext, 0, sizeof(*mpext)); 1371 1390 mpext->data_seq = data_seq; 1372 1391 mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; 1373 - mpext->data_len = ret; 1392 + mpext->data_len = copy; 1374 1393 mpext->use_map = 1; 1375 1394 mpext->dsn64 = 1; 1376 1395 ··· 1397 1380 mpext->dsn64); 1398 1381 1399 1382 if (zero_window_probe) { 1400 - mptcp_subflow_ctx(ssk)->rel_write_seq += ret; 1383 + mptcp_subflow_ctx(ssk)->rel_write_seq += copy; 1401 1384 mpext->frozen = 1; 1402 1385 if (READ_ONCE(msk->csum_enabled)) 1403 - mptcp_update_data_checksum(tail, ret); 1386 + mptcp_update_data_checksum(skb, copy); 1404 1387 tcp_push_pending_frames(ssk); 1405 1388 return 0; 1406 1389 } 1407 1390 out: 1408 1391 if (READ_ONCE(msk->csum_enabled)) 1409 - mptcp_update_data_checksum(tail, ret); 1410 - 
mptcp_subflow_ctx(ssk)->rel_write_seq += ret; 1411 - return ret; 1392 + mptcp_update_data_checksum(skb, copy); 1393 + mptcp_subflow_ctx(ssk)->rel_write_seq += copy; 1394 + return copy; 1412 1395 } 1413 1396 1414 1397 #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \