Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'net-skb-defer-freeing-polish'

Eric Dumazet says:

====================
net: polish skb defer freeing

While testing this recently added feature on a variety
of platforms/configurations, I found the following issues:

1) A race leading to concurrent calls to smp_call_function_single_async()

2) Missed opportunity to use napi_consume_skb()

3) Need to limit the max length of the per-cpu lists.

4) Process the per-cpu list more frequently, for the
(unusual) case where net_rx_action() has multiple
napi_poll() to process per round.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+38 -14
+8
Documentation/admin-guide/sysctl/net.rst
··· 322 322 warnings on slow/loaded systems. 323 323 Default value is 10, minimum 1, maximum 3600. 324 324 325 + skb_defer_max 326 + ------------- 327 + 328 + Max size (in skbs) of the per-cpu list of skbs being freed 329 + by the cpu which allocated them. Used by TCP stack so far. 330 + 331 + Default: 64 332 + 325 333 optmem_max 326 334 ---------- 327 335
+1
include/linux/netdevice.h
··· 3136 3136 /* Another possibly contended cache line */ 3137 3137 spinlock_t defer_lock ____cacheline_aligned_in_smp; 3138 3138 int defer_count; 3139 + int defer_ipi_scheduled; 3139 3140 struct sk_buff *defer_list; 3140 3141 call_single_data_t defer_csd; 3141 3142 };
+10 -5
net/core/dev.c
··· 4330 4330 EXPORT_SYMBOL(netdev_max_backlog); 4331 4331 4332 4332 int netdev_tstamp_prequeue __read_mostly = 1; 4333 + unsigned int sysctl_skb_defer_max __read_mostly = 64; 4333 4334 int netdev_budget __read_mostly = 300; 4334 4335 /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ 4335 4336 unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; ··· 4583 4582 #endif /* CONFIG_RPS */ 4584 4583 4585 4584 /* Called from hardirq (IPI) context */ 4586 - static void trigger_rx_softirq(void *data __always_unused) 4585 + static void trigger_rx_softirq(void *data) 4587 4586 { 4587 + struct softnet_data *sd = data; 4588 + 4588 4589 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 4590 + smp_store_release(&sd->defer_ipi_scheduled, 0); 4589 4591 } 4590 4592 4591 4593 /* ··· 6634 6630 6635 6631 while (skb != NULL) { 6636 6632 next = skb->next; 6637 - __kfree_skb(skb); 6633 + napi_consume_skb(skb, 1); 6638 6634 skb = next; 6639 6635 } 6640 6636 } ··· 6654 6650 6655 6651 for (;;) { 6656 6652 struct napi_struct *n; 6653 + 6654 + skb_defer_free_flush(sd); 6657 6655 6658 6656 if (list_empty(&list)) { 6659 6657 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) ··· 6686 6680 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 6687 6681 6688 6682 net_rps_action_and_irq_enable(sd); 6689 - end: 6690 - skb_defer_free_flush(sd); 6683 + end:; 6691 6684 } 6692 6685 6693 6686 struct netdev_adjacent { ··· 11387 11382 INIT_CSD(&sd->csd, rps_trigger_softirq, sd); 11388 11383 sd->cpu = i; 11389 11384 #endif 11390 - INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL); 11385 + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); 11391 11386 spin_lock_init(&sd->defer_lock); 11392 11387 11393 11388 init_gro_hash(&sd->backlog);
+1 -1
net/core/dev.h
··· 39 39 /* sysctls not referred to from outside net/core/ */ 40 40 extern int netdev_budget; 41 41 extern unsigned int netdev_budget_usecs; 42 - 42 + extern unsigned int sysctl_skb_defer_max; 43 43 extern int netdev_tstamp_prequeue; 44 44 extern int netdev_unregister_timeout_secs; 45 45 extern int weight_p;
+10 -8
net/core/skbuff.c
··· 80 80 #include <linux/user_namespace.h> 81 81 #include <linux/indirect_call_wrapper.h> 82 82 83 + #include "dev.h" 83 84 #include "sock_destructor.h" 84 85 85 86 struct kmem_cache *skbuff_head_cache __ro_after_init; ··· 6497 6496 int cpu = skb->alloc_cpu; 6498 6497 struct softnet_data *sd; 6499 6498 unsigned long flags; 6499 + unsigned int defer_max; 6500 6500 bool kick; 6501 6501 6502 6502 if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || 6503 6503 !cpu_online(cpu) || 6504 6504 cpu == raw_smp_processor_id()) { 6505 - __kfree_skb(skb); 6505 + nodefer: __kfree_skb(skb); 6506 6506 return; 6507 6507 } 6508 6508 6509 6509 sd = &per_cpu(softnet_data, cpu); 6510 + defer_max = READ_ONCE(sysctl_skb_defer_max); 6511 + if (READ_ONCE(sd->defer_count) >= defer_max) 6512 + goto nodefer; 6513 + 6510 6514 /* We do not send an IPI or any signal. 6511 6515 * Remote cpu will eventually call skb_defer_free_flush() 6512 6516 */ ··· 6521 6515 WRITE_ONCE(sd->defer_list, skb); 6522 6516 sd->defer_count++; 6523 6517 6524 - /* kick every time queue length reaches 128. 6525 - * This should avoid blocking in smp_call_function_single_async(). 6526 - * This condition should hardly be bit under normal conditions, 6527 - * unless cpu suddenly stopped to receive NIC interrupts. 6528 - */ 6529 - kick = sd->defer_count == 128; 6518 + /* Send an IPI every time queue reaches half capacity. */ 6519 + kick = sd->defer_count == (defer_max >> 1); 6530 6520 6531 6521 spin_unlock_irqrestore(&sd->defer_lock, flags); 6532 6522 6533 6523 /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU 6534 6524 * if we are unlucky enough (this seems very unlikely). 6535 6525 */ 6536 - if (unlikely(kick)) 6526 + if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) 6537 6527 smp_call_function_single_async(cpu, &sd->defer_csd); 6538 6528 }
+8
net/core/sysctl_net_core.c
··· 578 578 .extra1 = SYSCTL_ONE, 579 579 .extra2 = &int_3600, 580 580 }, 581 + { 582 + .procname = "skb_defer_max", 583 + .data = &sysctl_skb_defer_max, 584 + .maxlen = sizeof(unsigned int), 585 + .mode = 0644, 586 + .proc_handler = proc_dointvec_minmax, 587 + .extra1 = SYSCTL_ZERO, 588 + }, 581 589 { } 582 590 }; 583 591