Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: flush the softnet backlog in process context

Currently in process_backlog(), the process_queue dequeuing is
performed with local IRQ disabled, to protect against
flush_backlog(), which runs in hard IRQ context.

This patch moves the flush operation to a work queue and runs the
callback with bottom half disabled to protect the process_queue
against dequeuing.
Since process_queue is now always manipulated in bottom half context,
the irq disable/enable pair around the dequeue operation is removed.

To keep the flush time as low as possible, the flush
works are scheduled on all online CPUs simultaneously, using the
high-priority workqueue and statically allocated, per-CPU
work structs.

Overall this change increases the time required to destroy a device,
in exchange for slightly improved packet reinjection performance.

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Paolo Abeni and committed by
David S. Miller
145dd5f9 72f4af4e

+52 -24
+52 -24
net/core/dev.c
··· 4292 4292 } 4293 4293 EXPORT_SYMBOL(netif_receive_skb); 4294 4294 4295 - /* Network device is going away, flush any packets still pending 4296 - * Called with irqs disabled. 4297 - */ 4298 - static void flush_backlog(void *arg) 4299 - { 4300 - struct net_device *dev = arg; 4301 - struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4302 - struct sk_buff *skb, *tmp; 4295 + struct flush_work { 4296 + struct net_device *dev; 4297 + struct work_struct work; 4298 + }; 4303 4299 4300 + DEFINE_PER_CPU(struct flush_work, flush_works); 4301 + 4302 + /* Network device is going away, flush any packets still pending */ 4303 + static void flush_backlog(struct work_struct *work) 4304 + { 4305 + struct flush_work *flush = container_of(work, typeof(*flush), work); 4306 + struct net_device *dev = flush->dev; 4307 + struct sk_buff *skb, *tmp; 4308 + struct softnet_data *sd; 4309 + 4310 + local_bh_disable(); 4311 + sd = this_cpu_ptr(&softnet_data); 4312 + 4313 + local_irq_disable(); 4304 4314 rps_lock(sd); 4305 4315 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 4306 4316 if (skb->dev == dev) { ··· 4320 4310 } 4321 4311 } 4322 4312 rps_unlock(sd); 4313 + local_irq_enable(); 4323 4314 4324 4315 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 4325 4316 if (skb->dev == dev) { ··· 4329 4318 input_queue_head_incr(sd); 4330 4319 } 4331 4320 } 4321 + local_bh_enable(); 4322 + } 4323 + 4324 + static void flush_all_backlogs(struct net_device *dev) 4325 + { 4326 + unsigned int cpu; 4327 + 4328 + get_online_cpus(); 4329 + 4330 + for_each_online_cpu(cpu) { 4331 + struct flush_work *flush = per_cpu_ptr(&flush_works, cpu); 4332 + 4333 + INIT_WORK(&flush->work, flush_backlog); 4334 + flush->dev = dev; 4335 + queue_work_on(cpu, system_highpri_wq, &flush->work); 4336 + } 4337 + 4338 + for_each_online_cpu(cpu) 4339 + flush_work(&per_cpu_ptr(&flush_works, cpu)->work); 4340 + 4341 + put_online_cpus(); 4332 4342 } 4333 4343 4334 4344 static int napi_gro_complete(struct sk_buff *skb) ··· 
4837 4805 4838 4806 static int process_backlog(struct napi_struct *napi, int quota) 4839 4807 { 4840 - int work = 0; 4841 4808 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4809 + bool again = true; 4810 + int work = 0; 4842 4811 4843 4812 /* Check if we have pending ipi, its better to send them now, 4844 4813 * not waiting net_rx_action() end. ··· 4850 4817 } 4851 4818 4852 4819 napi->weight = weight_p; 4853 - local_irq_disable(); 4854 - while (1) { 4820 + while (again) { 4855 4821 struct sk_buff *skb; 4856 4822 4857 4823 while ((skb = __skb_dequeue(&sd->process_queue))) { 4858 4824 rcu_read_lock(); 4859 - local_irq_enable(); 4860 4825 __netif_receive_skb(skb); 4861 4826 rcu_read_unlock(); 4862 - local_irq_disable(); 4863 4827 input_queue_head_incr(sd); 4864 - if (++work >= quota) { 4865 - local_irq_enable(); 4828 + if (++work >= quota) 4866 4829 return work; 4867 - } 4830 + 4868 4831 } 4869 4832 4833 + local_irq_disable(); 4870 4834 rps_lock(sd); 4871 4835 if (skb_queue_empty(&sd->input_pkt_queue)) { 4872 4836 /* ··· 4875 4845 * and we dont need an smp_mb() memory barrier. 4876 4846 */ 4877 4847 napi->state = 0; 4878 - rps_unlock(sd); 4879 - 4880 - break; 4848 + again = false; 4849 + } else { 4850 + skb_queue_splice_tail_init(&sd->input_pkt_queue, 4851 + &sd->process_queue); 4881 4852 } 4882 - 4883 - skb_queue_splice_tail_init(&sd->input_pkt_queue, 4884 - &sd->process_queue); 4885 4853 rps_unlock(sd); 4854 + local_irq_enable(); 4886 4855 } 4887 - local_irq_enable(); 4888 4856 4889 4857 return work; 4890 4858 } ··· 6735 6707 unlist_netdevice(dev); 6736 6708 6737 6709 dev->reg_state = NETREG_UNREGISTERING; 6738 - on_each_cpu(flush_backlog, dev, 1); 6710 + flush_all_backlogs(dev); 6739 6711 } 6740 6712 6741 6713 synchronize_net();