Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'virtio-net-fix-the-deadlock-when-disabling-rx-napi'

Bui Quang Minh says:

====================
virtio-net: fix the deadlock when disabling rx NAPI

Calling napi_disable() on an already disabled napi can cause a
deadlock. In commit 4bc12818b363 ("virtio-net: disable delayed refill
when pausing rx"), to avoid the deadlock, when pausing the RX in
virtnet_rx_pause[_all](), we disable and cancel the delayed refill work.
However, in the virtnet_rx_resume_all(), we enable the delayed refill
work too early before enabling all the receive queue napis.

The deadlock can be reproduced by running
selftests/drivers/net/hw/xsk_reconfig.py with multiqueue virtio-net
device and inserting a cond_resched() inside the for loop in
virtnet_rx_resume_all() to increase the success rate. Because the worker
processing the delayed refill work runs on the same CPU as
virtnet_rx_resume_all(), a reschedule is needed to cause the deadlock.
In a real scenario, contention on netdev_lock can cause the
reschedule.

Due to the complexity of the delayed refill worker, in this series, we remove
it. When we fail to refill the receive buffer, we will retry in the next
NAPI poll instead.

- Patch 1: removes delayed refill worker schedule and retry refill
in next NAPI
- Patches 2, 3: remove and clean up unused delayed refill worker code

For testing, I've run the following tests with no issues so far:
- selftests/drivers/net/hw/xsk_reconfig.py which sets up the XDP zerocopy
without providing any descriptors to the fill ring. As a result,
try_fill_recv will always fail.
- Send TCP packets from host to guest while the guest is nearly OOM and some
try_fill_recv calls fail.

v2: https://lore.kernel.org/20260102152023.10773-1-minhquangbui99@gmail.com
v1: https://lore.kernel.org/20251223152533.24364-1-minhquangbui99@gmail.com

Link to the previous approach and discussion:
https://lore.kernel.org/20251212152741.11656-1-minhquangbui99@gmail.com
====================

Link: https://patch.msgid.link/20260106150438.7425-1-minhquangbui99@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+34 -129
+34 -129
drivers/net/virtio_net.c
··· 441 441 /* Packet virtio header size */ 442 442 u8 hdr_len; 443 443 444 - /* Work struct for delayed refilling if we run low on memory. */ 445 - struct delayed_work refill; 446 - 447 444 /* UDP tunnel support */ 448 445 bool tx_tnl; 449 446 450 447 bool rx_tnl; 451 448 452 449 bool rx_tnl_csum; 453 - 454 - /* Is delayed refill enabled? */ 455 - bool refill_enabled; 456 - 457 - /* The lock to synchronize the access to refill_enabled */ 458 - spinlock_t refill_lock; 459 450 460 451 /* Work struct for config space updates */ 461 452 struct work_struct config_work; ··· 709 718 give_pages(rq, buf); 710 719 else 711 720 put_page(virt_to_head_page(buf)); 712 - } 713 - 714 - static void enable_delayed_refill(struct virtnet_info *vi) 715 - { 716 - spin_lock_bh(&vi->refill_lock); 717 - vi->refill_enabled = true; 718 - spin_unlock_bh(&vi->refill_lock); 719 - } 720 - 721 - static void disable_delayed_refill(struct virtnet_info *vi) 722 - { 723 - spin_lock_bh(&vi->refill_lock); 724 - vi->refill_enabled = false; 725 - spin_unlock_bh(&vi->refill_lock); 726 721 } 727 722 728 723 static void enable_rx_mode_work(struct virtnet_info *vi) ··· 2925 2948 napi_disable(napi); 2926 2949 } 2927 2950 2928 - static void refill_work(struct work_struct *work) 2929 - { 2930 - struct virtnet_info *vi = 2931 - container_of(work, struct virtnet_info, refill.work); 2932 - bool still_empty; 2933 - int i; 2934 - 2935 - for (i = 0; i < vi->curr_queue_pairs; i++) { 2936 - struct receive_queue *rq = &vi->rq[i]; 2937 - 2938 - /* 2939 - * When queue API support is added in the future and the call 2940 - * below becomes napi_disable_locked, this driver will need to 2941 - * be refactored. 
2942 - * 2943 - * One possible solution would be to: 2944 - * - cancel refill_work with cancel_delayed_work (note: 2945 - * non-sync) 2946 - * - cancel refill_work with cancel_delayed_work_sync in 2947 - * virtnet_remove after the netdev is unregistered 2948 - * - wrap all of the work in a lock (perhaps the netdev 2949 - * instance lock) 2950 - * - check netif_running() and return early to avoid a race 2951 - */ 2952 - napi_disable(&rq->napi); 2953 - still_empty = !try_fill_recv(vi, rq, GFP_KERNEL); 2954 - virtnet_napi_do_enable(rq->vq, &rq->napi); 2955 - 2956 - /* In theory, this can happen: if we don't get any buffers in 2957 - * we will *never* try to fill again. 2958 - */ 2959 - if (still_empty) 2960 - schedule_delayed_work(&vi->refill, HZ/2); 2961 - } 2962 - } 2963 - 2964 2951 static int virtnet_receive_xsk_bufs(struct virtnet_info *vi, 2965 2952 struct receive_queue *rq, 2966 2953 int budget, ··· 2987 3046 else 2988 3047 packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats); 2989 3048 3049 + u64_stats_set(&stats.packets, packets); 2990 3050 if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) { 2991 - if (!try_fill_recv(vi, rq, GFP_ATOMIC)) { 2992 - spin_lock(&vi->refill_lock); 2993 - if (vi->refill_enabled) 2994 - schedule_delayed_work(&vi->refill, 0); 2995 - spin_unlock(&vi->refill_lock); 2996 - } 3051 + if (!try_fill_recv(vi, rq, GFP_ATOMIC)) 3052 + /* We need to retry refilling in the next NAPI poll so 3053 + * we must return budget to make sure the NAPI is 3054 + * repolled. 
3055 + */ 3056 + packets = budget; 2997 3057 } 2998 3058 2999 - u64_stats_set(&stats.packets, packets); 3000 3059 u64_stats_update_begin(&rq->stats.syncp); 3001 3060 for (i = 0; i < ARRAY_SIZE(virtnet_rq_stats_desc); i++) { 3002 3061 size_t offset = virtnet_rq_stats_desc[i].offset; ··· 3167 3226 struct virtnet_info *vi = netdev_priv(dev); 3168 3227 int i, err; 3169 3228 3170 - enable_delayed_refill(vi); 3171 - 3172 3229 for (i = 0; i < vi->max_queue_pairs; i++) { 3173 3230 if (i < vi->curr_queue_pairs) 3174 - /* Make sure we have some buffers: if oom use wq. */ 3175 - if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) 3176 - schedule_delayed_work(&vi->refill, 0); 3231 + /* Pre-fill rq agressively, to make sure we are ready to 3232 + * get packets immediately. 3233 + */ 3234 + try_fill_recv(vi, &vi->rq[i], GFP_KERNEL); 3177 3235 3178 3236 err = virtnet_enable_queue_pair(vi, i); 3179 3237 if (err < 0) ··· 3191 3251 return 0; 3192 3252 3193 3253 err_enable_qp: 3194 - disable_delayed_refill(vi); 3195 - cancel_delayed_work_sync(&vi->refill); 3196 - 3197 3254 for (i--; i >= 0; i--) { 3198 3255 virtnet_disable_queue_pair(vi, i); 3199 3256 virtnet_cancel_dim(vi, &vi->rq[i].dim); ··· 3369 3432 return NETDEV_TX_OK; 3370 3433 } 3371 3434 3372 - static void __virtnet_rx_pause(struct virtnet_info *vi, 3373 - struct receive_queue *rq) 3435 + static void virtnet_rx_pause(struct virtnet_info *vi, 3436 + struct receive_queue *rq) 3374 3437 { 3375 3438 bool running = netif_running(vi->dev); 3376 3439 ··· 3384 3447 { 3385 3448 int i; 3386 3449 3387 - /* 3388 - * Make sure refill_work does not run concurrently to 3389 - * avoid napi_disable race which leads to deadlock. 
3390 - */ 3391 - disable_delayed_refill(vi); 3392 - cancel_delayed_work_sync(&vi->refill); 3393 3450 for (i = 0; i < vi->max_queue_pairs; i++) 3394 - __virtnet_rx_pause(vi, &vi->rq[i]); 3451 + virtnet_rx_pause(vi, &vi->rq[i]); 3395 3452 } 3396 3453 3397 - static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) 3454 + static void virtnet_rx_resume(struct virtnet_info *vi, 3455 + struct receive_queue *rq, 3456 + bool refill) 3398 3457 { 3399 - /* 3400 - * Make sure refill_work does not run concurrently to 3401 - * avoid napi_disable race which leads to deadlock. 3402 - */ 3403 - disable_delayed_refill(vi); 3404 - cancel_delayed_work_sync(&vi->refill); 3405 - __virtnet_rx_pause(vi, rq); 3406 - } 3458 + if (netif_running(vi->dev)) { 3459 + /* Pre-fill rq agressively, to make sure we are ready to get 3460 + * packets immediately. 3461 + */ 3462 + if (refill) 3463 + try_fill_recv(vi, rq, GFP_KERNEL); 3407 3464 3408 - static void __virtnet_rx_resume(struct virtnet_info *vi, 3409 - struct receive_queue *rq, 3410 - bool refill) 3411 - { 3412 - bool running = netif_running(vi->dev); 3413 - bool schedule_refill = false; 3414 - 3415 - if (refill && !try_fill_recv(vi, rq, GFP_KERNEL)) 3416 - schedule_refill = true; 3417 - if (running) 3418 3465 virtnet_napi_enable(rq); 3419 - 3420 - if (schedule_refill) 3421 - schedule_delayed_work(&vi->refill, 0); 3466 + } 3422 3467 } 3423 3468 3424 3469 static void virtnet_rx_resume_all(struct virtnet_info *vi) 3425 3470 { 3426 3471 int i; 3427 3472 3428 - enable_delayed_refill(vi); 3429 3473 for (i = 0; i < vi->max_queue_pairs; i++) { 3430 3474 if (i < vi->curr_queue_pairs) 3431 - __virtnet_rx_resume(vi, &vi->rq[i], true); 3475 + virtnet_rx_resume(vi, &vi->rq[i], true); 3432 3476 else 3433 - __virtnet_rx_resume(vi, &vi->rq[i], false); 3477 + virtnet_rx_resume(vi, &vi->rq[i], false); 3434 3478 } 3435 - } 3436 - 3437 - static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) 3438 - { 3439 - 
enable_delayed_refill(vi); 3440 - __virtnet_rx_resume(vi, rq, true); 3441 3479 } 3442 3480 3443 3481 static int virtnet_rx_resize(struct virtnet_info *vi, ··· 3428 3516 if (err) 3429 3517 netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); 3430 3518 3431 - virtnet_rx_resume(vi, rq); 3519 + virtnet_rx_resume(vi, rq, true); 3432 3520 return err; 3433 3521 } 3434 3522 ··· 3741 3829 } 3742 3830 succ: 3743 3831 vi->curr_queue_pairs = queue_pairs; 3744 - /* virtnet_open() will refill when device is going to up. */ 3745 - spin_lock_bh(&vi->refill_lock); 3746 - if (dev->flags & IFF_UP && vi->refill_enabled) 3747 - schedule_delayed_work(&vi->refill, 0); 3748 - spin_unlock_bh(&vi->refill_lock); 3832 + if (dev->flags & IFF_UP) { 3833 + local_bh_disable(); 3834 + for (int i = 0; i < vi->curr_queue_pairs; ++i) 3835 + virtqueue_napi_schedule(&vi->rq[i].napi, vi->rq[i].vq); 3836 + local_bh_enable(); 3837 + } 3749 3838 3750 3839 return 0; 3751 3840 } ··· 3756 3843 struct virtnet_info *vi = netdev_priv(dev); 3757 3844 int i; 3758 3845 3759 - /* Make sure NAPI doesn't schedule refill work */ 3760 - disable_delayed_refill(vi); 3761 - /* Make sure refill_work doesn't re-enable napi! 
*/ 3762 - cancel_delayed_work_sync(&vi->refill); 3763 3846 /* Prevent the config change callback from changing carrier 3764 3847 * after close 3765 3848 */ ··· 5711 5802 5712 5803 virtio_device_ready(vdev); 5713 5804 5714 - enable_delayed_refill(vi); 5715 5805 enable_rx_mode_work(vi); 5716 5806 5717 5807 if (netif_running(vi->dev)) { ··· 5800 5892 5801 5893 rq->xsk_pool = pool; 5802 5894 5803 - virtnet_rx_resume(vi, rq); 5895 + virtnet_rx_resume(vi, rq, true); 5804 5896 5805 5897 if (pool) 5806 5898 return 0; ··· 6467 6559 if (!vi->rq) 6468 6560 goto err_rq; 6469 6561 6470 - INIT_DELAYED_WORK(&vi->refill, refill_work); 6471 6562 for (i = 0; i < vi->max_queue_pairs; i++) { 6472 6563 vi->rq[i].pages = NULL; 6473 6564 netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll, ··· 6808 6901 6809 6902 INIT_WORK(&vi->config_work, virtnet_config_changed_work); 6810 6903 INIT_WORK(&vi->rx_mode_work, virtnet_rx_mode_work); 6811 - spin_lock_init(&vi->refill_lock); 6812 6904 6813 6905 if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { 6814 6906 vi->mergeable_rx_bufs = true; ··· 7071 7165 net_failover_destroy(vi->failover); 7072 7166 free_vqs: 7073 7167 virtio_reset_device(vdev); 7074 - cancel_delayed_work_sync(&vi->refill); 7075 7168 free_receive_page_frags(vi); 7076 7169 virtnet_del_vqs(vi); 7077 7170 free: