blk-mq: move cpuhp callback registering out of q->sysfs_lock

Registering and unregistering a cpuhp callback requires the global CPU
hotplug lock, which is taken in many places throughout the kernel. Meanwhile,
q->sysfs_lock is used almost everywhere in the block layer.

Connecting these two locks makes it easy to trigger a lockdep warning[1].
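
For illustration only, the inverted ordering looks roughly like the sketch
below. The sketch is not taken from the report in [1]; path B stands for any
code that already holds the CPU hotplug lock and, directly or via intermediate
locks, ends up needing q->sysfs_lock:

  /* Path A: blk-mq hctx setup before this patch, under q->sysfs_lock */
  mutex_lock(&q->sysfs_lock);
  cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD,
                                   &hctx->cpuhp_dead); /* takes cpu_hotplug_lock */
  mutex_unlock(&q->sysfs_lock);

  /* Path B: some other context that nests the locks the other way round */
  cpus_read_lock();             /* takes cpu_hotplug_lock */
  /* ... eventually acquires q->sysfs_lock ... */
  cpus_read_unlock();

  /*
   * lockdep records q->sysfs_lock -> cpu_hotplug_lock from path A and
   * cpu_hotplug_lock -> ... -> q->sysfs_lock from path B, so it reports a
   * possible circular locking dependency.
   */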

Fix the warning by moving blk-mq's cpuhp callback registration out of
q->sysfs_lock. Add a dedicated global lock to cover registering and
unregistering an hctx's cpuhp callbacks; this is safe because the hctx is
guaranteed to be live as long as its request_queue is live.
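
In condensed form (taken from the last hunk of the diff below), hw queue
setup/teardown still runs under q->sysfs_lock, but the cpuhp work is done
only after that lock has been dropped, so the two locks are never nested:

  mutex_lock(&q->sysfs_lock);
  /* ... realloc hw queues, exit unused hctxs ... */
  mutex_unlock(&q->sysfs_lock);

  /* both helpers serialize on the new global blk_mq_cpuhp_lock */
  blk_mq_remove_hw_queues_cpuhp(q);     /* unregister cpuhp for exited hctxs */
  blk_mq_add_hw_queues_cpuhp(q);        /* register cpuhp for the live hctxs */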

[1] https://lore.kernel.org/lkml/Z04pz3AlvI4o0Mr8@agluck-desk3/

Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Peter Newman <peternewman@google.com>
Cc: Babu Moger <babu.moger@amd.com>
Reported-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20241206111611.978870-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

---
 block/blk-mq.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 92 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -43,6 +43,7 @@
 
 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
 static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
+static DEFINE_MUTEX(blk_mq_cpuhp_lock);
 
 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
 static void blk_mq_request_bypass_insert(struct request *rq,
@@ -3740,13 +3739,91 @@
 	return 0;
 }
 
-static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
+static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
-	if (!(hctx->flags & BLK_MQ_F_STACKING))
+	lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+	if (!(hctx->flags & BLK_MQ_F_STACKING) &&
+	    !hlist_unhashed(&hctx->cpuhp_online)) {
 		cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
 						    &hctx->cpuhp_online);
-	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
-					    &hctx->cpuhp_dead);
+		INIT_HLIST_NODE(&hctx->cpuhp_online);
+	}
+
+	if (!hlist_unhashed(&hctx->cpuhp_dead)) {
+		cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+						    &hctx->cpuhp_dead);
+		INIT_HLIST_NODE(&hctx->cpuhp_dead);
+	}
+}
+
+static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
+{
+	mutex_lock(&blk_mq_cpuhp_lock);
+	__blk_mq_remove_cpuhp(hctx);
+	mutex_unlock(&blk_mq_cpuhp_lock);
+}
+
+static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx)
+{
+	lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+	if (!(hctx->flags & BLK_MQ_F_STACKING) &&
+	    hlist_unhashed(&hctx->cpuhp_online))
+		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+				&hctx->cpuhp_online);
+
+	if (hlist_unhashed(&hctx->cpuhp_dead))
+		cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+				&hctx->cpuhp_dead);
+}
+
+static void __blk_mq_remove_cpuhp_list(struct list_head *head)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+	list_for_each_entry(hctx, head, hctx_list)
+		__blk_mq_remove_cpuhp(hctx);
+}
+
+/*
+ * Unregister cpuhp callbacks from exited hw queues
+ *
+ * Safe to call if this `request_queue` is live
+ */
+static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q)
+{
+	LIST_HEAD(hctx_list);
+
+	spin_lock(&q->unused_hctx_lock);
+	list_splice_init(&q->unused_hctx_list, &hctx_list);
+	spin_unlock(&q->unused_hctx_lock);
+
+	mutex_lock(&blk_mq_cpuhp_lock);
+	__blk_mq_remove_cpuhp_list(&hctx_list);
+	mutex_unlock(&blk_mq_cpuhp_lock);
+
+	spin_lock(&q->unused_hctx_lock);
+	list_splice(&hctx_list, &q->unused_hctx_list);
+	spin_unlock(&q->unused_hctx_lock);
+}
+
+/*
+ * Register cpuhp callbacks from all hw queues
+ *
+ * Safe to call if this `request_queue` is live
+ */
+static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+
+	mutex_lock(&blk_mq_cpuhp_lock);
+	queue_for_each_hw_ctx(q, hctx, i)
+		__blk_mq_add_cpuhp(hctx);
+	mutex_unlock(&blk_mq_cpuhp_lock);
 }
 
 /*
@@ -3875,8 +3796,6 @@
 	if (set->ops->exit_hctx)
 		set->ops->exit_hctx(hctx, hctx_idx);
 
-	blk_mq_remove_cpuhp(hctx);
-
 	xa_erase(&q->hctx_table, hctx_idx);
 
 	spin_lock(&q->unused_hctx_lock);
@@ -3891,6 +3814,7 @@
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (i == nr_queue)
 			break;
+		blk_mq_remove_cpuhp(hctx);
 		blk_mq_exit_hctx(q, set, hctx, i);
 	}
 }
@@ -3914,11 +3836,6 @@
 
 	if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
 		goto exit_flush_rq;
-
-	if (!(hctx->flags & BLK_MQ_F_STACKING))
-		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
-				&hctx->cpuhp_online);
-	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
 	return 0;
 
@@ -3949,5 +3876,7 @@
 	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
+	INIT_HLIST_NODE(&hctx->cpuhp_dead);
+	INIT_HLIST_NODE(&hctx->cpuhp_online);
 	hctx->queue = q;
 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
@@ -4489,6 +4414,12 @@
 	xa_for_each_start(&q->hctx_table, j, hctx, j)
 		blk_mq_exit_hctx(q, set, hctx, j);
 	mutex_unlock(&q->sysfs_lock);
+
+	/* unregister cpuhp callbacks for exited hctxs */
+	blk_mq_remove_hw_queues_cpuhp(q);
+
+	/* register cpuhp for new initialized hctxs */
+	blk_mq_add_hw_queues_cpuhp(q);
 }
 
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,