Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

block: Fix a race between the cgroup code and request queue initialization

Initialize the request queue lock earlier such that the following
race can no longer occur:

blk_init_queue_node()                blkcg_print_blkgs()
  blk_alloc_queue_node (1)
  q->queue_lock = &q->__queue_lock (2)
  blkcg_init_queue(q) (3)
                                     spin_lock_irq(blkg->q->queue_lock) (4)
  q->queue_lock = lock (5)
                                     spin_unlock_irq(blkg->q->queue_lock) (6)

(1) allocate an uninitialized queue;
(2) initialize queue_lock to its default internal lock;
(3) initialize blkcg part of request queue, which will create blkg and
then insert it to blkg_list;
(4) traverse blkg_list and find the created blkg, and then take its
queue lock, here it is the default *internal lock*;
(5) *race window*, now queue_lock is overridden with *driver specified
lock*;
(6) now unlock *driver specified lock*, not the locked *internal lock*,
unlock balance breaks.

The changes in this patch are as follows:
- Move the .queue_lock initialization from blk_init_queue_node() into
blk_alloc_queue_node().
- Only override the .queue_lock pointer for legacy queues because it
is not useful for blk-mq queues to override this pointer.
- For all block drivers that initialize .queue_lock explicitly,
change the blk_alloc_queue() call in the driver into a
blk_alloc_queue_node() call and remove the explicit .queue_lock
initialization. Additionally, initialize the spin lock that will
be used as queue lock earlier if necessary.

Reported-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Authored by Bart Van Assche and committed by Jens Axboe.

Commit 498f6650 (parent 5ee0524b)

+20 -14
+16 -8
block/blk-core.c
···
 888  888          kblockd_schedule_work(&q->timeout_work);
 889  889  }
 890  890
      891 +/**
      892 + * blk_alloc_queue_node - allocate a request queue
      893 + * @gfp_mask: memory allocation flags
      894 + * @node_id: NUMA node to allocate memory from
      895 + * @lock: For legacy queues, pointer to a spinlock that will be used to e.g.
      896 + *        serialize calls to the legacy .request_fn() callback. Ignored for
      897 + *        blk-mq request queues.
      898 + *
      899 + * Note: pass the queue lock as the third argument to this function instead of
      900 + * setting the queue lock pointer explicitly to avoid triggering a sporadic
      901 + * crash in the blkcg code. This function namely calls blkcg_init_queue() and
      902 + * the queue lock pointer must be set before blkcg_init_queue() is called.
      903 + */
 891  904  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 892  905                                             spinlock_t *lock)
 893  906  {
···
 953  940          mutex_init(&q->sysfs_lock);
 954  941          spin_lock_init(&q->__queue_lock);
 955  942
 956      -        /*
 957      -         * By default initialize queue_lock to internal lock and driver can
 958      -         * override it later if need be.
 959      -         */
 960      -        q->queue_lock = &q->__queue_lock;
      943 +        if (!q->mq_ops)
      944 +                q->queue_lock = lock ? : &q->__queue_lock;
 961  945
 962  946          /*
 963  947           * A queue starts its life with bypass turned on to avoid
···
1041 1031  {
1042 1032          struct request_queue *q;
1043 1033
1044      -        q = blk_alloc_queue_node(GFP_KERNEL, node_id, NULL);
     1034 +        q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock);
1045 1035          if (!q)
1046 1036                  return NULL;
1047 1037
1048 1038          q->request_fn = rfn;
1049      -        if (lock)
1050      -                q->queue_lock = lock;
1051 1039          if (blk_init_allocated_queue(q) < 0) {
1052 1040                  blk_cleanup_queue(q);
1053 1041                  return NULL;
+1 -2
drivers/block/drbd/drbd_main.c
···
2816 2816
2817 2817          drbd_init_set_defaults(device);
2818 2818
2819      -        q = blk_alloc_queue(GFP_KERNEL);
     2819 +        q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock);
2820 2820          if (!q)
2821 2821                  goto out_no_q;
2822 2822          device->rq_queue = q;
···
2848 2848          /* Setting the max_hw_sectors to an odd value of 8kibyte here
2849 2849             This triggers a max_bio_size message upon first attach or connect */
2850 2850          blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
2851      -        q->queue_lock = &resource->req_lock;
2852 2851
2853 2852          device->md_io.page = alloc_page(GFP_KERNEL);
2854 2853          if (!device->md_io.page)
+3 -4
drivers/block/umem.c
···
 888  888          card->Active = -1;      /* no page is active */
 889  889          card->bio = NULL;
 890  890          card->biotail = &card->bio;
      891 +        spin_lock_init(&card->lock);
 891  892
 892      -        card->queue = blk_alloc_queue(GFP_KERNEL);
      893 +        card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE,
      894 +                                           &card->lock);
 893  895          if (!card->queue)
 894  896                  goto failed_alloc;
 895  897
 896  898          blk_queue_make_request(card->queue, mm_make_request);
 897      -        card->queue->queue_lock = &card->lock;
 898  899          card->queue->queuedata = card;
 899  900
 900  901          tasklet_init(&card->tasklet, process_page, (unsigned long)card);
···
 968  967
 969  968          dev_printk(KERN_INFO, &card->dev->dev,
 970  969                     "Window size %d bytes, IRQ %d\n", data, dev->irq);
 971      -
 972      -        spin_lock_init(&card->lock);
 973  970
 974  971          pci_set_drvdata(dev, card);
 975  972