Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm: optimize dm_mq_queue_rq to _not_ use kthread if using pure blk-mq

dm_mq_queue_rq() is in atomic context so care must be taken to not
sleep -- as such GFP_ATOMIC is used for the md->bs bioset allocations
and dm-mpath's call to blk_get_request(). In the future the bioset
allocations will hopefully go away (by removing support for partial
completions of bios in a cloned request).

Also prepare for supporting DM blk-mq on top of old-style request_fn
device(s) if a new dm-mod 'use_blk_mq' parameter is set. The kthread
will still be used to queue work if blk-mq is used on top of old-style
request_fn device(s).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>

+50 -16
+1 -1
drivers/md/dm-mpath.c
··· 428 428 } else { 429 429 /* blk-mq request-based interface */ 430 430 *__clone = blk_get_request(bdev_get_queue(bdev), 431 - rq_data_dir(rq), GFP_KERNEL); 431 + rq_data_dir(rq), GFP_ATOMIC); 432 432 if (IS_ERR(*__clone)) 433 433 /* ENOMEM, requeue */ 434 434 return r;
+49 -15
drivers/md/dm.c
··· 1077 1077 1078 1078 blk_rq_unprep_clone(clone); 1079 1079 1080 - if (clone->q && clone->q->mq_ops) 1080 + if (clone->q->mq_ops) 1081 1081 tio->ti->type->release_clone_rq(clone); 1082 - else 1082 + else if (!md->queue->mq_ops) 1083 + /* request_fn queue stacked on request_fn queue(s) */ 1083 1084 free_clone_request(md, clone); 1084 1085 1085 1086 if (!md->queue->mq_ops) ··· 1839 1838 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1840 1839 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1841 1840 { 1842 - struct request *clone = alloc_clone_request(md, gfp_mask); 1841 + /* 1842 + * Do not allocate a clone if tio->clone was already set 1843 + * (see: dm_mq_queue_rq). 1844 + */ 1845 + bool alloc_clone = !tio->clone; 1846 + struct request *clone; 1843 1847 1844 - if (!clone) 1845 - return NULL; 1848 + if (alloc_clone) { 1849 + clone = alloc_clone_request(md, gfp_mask); 1850 + if (!clone) 1851 + return NULL; 1852 + } else 1853 + clone = tio->clone; 1846 1854 1847 1855 blk_rq_init(NULL, clone); 1848 1856 if (setup_clone(clone, rq, tio, gfp_mask)) { 1849 1857 /* -ENOMEM */ 1850 - free_clone_request(md, clone); 1858 + if (alloc_clone) 1859 + free_clone_request(md, clone); 1851 1860 return NULL; 1852 1861 } 1853 1862 ··· 1875 1864 tio->orig = rq; 1876 1865 tio->error = 0; 1877 1866 memset(&tio->info, 0, sizeof(tio->info)); 1878 - init_kthread_work(&tio->work, map_tio_request); 1867 + if (md->kworker_task) 1868 + init_kthread_work(&tio->work, map_tio_request); 1879 1869 } 1880 1870 1881 1871 static struct dm_rq_target_io *prep_tio(struct request *rq, ··· 1953 1941 } 1954 1942 if (IS_ERR(clone)) 1955 1943 return DM_MAPIO_REQUEUE; 1956 - if (setup_clone(clone, rq, tio, GFP_NOIO)) { 1944 + if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { 1957 1945 /* -ENOMEM */ 1958 1946 ti->type->release_clone_rq(clone); 1959 1947 return DM_MAPIO_REQUEUE; ··· 2420 2408 p->bs = NULL; 2421 2409 2422 2410 out: 2423 - /* mempool bind completed, now no need any 
mempools in the table */ 2411 + /* mempool bind completed, no longer need any mempools in the table */ 2424 2412 dm_table_free_md_mempools(t); 2425 2413 } 2426 2414 ··· 2725 2713 /* Init tio using md established in .init_request */ 2726 2714 init_tio(tio, rq, md); 2727 2715 2728 - /* Establish tio->ti before queuing work (map_tio_request) */ 2716 + /* 2717 + * Establish tio->ti before queuing work (map_tio_request) 2718 + * or making direct call to map_request(). 2719 + */ 2729 2720 tio->ti = ti; 2730 - queue_kthread_work(&md->kworker, &tio->work); 2721 + 2722 + /* Clone the request if underlying devices aren't blk-mq */ 2723 + if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { 2724 + /* clone request is allocated at the end of the pdu */ 2725 + tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); 2726 + if (!clone_rq(rq, md, tio, GFP_ATOMIC)) 2727 + return BLK_MQ_RQ_QUEUE_BUSY; 2728 + queue_kthread_work(&md->kworker, &tio->work); 2729 + } else { 2730 + /* Direct call is fine since .queue_rq allows allocations */ 2731 + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) 2732 + dm_requeue_unmapped_original_request(md, rq); 2733 + } 2731 2734 2732 2735 return BLK_MQ_RQ_QUEUE_OK; 2733 2736 } ··· 2756 2729 2757 2730 static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) 2758 2731 { 2732 + unsigned md_type = dm_get_md_type(md); 2759 2733 struct request_queue *q; 2760 2734 int err; 2761 2735 ··· 2766 2738 md->tag_set.numa_node = NUMA_NO_NODE; 2767 2739 md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 2768 2740 md->tag_set.nr_hw_queues = 1; 2769 - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); 2741 + if (md_type == DM_TYPE_REQUEST_BASED) { 2742 + /* make the memory for non-blk-mq clone part of the pdu */ 2743 + md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request); 2744 + } else 2745 + md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); 2770 2746 md->tag_set.driver_data = md; 2771 
2747 2772 2748 err = blk_mq_alloc_tag_set(&md->tag_set); ··· 2788 2756 /* backfill 'mq' sysfs registration normally done in blk_register_queue */ 2789 2757 blk_mq_register_disk(md->disk); 2790 2758 2791 - init_rq_based_worker_thread(md); 2759 + if (md_type == DM_TYPE_REQUEST_BASED) 2760 + init_rq_based_worker_thread(md); 2792 2761 2793 2762 return 0; 2794 2763 ··· 2909 2876 set_bit(DMF_FREEING, &md->flags); 2910 2877 spin_unlock(&_minor_lock); 2911 2878 2912 - if (dm_request_based(md)) 2879 + if (dm_request_based(md) && md->kworker_task) 2913 2880 flush_kthread_worker(&md->kworker); 2914 2881 2915 2882 /* ··· 3163 3130 */ 3164 3131 if (dm_request_based(md)) { 3165 3132 stop_queue(md->queue); 3166 - flush_kthread_worker(&md->kworker); 3133 + if (md->kworker_task) 3134 + flush_kthread_worker(&md->kworker); 3167 3135 } 3168 3136 3169 3137 flush_workqueue(md->wq);