Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block layer fixes from Jens Axboe:
"A collection of fixes for this merge window, either fixes for existing
issues, or parts that were waiting for acks to come in. This pull
request contains:

- Allocation of nvme queues on the right node from Shaohua.

This was ready long before the merge window, but waiting on an ack
from Bjorn on the PCI bit. Now that we have that, the three patches
can go in.

- Two fixes for blk-mq-sched with nvmeof, which uses hctx specific
request allocations. This caused an oops. One part from Sagi, one
part from Omar.

- A loop partition scan deadlock fix from Omar, fixing a regression
in this merge window.

- A three-patch series from Keith, closing up a hole on clearing out
requests on shutdown/resume.

- A stable fix for nbd from Josef, fixing a leak of sockets.

- Two fixes for a regression in this window from Jan, fixing a
problem with one of his earlier patches dealing with queue vs bdi
lifetimes.

- A fix for a regression with virtio-blk, causing an IO stall if
scheduling is used. From me.

- A fix for an io context lock ordering problem. From me"

* 'for-linus' of git://git.kernel.dk/linux-block:
block: Move bdi_unregister() to del_gendisk()
blk-mq: ensure that bd->last is always set correctly
block: don't call ioc_exit_icq() with the queue lock held for blk-mq
block: Initialize bd_bdi on inode initialization
loop: fix LO_FLAGS_PARTSCAN hang
nvme: Complete all stuck requests
blk-mq: Provide freeze queue timeout
blk-mq: Export blk_mq_freeze_queue_wait
nbd: stop leaking sockets
blk-mq: move update of tags->rqs to __blk_mq_alloc_request()
blk-mq: kill blk_mq_set_alloc_data()
blk-mq: make blk_mq_alloc_request_hctx() allocate a scheduler request
blk-mq-sched: Allocate sched reserved tags as specified in the original queue tagset
nvme: allocate nvme_queue in correct node
PCI: add an API to get node from vector
blk-mq: allocate blk_mq_tags and requests in correct node

+265 -89
-1
block/blk-core.c
··· 578 578 q->queue_lock = &q->__queue_lock; 579 579 spin_unlock_irq(lock); 580 580 581 - bdi_unregister(q->backing_dev_info); 582 581 put_disk_devt(q->disk_devt); 583 582 584 583 /* @q is and will stay empty, shutdown and put */
+31 -13
block/blk-ioc.c
··· 37 37 } 38 38 39 39 /* 40 - * Exit an icq. Called with both ioc and q locked for sq, only ioc locked for 41 - * mq. 40 + * Exit an icq. Called with ioc locked for blk-mq, and with both ioc 41 + * and queue locked for legacy. 42 42 */ 43 43 static void ioc_exit_icq(struct io_cq *icq) 44 44 { ··· 55 55 icq->flags |= ICQ_EXITED; 56 56 } 57 57 58 - /* Release an icq. Called with both ioc and q locked. */ 58 + /* 59 + * Release an icq. Called with ioc locked for blk-mq, and with both ioc 60 + * and queue locked for legacy. 61 + */ 59 62 static void ioc_destroy_icq(struct io_cq *icq) 60 63 { 61 64 struct io_context *ioc = icq->ioc; ··· 66 63 struct elevator_type *et = q->elevator->type; 67 64 68 65 lockdep_assert_held(&ioc->lock); 69 - lockdep_assert_held(q->queue_lock); 70 66 71 67 radix_tree_delete(&ioc->icq_tree, icq->q->id); 72 68 hlist_del_init(&icq->ioc_node); ··· 225 223 put_io_context_active(ioc); 226 224 } 227 225 226 + static void __ioc_clear_queue(struct list_head *icq_list) 227 + { 228 + unsigned long flags; 229 + 230 + while (!list_empty(icq_list)) { 231 + struct io_cq *icq = list_entry(icq_list->next, 232 + struct io_cq, q_node); 233 + struct io_context *ioc = icq->ioc; 234 + 235 + spin_lock_irqsave(&ioc->lock, flags); 236 + ioc_destroy_icq(icq); 237 + spin_unlock_irqrestore(&ioc->lock, flags); 238 + } 239 + } 240 + 228 241 /** 229 242 * ioc_clear_queue - break any ioc association with the specified queue 230 243 * @q: request_queue being cleared 231 244 * 232 - * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked. 245 + * Walk @q->icq_list and exit all io_cq's. 
233 246 */ 234 247 void ioc_clear_queue(struct request_queue *q) 235 248 { 236 - lockdep_assert_held(q->queue_lock); 249 + LIST_HEAD(icq_list); 237 250 238 - while (!list_empty(&q->icq_list)) { 239 - struct io_cq *icq = list_entry(q->icq_list.next, 240 - struct io_cq, q_node); 241 - struct io_context *ioc = icq->ioc; 251 + spin_lock_irq(q->queue_lock); 252 + list_splice_init(&q->icq_list, &icq_list); 242 253 243 - spin_lock(&ioc->lock); 244 - ioc_destroy_icq(icq); 245 - spin_unlock(&ioc->lock); 254 + if (q->mq_ops) { 255 + spin_unlock_irq(q->queue_lock); 256 + __ioc_clear_queue(&icq_list); 257 + } else { 258 + __ioc_clear_queue(&icq_list); 259 + spin_unlock_irq(q->queue_lock); 246 260 } 247 261 } 248 262
+7 -9
block/blk-mq-sched.c
··· 110 110 struct blk_mq_alloc_data *data) 111 111 { 112 112 struct elevator_queue *e = q->elevator; 113 - struct blk_mq_hw_ctx *hctx; 114 - struct blk_mq_ctx *ctx; 115 113 struct request *rq; 116 114 117 115 blk_queue_enter_live(q); 118 - ctx = blk_mq_get_ctx(q); 119 - hctx = blk_mq_map_queue(q, ctx->cpu); 120 - 121 - blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx); 116 + data->q = q; 117 + if (likely(!data->ctx)) 118 + data->ctx = blk_mq_get_ctx(q); 119 + if (likely(!data->hctx)) 120 + data->hctx = blk_mq_map_queue(q, data->ctx->cpu); 122 121 123 122 if (e) { 124 123 data->flags |= BLK_MQ_REQ_INTERNAL; ··· 134 135 rq = __blk_mq_alloc_request(data, op); 135 136 } else { 136 137 rq = __blk_mq_alloc_request(data, op); 137 - if (rq) 138 - data->hctx->tags->rqs[rq->tag] = rq; 139 138 } 140 139 141 140 if (rq) { ··· 451 454 */ 452 455 ret = 0; 453 456 queue_for_each_hw_ctx(q, hctx, i) { 454 - hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0); 457 + hctx->sched_tags = blk_mq_alloc_rq_map(set, i, 458 + q->nr_requests, set->reserved_tags); 455 459 if (!hctx->sched_tags) { 456 460 ret = -ENOMEM; 457 461 break;
+1 -1
block/blk-mq-tag.c
··· 181 181 void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, 182 182 struct blk_mq_ctx *ctx, unsigned int tag) 183 183 { 184 - if (tag >= tags->nr_reserved_tags) { 184 + if (!blk_mq_tag_is_reserved(tags, tag)) { 185 185 const int real_tag = tag - tags->nr_reserved_tags; 186 186 187 187 BUG_ON(real_tag >= tags->nr_tags);
+6
block/blk-mq-tag.h
··· 85 85 hctx->tags->rqs[tag] = rq; 86 86 } 87 87 88 + static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, 89 + unsigned int tag) 90 + { 91 + return tag < tags->nr_reserved_tags; 92 + } 93 + 88 94 #endif
+88 -32
block/blk-mq.c
··· 77 77 } 78 78 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); 79 79 80 - static void blk_mq_freeze_queue_wait(struct request_queue *q) 80 + void blk_mq_freeze_queue_wait(struct request_queue *q) 81 81 { 82 82 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 83 83 } 84 + EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); 85 + 86 + int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 87 + unsigned long timeout) 88 + { 89 + return wait_event_timeout(q->mq_freeze_wq, 90 + percpu_ref_is_zero(&q->q_usage_counter), 91 + timeout); 92 + } 93 + EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); 84 94 85 95 /* 86 96 * Guarantee no request is in use, so we can change any data structure of ··· 246 236 } 247 237 rq->tag = tag; 248 238 rq->internal_tag = -1; 239 + data->hctx->tags->rqs[rq->tag] = rq; 249 240 } 250 241 251 242 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); ··· 286 275 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, 287 276 unsigned int flags, unsigned int hctx_idx) 288 277 { 289 - struct blk_mq_hw_ctx *hctx; 290 - struct blk_mq_ctx *ctx; 278 + struct blk_mq_alloc_data alloc_data = { .flags = flags }; 291 279 struct request *rq; 292 - struct blk_mq_alloc_data alloc_data; 280 + unsigned int cpu; 293 281 int ret; 294 282 295 283 /* ··· 311 301 * Check if the hardware context is actually mapped to anything. 312 302 * If not tell the caller that it should skip this queue. 
313 303 */ 314 - hctx = q->queue_hw_ctx[hctx_idx]; 315 - if (!blk_mq_hw_queue_mapped(hctx)) { 316 - ret = -EXDEV; 317 - goto out_queue_exit; 304 + alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; 305 + if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { 306 + blk_queue_exit(q); 307 + return ERR_PTR(-EXDEV); 318 308 } 319 - ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); 309 + cpu = cpumask_first(alloc_data.hctx->cpumask); 310 + alloc_data.ctx = __blk_mq_get_ctx(q, cpu); 320 311 321 - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); 322 - rq = __blk_mq_alloc_request(&alloc_data, rw); 323 - if (!rq) { 324 - ret = -EWOULDBLOCK; 325 - goto out_queue_exit; 326 - } 312 + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); 313 + 314 + blk_mq_put_ctx(alloc_data.ctx); 315 + blk_queue_exit(q); 316 + 317 + if (!rq) 318 + return ERR_PTR(-EWOULDBLOCK); 327 319 328 320 return rq; 329 - 330 - out_queue_exit: 331 - blk_queue_exit(q); 332 - return ERR_PTR(ret); 333 321 } 334 322 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 335 323 ··· 862 854 return true; 863 855 } 864 856 857 + if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) 858 + data.flags |= BLK_MQ_REQ_RESERVED; 859 + 865 860 rq->tag = blk_mq_get_tag(&data); 866 861 if (rq->tag >= 0) { 867 862 if (blk_mq_tag_busy(data.hctx)) { ··· 878 867 return false; 879 868 } 880 869 881 - static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, 882 - struct request *rq) 870 + static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, 871 + struct request *rq) 883 872 { 884 - if (rq->tag == -1 || rq->internal_tag == -1) 885 - return; 886 - 887 873 blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); 888 874 rq->tag = -1; 889 875 ··· 888 880 rq->rq_flags &= ~RQF_MQ_INFLIGHT; 889 881 atomic_dec(&hctx->nr_active); 890 882 } 883 + } 884 + 885 + static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, 886 + struct request *rq) 887 + { 888 + if (rq->tag == -1 || rq->internal_tag == -1) 889 
+ return; 890 + 891 + __blk_mq_put_driver_tag(hctx, rq); 892 + } 893 + 894 + static void blk_mq_put_driver_tag(struct request *rq) 895 + { 896 + struct blk_mq_hw_ctx *hctx; 897 + 898 + if (rq->tag == -1 || rq->internal_tag == -1) 899 + return; 900 + 901 + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); 902 + __blk_mq_put_driver_tag(hctx, rq); 891 903 } 892 904 893 905 /* ··· 1019 991 1020 992 bd.rq = rq; 1021 993 bd.list = dptr; 1022 - bd.last = list_empty(list); 994 + 995 + /* 996 + * Flag last if we have no more requests, or if we have more 997 + * but can't assign a driver tag to it. 998 + */ 999 + if (list_empty(list)) 1000 + bd.last = true; 1001 + else { 1002 + struct request *nxt; 1003 + 1004 + nxt = list_first_entry(list, struct request, queuelist); 1005 + bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); 1006 + } 1023 1007 1024 1008 ret = q->mq_ops->queue_rq(hctx, &bd); 1025 1009 switch (ret) { ··· 1039 999 queued++; 1040 1000 break; 1041 1001 case BLK_MQ_RQ_QUEUE_BUSY: 1042 - blk_mq_put_driver_tag(hctx, rq); 1002 + blk_mq_put_driver_tag_hctx(hctx, rq); 1043 1003 list_add(&rq->queuelist, list); 1044 1004 __blk_mq_requeue_request(rq); 1045 1005 break; ··· 1069 1029 * that is where we will continue on next queue run. 1070 1030 */ 1071 1031 if (!list_empty(list)) { 1032 + /* 1033 + * If we got a driver tag for the next request already, 1034 + * free it again. 
1035 + */ 1036 + rq = list_first_entry(list, struct request, queuelist); 1037 + blk_mq_put_driver_tag(rq); 1038 + 1072 1039 spin_lock(&hctx->lock); 1073 1040 list_splice_init(list, &hctx->dispatch); 1074 1041 spin_unlock(&hctx->lock); ··· 1762 1715 unsigned int reserved_tags) 1763 1716 { 1764 1717 struct blk_mq_tags *tags; 1718 + int node; 1765 1719 1766 - tags = blk_mq_init_tags(nr_tags, reserved_tags, 1767 - set->numa_node, 1720 + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 1721 + if (node == NUMA_NO_NODE) 1722 + node = set->numa_node; 1723 + 1724 + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, 1768 1725 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 1769 1726 if (!tags) 1770 1727 return NULL; 1771 1728 1772 1729 tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), 1773 1730 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 1774 - set->numa_node); 1731 + node); 1775 1732 if (!tags->rqs) { 1776 1733 blk_mq_free_tags(tags); 1777 1734 return NULL; ··· 1783 1732 1784 1733 tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), 1785 1734 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 1786 - set->numa_node); 1735 + node); 1787 1736 if (!tags->static_rqs) { 1788 1737 kfree(tags->rqs); 1789 1738 blk_mq_free_tags(tags); ··· 1803 1752 { 1804 1753 unsigned int i, j, entries_per_page, max_order = 4; 1805 1754 size_t rq_size, left; 1755 + int node; 1756 + 1757 + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 1758 + if (node == NUMA_NO_NODE) 1759 + node = set->numa_node; 1806 1760 1807 1761 INIT_LIST_HEAD(&tags->page_list); 1808 1762 ··· 1829 1773 this_order--; 1830 1774 1831 1775 do { 1832 - page = alloc_pages_node(set->numa_node, 1776 + page = alloc_pages_node(node, 1833 1777 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 1834 1778 this_order); 1835 1779 if (page) ··· 1862 1806 if (set->ops->init_request) { 1863 1807 if (set->ops->init_request(set->driver_data, 1864 1808 rq, hctx_idx, i, 1865 - set->numa_node)) { 1809 + node)) { 1866 1810 
tags->static_rqs[i] = NULL; 1867 1811 goto fail; 1868 1812 }
-10
block/blk-mq.h
··· 146 146 struct blk_mq_hw_ctx *hctx; 147 147 }; 148 148 149 - static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, 150 - struct request_queue *q, unsigned int flags, 151 - struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) 152 - { 153 - data->q = q; 154 - data->flags = flags; 155 - data->ctx = ctx; 156 - data->hctx = hctx; 157 - } 158 - 159 149 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) 160 150 { 161 151 if (data->flags & BLK_MQ_REQ_INTERNAL)
-2
block/blk-sysfs.c
··· 815 815 blkcg_exit_queue(q); 816 816 817 817 if (q->elevator) { 818 - spin_lock_irq(q->queue_lock); 819 818 ioc_clear_queue(q); 820 - spin_unlock_irq(q->queue_lock); 821 819 elevator_exit(q->elevator); 822 820 } 823 821
-2
block/elevator.c
··· 983 983 if (old_registered) 984 984 elv_unregister_queue(q); 985 985 986 - spin_lock_irq(q->queue_lock); 987 986 ioc_clear_queue(q); 988 - spin_unlock_irq(q->queue_lock); 989 987 } 990 988 991 989 /* allocate, init and register new elevator */
+5
block/genhd.c
··· 681 681 disk->flags &= ~GENHD_FL_UP; 682 682 683 683 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); 684 + /* 685 + * Unregister bdi before releasing device numbers (as they can get 686 + * reused and we'd get clashes in sysfs). 687 + */ 688 + bdi_unregister(disk->queue->backing_dev_info); 684 689 blk_unregister_queue(disk); 685 690 blk_unregister_region(disk_devt(disk), disk->minors); 686 691
+8 -7
drivers/block/loop.c
··· 1142 1142 (info->lo_flags & LO_FLAGS_AUTOCLEAR)) 1143 1143 lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; 1144 1144 1145 - if ((info->lo_flags & LO_FLAGS_PARTSCAN) && 1146 - !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { 1147 - lo->lo_flags |= LO_FLAGS_PARTSCAN; 1148 - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; 1149 - loop_reread_partitions(lo, lo->lo_device); 1150 - } 1151 - 1152 1145 lo->lo_encrypt_key_size = info->lo_encrypt_key_size; 1153 1146 lo->lo_init[0] = info->lo_init[0]; 1154 1147 lo->lo_init[1] = info->lo_init[1]; ··· 1156 1163 1157 1164 exit: 1158 1165 blk_mq_unfreeze_queue(lo->lo_queue); 1166 + 1167 + if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && 1168 + !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { 1169 + lo->lo_flags |= LO_FLAGS_PARTSCAN; 1170 + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; 1171 + loop_reread_partitions(lo, lo->lo_device); 1172 + } 1173 + 1159 1174 return err; 1160 1175 } 1161 1176
+3 -1
drivers/block/nbd.c
··· 675 675 nbd->num_connections) { 676 676 int i; 677 677 678 - for (i = 0; i < nbd->num_connections; i++) 678 + for (i = 0; i < nbd->num_connections; i++) { 679 + sockfd_put(nbd->socks[i]->sock); 679 680 kfree(nbd->socks[i]); 681 + } 680 682 kfree(nbd->socks); 681 683 nbd->socks = NULL; 682 684 nbd->num_connections = 0;
+47
drivers/nvme/host/core.c
··· 2344 2344 } 2345 2345 EXPORT_SYMBOL_GPL(nvme_kill_queues); 2346 2346 2347 + void nvme_unfreeze(struct nvme_ctrl *ctrl) 2348 + { 2349 + struct nvme_ns *ns; 2350 + 2351 + mutex_lock(&ctrl->namespaces_mutex); 2352 + list_for_each_entry(ns, &ctrl->namespaces, list) 2353 + blk_mq_unfreeze_queue(ns->queue); 2354 + mutex_unlock(&ctrl->namespaces_mutex); 2355 + } 2356 + EXPORT_SYMBOL_GPL(nvme_unfreeze); 2357 + 2358 + void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) 2359 + { 2360 + struct nvme_ns *ns; 2361 + 2362 + mutex_lock(&ctrl->namespaces_mutex); 2363 + list_for_each_entry(ns, &ctrl->namespaces, list) { 2364 + timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); 2365 + if (timeout <= 0) 2366 + break; 2367 + } 2368 + mutex_unlock(&ctrl->namespaces_mutex); 2369 + } 2370 + EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); 2371 + 2372 + void nvme_wait_freeze(struct nvme_ctrl *ctrl) 2373 + { 2374 + struct nvme_ns *ns; 2375 + 2376 + mutex_lock(&ctrl->namespaces_mutex); 2377 + list_for_each_entry(ns, &ctrl->namespaces, list) 2378 + blk_mq_freeze_queue_wait(ns->queue); 2379 + mutex_unlock(&ctrl->namespaces_mutex); 2380 + } 2381 + EXPORT_SYMBOL_GPL(nvme_wait_freeze); 2382 + 2383 + void nvme_start_freeze(struct nvme_ctrl *ctrl) 2384 + { 2385 + struct nvme_ns *ns; 2386 + 2387 + mutex_lock(&ctrl->namespaces_mutex); 2388 + list_for_each_entry(ns, &ctrl->namespaces, list) 2389 + blk_mq_freeze_queue_start(ns->queue); 2390 + mutex_unlock(&ctrl->namespaces_mutex); 2391 + } 2392 + EXPORT_SYMBOL_GPL(nvme_start_freeze); 2393 + 2347 2394 void nvme_stop_queues(struct nvme_ctrl *ctrl) 2348 2395 { 2349 2396 struct nvme_ns *ns;
+4
drivers/nvme/host/nvme.h
··· 294 294 void nvme_stop_queues(struct nvme_ctrl *ctrl); 295 295 void nvme_start_queues(struct nvme_ctrl *ctrl); 296 296 void nvme_kill_queues(struct nvme_ctrl *ctrl); 297 + void nvme_unfreeze(struct nvme_ctrl *ctrl); 298 + void nvme_wait_freeze(struct nvme_ctrl *ctrl); 299 + void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); 300 + void nvme_start_freeze(struct nvme_ctrl *ctrl); 297 301 298 302 #define NVME_QID_ANY -1 299 303 struct request *nvme_alloc_request(struct request_queue *q,
+36 -9
drivers/nvme/host/pci.c
··· 1038 1038 } 1039 1039 1040 1040 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 1041 - int depth) 1041 + int depth, int node) 1042 1042 { 1043 - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); 1043 + struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, 1044 + node); 1044 1045 if (!nvmeq) 1045 1046 return NULL; 1046 1047 ··· 1218 1217 1219 1218 nvmeq = dev->queues[0]; 1220 1219 if (!nvmeq) { 1221 - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); 1220 + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 1221 + dev_to_node(dev->dev)); 1222 1222 if (!nvmeq) 1223 1223 return -ENOMEM; 1224 1224 } ··· 1311 1309 int ret = 0; 1312 1310 1313 1311 for (i = dev->queue_count; i <= dev->max_qid; i++) { 1314 - if (!nvme_alloc_queue(dev, i, dev->q_depth)) { 1312 + /* vector == qid - 1, match nvme_create_queue */ 1313 + if (!nvme_alloc_queue(dev, i, dev->q_depth, 1314 + pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { 1315 1315 ret = -ENOMEM; 1316 1316 break; 1317 1317 } ··· 1675 1671 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) 1676 1672 { 1677 1673 int i, queues; 1678 - u32 csts = -1; 1674 + bool dead = true; 1675 + struct pci_dev *pdev = to_pci_dev(dev->dev); 1679 1676 1680 1677 del_timer_sync(&dev->watchdog_timer); 1681 1678 1682 1679 mutex_lock(&dev->shutdown_lock); 1683 - if (pci_is_enabled(to_pci_dev(dev->dev))) { 1684 - nvme_stop_queues(&dev->ctrl); 1685 - csts = readl(dev->bar + NVME_REG_CSTS); 1680 + if (pci_is_enabled(pdev)) { 1681 + u32 csts = readl(dev->bar + NVME_REG_CSTS); 1682 + 1683 + if (dev->ctrl.state == NVME_CTRL_LIVE) 1684 + nvme_start_freeze(&dev->ctrl); 1685 + dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || 1686 + pdev->error_state != pci_channel_io_normal); 1686 1687 } 1688 + 1689 + /* 1690 + * Give the controller a chance to complete all entered requests if 1691 + * doing a safe shutdown. 
1692 + */ 1693 + if (!dead && shutdown) 1694 + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); 1695 + nvme_stop_queues(&dev->ctrl); 1687 1696 1688 1697 queues = dev->online_queues - 1; 1689 1698 for (i = dev->queue_count - 1; i > 0; i--) 1690 1699 nvme_suspend_queue(dev->queues[i]); 1691 1700 1692 - if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { 1701 + if (dead) { 1693 1702 /* A device might become IO incapable very soon during 1694 1703 * probe, before the admin queue is configured. Thus, 1695 1704 * queue_count can be 0 here. ··· 1717 1700 1718 1701 blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); 1719 1702 blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); 1703 + 1704 + /* 1705 + * The driver will not be starting up queues again if shutting down so 1706 + * must flush all entered requests to their failed completion to avoid 1707 + * deadlocking blk-mq hot-cpu notifier. 1708 + */ 1709 + if (shutdown) 1710 + nvme_start_queues(&dev->ctrl); 1720 1711 mutex_unlock(&dev->shutdown_lock); 1721 1712 } 1722 1713 ··· 1847 1822 nvme_remove_namespaces(&dev->ctrl); 1848 1823 } else { 1849 1824 nvme_start_queues(&dev->ctrl); 1825 + nvme_wait_freeze(&dev->ctrl); 1850 1826 nvme_dev_add(dev); 1827 + nvme_unfreeze(&dev->ctrl); 1851 1828 } 1852 1829 1853 1830 if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
+16
drivers/pci/msi.c
··· 1298 1298 } 1299 1299 EXPORT_SYMBOL(pci_irq_get_affinity); 1300 1300 1301 + /** 1302 + * pci_irq_get_node - return the numa node of a particular msi vector 1303 + * @pdev: PCI device to operate on 1304 + * @vec: device-relative interrupt vector index (0-based). 1305 + */ 1306 + int pci_irq_get_node(struct pci_dev *pdev, int vec) 1307 + { 1308 + const struct cpumask *mask; 1309 + 1310 + mask = pci_irq_get_affinity(pdev, vec); 1311 + if (mask) 1312 + return local_memory_node(cpu_to_node(cpumask_first(mask))); 1313 + return dev_to_node(&pdev->dev); 1314 + } 1315 + EXPORT_SYMBOL(pci_irq_get_node); 1316 + 1301 1317 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) 1302 1318 { 1303 1319 return to_pci_dev(desc->dev);
+4 -2
fs/block_dev.c
··· 870 870 #ifdef CONFIG_SYSFS 871 871 INIT_LIST_HEAD(&bdev->bd_holder_disks); 872 872 #endif 873 + bdev->bd_bdi = &noop_backing_dev_info; 873 874 inode_init_once(&ei->vfs_inode); 874 875 /* Initialize mutex for freeze. */ 875 876 mutex_init(&bdev->bd_fsfreeze_mutex); ··· 885 884 spin_lock(&bdev_lock); 886 885 list_del_init(&bdev->bd_list); 887 886 spin_unlock(&bdev_lock); 888 - if (bdev->bd_bdi != &noop_backing_dev_info) 887 + if (bdev->bd_bdi != &noop_backing_dev_info) { 889 888 bdi_put(bdev->bd_bdi); 889 + bdev->bd_bdi = &noop_backing_dev_info; 890 + } 890 891 } 891 892 892 893 static const struct super_operations bdev_sops = { ··· 991 988 bdev->bd_contains = NULL; 992 989 bdev->bd_super = NULL; 993 990 bdev->bd_inode = inode; 994 - bdev->bd_bdi = &noop_backing_dev_info; 995 991 bdev->bd_block_size = i_blocksize(inode); 996 992 bdev->bd_part_count = 0; 997 993 bdev->bd_invalidated = 0;
+3
include/linux/blk-mq.h
··· 245 245 void blk_mq_freeze_queue(struct request_queue *q); 246 246 void blk_mq_unfreeze_queue(struct request_queue *q); 247 247 void blk_mq_freeze_queue_start(struct request_queue *q); 248 + void blk_mq_freeze_queue_wait(struct request_queue *q); 249 + int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 250 + unsigned long timeout); 248 251 int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); 249 252 250 253 int blk_mq_map_queues(struct blk_mq_tag_set *set);
+6
include/linux/pci.h
··· 1323 1323 void pci_free_irq_vectors(struct pci_dev *dev); 1324 1324 int pci_irq_vector(struct pci_dev *dev, unsigned int nr); 1325 1325 const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec); 1326 + int pci_irq_get_node(struct pci_dev *pdev, int vec); 1326 1327 1327 1328 #else 1328 1329 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } ··· 1370 1369 int vec) 1371 1370 { 1372 1371 return cpu_possible_mask; 1372 + } 1373 + 1374 + static inline int pci_irq_get_node(struct pci_dev *pdev, int vec) 1375 + { 1376 + return first_online_node; 1373 1377 } 1374 1378 #endif 1375 1379