Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'for-4.9/block' of git://git.kernel.dk/linux-block

Pull block layer updates from Jens Axboe:
"This is the main pull request for block layer changes in 4.9.

As mentioned at the last merge window, I've changed things up and now
do just one branch for both core block layer changes and driver changes.
This avoids dependencies between the two branches. Outside of this
main pull request, there are two topical branches coming as well.

This pull request contains:

- A set of fixes for nbd, and a conversion of it to blk-mq. From Josef.

- Set of fixes and updates for lightnvm from Matias, Simon, and Arnd.
Followup dependency fix from Geert.

- General fixes from Bart, Baoyou, Guoqing, and Linus W.

- CFQ async write starvation fix from Glauber.

- Add support for delayed kick of the requeue list, from Mike.

- Pull out the scalable bitmap code from blk-mq-tag.c and make it
generally available under the name of sbitmap. Only blk-mq-tag uses
it for now, but the blk-mq scheduling bits will use it as well.
From Omar.

- bdev thaw error propagation from Pierre.

- Improve the blk polling statistics, and allow the user to clear
them. From Stephen.

- Set of minor cleanups from Christoph in block/blk-mq.

- Set of cleanups and optimizations from me for block/blk-mq.

- Various nvme/nvmet/nvmeof fixes from the various folks"

* 'for-4.9/block' of git://git.kernel.dk/linux-block: (54 commits)
fs/block_dev.c: return the right error in thaw_bdev()
nvme: Pass pointers, not dma addresses, to nvme_get/set_features()
nvme/scsi: Remove power management support
nvmet: Make dsm number of ranges zero based
nvmet: Use direct IO for writes
admin-cmd: Added smart-log command support.
nvme-fabrics: Add host_traddr options field to host infrastructure
nvme-fabrics: revise host transport option descriptions
nvme-fabrics: rework nvmf_get_address() for variable options
nbd: use BLK_MQ_F_BLOCKING
blkcg: Annotate blkg_hint correctly
cfq: fix starvation of asynchronous writes
blk-mq: add flag for drivers wanting blocking ->queue_rq()
blk-mq: remove non-blocking pass in blk_mq_map_request
blk-mq: get rid of manual run of queue with __blk_mq_run_hw_queue()
block: export bio_free_pages to other modules
lightnvm: propagate device_add() error code
lightnvm: expose device geometry through sysfs
lightnvm: control life of nvm_dev in driver
blk-mq: register device instead of disk
...

+1829 -1171
+2 -2
Documentation/block/biodoc.txt
··· 115 115 116 116 Various parameters that the generic i/o scheduler logic uses are set at 117 117 a per-queue level (e.g maximum request size, maximum number of segments in 118 - a scatter-gather list, hardsect size) 118 + a scatter-gather list, logical block size) 119 119 120 120 Some parameters that were earlier available as global arrays indexed by 121 121 major/minor are now directly associated with the queue. Some of these may ··· 156 156 blk_queue_max_segment_size(q, max_seg_size) 157 157 Maximum size of a clustered segment, 64kB default. 158 158 159 - blk_queue_hardsect_size(q, hardsect_size) 159 + blk_queue_logical_block_size(q, logical_block_size) 160 160 Lowest possible sector size that the hardware can operate 161 161 on, 512 bytes default. 162 162
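The biodoc update just renames the per-queue limit to match the helper drivers actually call. For reference, a minimal, hypothetical probe-time sketch of setting these limits (the function name and values are illustrative, not defaults):

    #include <linux/blkdev.h>

    /* Hypothetical limits setup for a driver whose device uses 4 KiB sectors. */
    static void example_set_queue_limits(struct request_queue *q)
    {
            blk_queue_logical_block_size(q, 4096);     /* smallest unit the hardware addresses */
            blk_queue_max_segment_size(q, 64 * 1024);  /* clustered segment size (illustrative) */
            blk_queue_max_segments(q, 128);            /* scatter-gather list length */
    }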
+1
MAINTAINERS
··· 2472 2472 S: Maintained 2473 2473 F: block/ 2474 2474 F: kernel/trace/blktrace.c 2475 + F: lib/sbitmap.c 2475 2476 2476 2477 BLOCK2MTD DRIVER 2477 2478 M: Joern Engel <joern@lazybastard.org>
+1
block/Kconfig
··· 4 4 menuconfig BLOCK 5 5 bool "Enable the block layer" if EXPERT 6 6 default y 7 + select SBITMAP 7 8 help 8 9 Provide block layer support for the kernel. 9 10
+3 -2
block/bio.c
··· 1068 1068 return 0; 1069 1069 } 1070 1070 1071 - static void bio_free_pages(struct bio *bio) 1071 + void bio_free_pages(struct bio *bio) 1072 1072 { 1073 1073 struct bio_vec *bvec; 1074 1074 int i; ··· 1076 1076 bio_for_each_segment_all(bvec, bio, i) 1077 1077 __free_page(bvec->bv_page); 1078 1078 } 1079 + EXPORT_SYMBOL(bio_free_pages); 1079 1080 1080 1081 /** 1081 1082 * bio_uncopy_user - finish previously mapped bio ··· 1275 1274 1276 1275 nr_pages += end - start; 1277 1276 /* 1278 - * buffer must be aligned to at least hardsector size for now 1277 + * buffer must be aligned to at least logical block size for now 1279 1278 */ 1280 1279 if (uaddr & queue_dma_alignment(q)) 1281 1280 return ERR_PTR(-EINVAL);
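bio_free_pages() walks a bio's bvecs and frees each page; now that it is exported, other modules can use it to tear down bios whose pages they allocated themselves instead of open-coding the loop. A minimal sketch of that pattern (the function names and completion flow here are hypothetical):

    #include <linux/bio.h>
    #include <linux/mm.h>

    /* Hypothetical completion handler for a bio whose pages we allocated below. */
    static void example_end_io(struct bio *bio)
    {
            bio_free_pages(bio);    /* newly exported: frees every bvec page */
            bio_put(bio);
    }

    /* Hypothetical setup: back a bio with freshly allocated pages. */
    static struct bio *example_alloc_bio(unsigned int nr_pages)
    {
            struct bio *bio = bio_alloc(GFP_KERNEL, nr_pages);
            unsigned int i;

            if (!bio)
                    return NULL;

            for (i = 0; i < nr_pages; i++) {
                    struct page *page = alloc_page(GFP_KERNEL);

                    if (!page || !bio_add_page(bio, page, PAGE_SIZE, 0)) {
                            if (page)
                                    __free_page(page);
                            bio_free_pages(bio);
                            bio_put(bio);
                            return NULL;
                    }
            }

            bio->bi_end_io = example_end_io;
            return bio;
    }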
+13 -3
block/blk-core.c
··· 288 288 int i; 289 289 290 290 queue_for_each_hw_ctx(q, hctx, i) { 291 - cancel_delayed_work_sync(&hctx->run_work); 291 + cancel_work_sync(&hctx->run_work); 292 292 cancel_delayed_work_sync(&hctx->delay_work); 293 293 } 294 294 } else { ··· 3097 3097 } 3098 3098 EXPORT_SYMBOL(kblockd_schedule_work); 3099 3099 3100 + int kblockd_schedule_work_on(int cpu, struct work_struct *work) 3101 + { 3102 + return queue_work_on(cpu, kblockd_workqueue, work); 3103 + } 3104 + EXPORT_SYMBOL(kblockd_schedule_work_on); 3105 + 3100 3106 int kblockd_schedule_delayed_work(struct delayed_work *dwork, 3101 3107 unsigned long delay) 3102 3108 { ··· 3307 3301 { 3308 3302 struct blk_plug *plug; 3309 3303 long state; 3304 + unsigned int queue_num; 3305 + struct blk_mq_hw_ctx *hctx; 3310 3306 3311 3307 if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || 3312 3308 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) 3313 3309 return false; 3310 + 3311 + queue_num = blk_qc_t_to_queue_num(cookie); 3312 + hctx = q->queue_hw_ctx[queue_num]; 3313 + hctx->poll_considered++; 3314 3314 3315 3315 plug = current->plug; 3316 3316 if (plug) ··· 3324 3312 3325 3313 state = current->state; 3326 3314 while (!need_resched()) { 3327 - unsigned int queue_num = blk_qc_t_to_queue_num(cookie); 3328 - struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; 3329 3315 int ret; 3330 3316 3331 3317 hctx->poll_invoked++;
+25 -15
block/blk-mq-sysfs.c
··· 176 176 177 177 static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) 178 178 { 179 - return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); 179 + return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n", 180 + hctx->poll_considered, hctx->poll_invoked, 181 + hctx->poll_success); 182 + } 183 + 184 + static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx, 185 + const char *page, size_t size) 186 + { 187 + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; 188 + 189 + return size; 180 190 } 181 191 182 192 static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, ··· 208 198 209 199 page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); 210 200 211 - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { 212 - unsigned long d = 1U << (i - 1); 201 + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { 202 + unsigned int d = 1U << (i - 1); 213 203 214 - page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); 204 + page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]); 215 205 } 216 206 207 + page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1), 208 + hctx->dispatched[i]); 217 209 return page - start_page; 218 210 } 219 211 ··· 313 301 .show = blk_mq_hw_sysfs_cpus_show, 314 302 }; 315 303 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { 316 - .attr = {.name = "io_poll", .mode = S_IRUGO }, 304 + .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO }, 317 305 .show = blk_mq_hw_sysfs_poll_show, 306 + .store = blk_mq_hw_sysfs_poll_store, 318 307 }; 319 308 320 309 static struct attribute *default_hw_ctx_attrs[] = { ··· 393 380 return ret; 394 381 } 395 382 396 - static void __blk_mq_unregister_disk(struct gendisk *disk) 383 + static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) 397 384 { 398 - struct request_queue *q = disk->queue; 399 385 struct blk_mq_hw_ctx *hctx; 400 386 struct blk_mq_ctx *ctx; 401 387 int i, j; ··· 412 400 kobject_del(&q->mq_kobj); 413 401 kobject_put(&q->mq_kobj); 414 402 415 - kobject_put(&disk_to_dev(disk)->kobj); 403 + kobject_put(&dev->kobj); 416 404 417 405 q->mq_sysfs_init_done = false; 418 406 } 419 407 420 - void blk_mq_unregister_disk(struct gendisk *disk) 408 + void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) 421 409 { 422 410 blk_mq_disable_hotplug(); 423 - __blk_mq_unregister_disk(disk); 411 + __blk_mq_unregister_dev(dev, q); 424 412 blk_mq_enable_hotplug(); 425 413 } 426 414 ··· 442 430 } 443 431 } 444 432 445 - int blk_mq_register_disk(struct gendisk *disk) 433 + int blk_mq_register_dev(struct device *dev, struct request_queue *q) 446 434 { 447 - struct device *dev = disk_to_dev(disk); 448 - struct request_queue *q = disk->queue; 449 435 struct blk_mq_hw_ctx *hctx; 450 436 int ret, i; 451 437 ··· 464 454 } 465 455 466 456 if (ret) 467 - __blk_mq_unregister_disk(disk); 457 + __blk_mq_unregister_dev(dev, q); 468 458 else 469 459 q->mq_sysfs_init_done = true; 470 460 out: ··· 472 462 473 463 return ret; 474 464 } 475 - EXPORT_SYMBOL_GPL(blk_mq_register_disk); 465 + EXPORT_SYMBOL_GPL(blk_mq_register_dev); 476 466 477 467 void blk_mq_sysfs_unregister(struct request_queue *q) 478 468 {
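With the new store hook, the per-hctx polling counters are no longer read-only: any write to the io_poll attribute resets all three, which makes before/after measurements straightforward. A hedged userspace sketch (the sysfs path is illustrative; substitute the real disk and hctx number):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* Illustrative path: /sys/block/<disk>/mq/<hctx>/io_poll */
            const char *path = "/sys/block/nvme0n1/mq/0/io_poll";
            char buf[128];
            ssize_t n;
            int fd = open(path, O_RDWR);

            if (fd < 0)
                    return 1;

            n = read(fd, buf, sizeof(buf) - 1);
            if (n > 0) {
                    buf[n] = '\0';
                    fputs(buf, stdout);     /* "considered=..., invoked=..., success=..." */
            }

            if (write(fd, "0", 1) < 0)      /* any write clears all three counters */
                    perror("write");
            close(fd);
            return 0;
    }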
+115 -388
block/blk-mq-tag.c
··· 1 1 /* 2 - * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread 3 - * over multiple cachelines to avoid ping-pong between multiple submitters 4 - * or submitter and completer. Uses rolling wakeups to avoid falling of 5 - * the scaling cliff when we run out of tags and have to start putting 6 - * submitters to sleep. 7 - * 8 - * Uses active queue tracking to support fairer distribution of tags 9 - * between multiple submitters when a shared tag map is used. 2 + * Tag allocation using scalable bitmaps. Uses active queue tracking to support 3 + * fairer distribution of tags between multiple submitters when a shared tag map 4 + * is used. 10 5 * 11 6 * Copyright (C) 2013-2014 Jens Axboe 12 7 */ 13 8 #include <linux/kernel.h> 14 9 #include <linux/module.h> 15 - #include <linux/random.h> 16 10 17 11 #include <linux/blk-mq.h> 18 12 #include "blk.h" 19 13 #include "blk-mq.h" 20 14 #include "blk-mq-tag.h" 21 15 22 - static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) 23 - { 24 - int i; 25 - 26 - for (i = 0; i < bt->map_nr; i++) { 27 - struct blk_align_bitmap *bm = &bt->map[i]; 28 - int ret; 29 - 30 - ret = find_first_zero_bit(&bm->word, bm->depth); 31 - if (ret < bm->depth) 32 - return true; 33 - } 34 - 35 - return false; 36 - } 37 - 38 16 bool blk_mq_has_free_tags(struct blk_mq_tags *tags) 39 17 { 40 18 if (!tags) 41 19 return true; 42 20 43 - return bt_has_free_tags(&tags->bitmap_tags); 44 - } 45 - 46 - static inline int bt_index_inc(int index) 47 - { 48 - return (index + 1) & (BT_WAIT_QUEUES - 1); 49 - } 50 - 51 - static inline void bt_index_atomic_inc(atomic_t *index) 52 - { 53 - int old = atomic_read(index); 54 - int new = bt_index_inc(old); 55 - atomic_cmpxchg(index, old, new); 21 + return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); 56 22 } 57 23 58 24 /* ··· 38 72 */ 39 73 void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) 40 74 { 41 - struct blk_mq_bitmap_tags *bt; 42 - int i, wake_index; 43 - 44 - /* 45 - * Make sure all changes prior to this are visible from other CPUs. 46 - */ 47 - smp_mb(); 48 - bt = &tags->bitmap_tags; 49 - wake_index = atomic_read(&bt->wake_index); 50 - for (i = 0; i < BT_WAIT_QUEUES; i++) { 51 - struct bt_wait_state *bs = &bt->bs[wake_index]; 52 - 53 - if (waitqueue_active(&bs->wait)) 54 - wake_up(&bs->wait); 55 - 56 - wake_index = bt_index_inc(wake_index); 57 - } 58 - 59 - if (include_reserve) { 60 - bt = &tags->breserved_tags; 61 - if (waitqueue_active(&bt->bs[0].wait)) 62 - wake_up(&bt->bs[0].wait); 63 - } 75 + sbitmap_queue_wake_all(&tags->bitmap_tags); 76 + if (include_reserve) 77 + sbitmap_queue_wake_all(&tags->breserved_tags); 64 78 } 65 79 66 80 /* ··· 64 118 * and attempt to provide a fair share of the tag depth for each of them. 
65 119 */ 66 120 static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, 67 - struct blk_mq_bitmap_tags *bt) 121 + struct sbitmap_queue *bt) 68 122 { 69 123 unsigned int depth, users; 70 124 ··· 76 130 /* 77 131 * Don't try dividing an ant 78 132 */ 79 - if (bt->depth == 1) 133 + if (bt->sb.depth == 1) 80 134 return true; 81 135 82 136 users = atomic_read(&hctx->tags->active_queues); ··· 86 140 /* 87 141 * Allow at least some tags 88 142 */ 89 - depth = max((bt->depth + users - 1) / users, 4U); 143 + depth = max((bt->sb.depth + users - 1) / users, 4U); 90 144 return atomic_read(&hctx->nr_active) < depth; 91 145 } 92 146 93 - static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, 94 - bool nowrap) 147 + static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) 95 148 { 96 - int tag, org_last_tag = last_tag; 97 - 98 - while (1) { 99 - tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); 100 - if (unlikely(tag >= bm->depth)) { 101 - /* 102 - * We started with an offset, and we didn't reset the 103 - * offset to 0 in a failure case, so start from 0 to 104 - * exhaust the map. 105 - */ 106 - if (org_last_tag && last_tag && !nowrap) { 107 - last_tag = org_last_tag = 0; 108 - continue; 109 - } 110 - return -1; 111 - } 112 - 113 - if (!test_and_set_bit(tag, &bm->word)) 114 - break; 115 - 116 - last_tag = tag + 1; 117 - if (last_tag >= bm->depth - 1) 118 - last_tag = 0; 119 - } 120 - 121 - return tag; 122 - } 123 - 124 - #define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) 125 - 126 - /* 127 - * Straight forward bitmap tag implementation, where each bit is a tag 128 - * (cleared == free, and set == busy). The small twist is using per-cpu 129 - * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue 130 - * contexts. This enables us to drastically limit the space searched, 131 - * without dirtying an extra shared cacheline like we would if we stored 132 - * the cache value inside the shared blk_mq_bitmap_tags structure. On top 133 - * of that, each word of tags is in a separate cacheline. This means that 134 - * multiple users will tend to stick to different cachelines, at least 135 - * until the map is exhausted. 136 - */ 137 - static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, 138 - unsigned int *tag_cache, struct blk_mq_tags *tags) 139 - { 140 - unsigned int last_tag, org_last_tag; 141 - int index, i, tag; 142 - 143 149 if (!hctx_may_queue(hctx, bt)) 144 150 return -1; 145 - 146 - last_tag = org_last_tag = *tag_cache; 147 - index = TAG_TO_INDEX(bt, last_tag); 148 - 149 - for (i = 0; i < bt->map_nr; i++) { 150 - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), 151 - BT_ALLOC_RR(tags)); 152 - if (tag != -1) { 153 - tag += (index << bt->bits_per_word); 154 - goto done; 155 - } 156 - 157 - /* 158 - * Jump to next index, and reset the last tag to be the 159 - * first tag of that index 160 - */ 161 - index++; 162 - last_tag = (index << bt->bits_per_word); 163 - 164 - if (index >= bt->map_nr) { 165 - index = 0; 166 - last_tag = 0; 167 - } 168 - } 169 - 170 - *tag_cache = 0; 171 - return -1; 172 - 173 - /* 174 - * Only update the cache from the allocation path, if we ended 175 - * up using the specific cached tag. 
176 - */ 177 - done: 178 - if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { 179 - last_tag = tag + 1; 180 - if (last_tag >= bt->depth - 1) 181 - last_tag = 0; 182 - 183 - *tag_cache = last_tag; 184 - } 185 - 186 - return tag; 151 + return __sbitmap_queue_get(bt); 187 152 } 188 153 189 - static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, 190 - struct blk_mq_hw_ctx *hctx) 154 + static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, 155 + struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags) 191 156 { 192 - struct bt_wait_state *bs; 193 - int wait_index; 194 - 195 - if (!hctx) 196 - return &bt->bs[0]; 197 - 198 - wait_index = atomic_read(&hctx->wait_index); 199 - bs = &bt->bs[wait_index]; 200 - bt_index_atomic_inc(&hctx->wait_index); 201 - return bs; 202 - } 203 - 204 - static int bt_get(struct blk_mq_alloc_data *data, 205 - struct blk_mq_bitmap_tags *bt, 206 - struct blk_mq_hw_ctx *hctx, 207 - unsigned int *last_tag, struct blk_mq_tags *tags) 208 - { 209 - struct bt_wait_state *bs; 157 + struct sbq_wait_state *ws; 210 158 DEFINE_WAIT(wait); 211 159 int tag; 212 160 213 - tag = __bt_get(hctx, bt, last_tag, tags); 161 + tag = __bt_get(hctx, bt); 214 162 if (tag != -1) 215 163 return tag; 216 164 217 165 if (data->flags & BLK_MQ_REQ_NOWAIT) 218 166 return -1; 219 167 220 - bs = bt_wait_ptr(bt, hctx); 168 + ws = bt_wait_ptr(bt, hctx); 221 169 do { 222 - prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); 170 + prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); 223 171 224 - tag = __bt_get(hctx, bt, last_tag, tags); 172 + tag = __bt_get(hctx, bt); 225 173 if (tag != -1) 226 174 break; 227 175 ··· 132 292 * Retry tag allocation after running the hardware queue, 133 293 * as running the queue may also have found completions. 
134 294 */ 135 - tag = __bt_get(hctx, bt, last_tag, tags); 295 + tag = __bt_get(hctx, bt); 136 296 if (tag != -1) 137 297 break; 138 298 ··· 146 306 if (data->flags & BLK_MQ_REQ_RESERVED) { 147 307 bt = &data->hctx->tags->breserved_tags; 148 308 } else { 149 - last_tag = &data->ctx->last_tag; 150 309 hctx = data->hctx; 151 310 bt = &hctx->tags->bitmap_tags; 152 311 } 153 - finish_wait(&bs->wait, &wait); 154 - bs = bt_wait_ptr(bt, hctx); 312 + finish_wait(&ws->wait, &wait); 313 + ws = bt_wait_ptr(bt, hctx); 155 314 } while (1); 156 315 157 - finish_wait(&bs->wait, &wait); 316 + finish_wait(&ws->wait, &wait); 158 317 return tag; 159 318 } 160 319 ··· 162 323 int tag; 163 324 164 325 tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, 165 - &data->ctx->last_tag, data->hctx->tags); 326 + data->hctx->tags); 166 327 if (tag >= 0) 167 328 return tag + data->hctx->tags->nr_reserved_tags; 168 329 ··· 171 332 172 333 static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) 173 334 { 174 - int tag, zero = 0; 335 + int tag; 175 336 176 337 if (unlikely(!data->hctx->tags->nr_reserved_tags)) { 177 338 WARN_ON_ONCE(1); 178 339 return BLK_MQ_TAG_FAIL; 179 340 } 180 341 181 - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, 182 - data->hctx->tags); 342 + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, 343 + data->hctx->tags); 183 344 if (tag < 0) 184 345 return BLK_MQ_TAG_FAIL; 185 346 ··· 193 354 return __blk_mq_get_tag(data); 194 355 } 195 356 196 - static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) 197 - { 198 - int i, wake_index; 199 - 200 - wake_index = atomic_read(&bt->wake_index); 201 - for (i = 0; i < BT_WAIT_QUEUES; i++) { 202 - struct bt_wait_state *bs = &bt->bs[wake_index]; 203 - 204 - if (waitqueue_active(&bs->wait)) { 205 - int o = atomic_read(&bt->wake_index); 206 - if (wake_index != o) 207 - atomic_cmpxchg(&bt->wake_index, o, wake_index); 208 - 209 - return bs; 210 - } 211 - 212 - wake_index = bt_index_inc(wake_index); 213 - } 214 - 215 - return NULL; 216 - } 217 - 218 - static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) 219 - { 220 - const int index = TAG_TO_INDEX(bt, tag); 221 - struct bt_wait_state *bs; 222 - int wait_cnt; 223 - 224 - clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); 225 - 226 - /* Ensure that the wait list checks occur after clear_bit(). 
*/ 227 - smp_mb(); 228 - 229 - bs = bt_wake_ptr(bt); 230 - if (!bs) 231 - return; 232 - 233 - wait_cnt = atomic_dec_return(&bs->wait_cnt); 234 - if (unlikely(wait_cnt < 0)) 235 - wait_cnt = atomic_inc_return(&bs->wait_cnt); 236 - if (wait_cnt == 0) { 237 - atomic_add(bt->wake_cnt, &bs->wait_cnt); 238 - bt_index_atomic_inc(&bt->wake_index); 239 - wake_up(&bs->wait); 240 - } 241 - } 242 - 243 - void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, 244 - unsigned int *last_tag) 357 + void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 358 + unsigned int tag) 245 359 { 246 360 struct blk_mq_tags *tags = hctx->tags; 247 361 ··· 202 410 const int real_tag = tag - tags->nr_reserved_tags; 203 411 204 412 BUG_ON(real_tag >= tags->nr_tags); 205 - bt_clear_tag(&tags->bitmap_tags, real_tag); 206 - if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) 207 - *last_tag = real_tag; 413 + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); 208 414 } else { 209 415 BUG_ON(tag >= tags->nr_reserved_tags); 210 - bt_clear_tag(&tags->breserved_tags, tag); 416 + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); 211 417 } 212 418 } 213 419 214 - static void bt_for_each(struct blk_mq_hw_ctx *hctx, 215 - struct blk_mq_bitmap_tags *bt, unsigned int off, 216 - busy_iter_fn *fn, void *data, bool reserved) 420 + struct bt_iter_data { 421 + struct blk_mq_hw_ctx *hctx; 422 + busy_iter_fn *fn; 423 + void *data; 424 + bool reserved; 425 + }; 426 + 427 + static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) 217 428 { 429 + struct bt_iter_data *iter_data = data; 430 + struct blk_mq_hw_ctx *hctx = iter_data->hctx; 431 + struct blk_mq_tags *tags = hctx->tags; 432 + bool reserved = iter_data->reserved; 218 433 struct request *rq; 219 - int bit, i; 220 434 221 - for (i = 0; i < bt->map_nr; i++) { 222 - struct blk_align_bitmap *bm = &bt->map[i]; 435 + if (!reserved) 436 + bitnr += tags->nr_reserved_tags; 437 + rq = tags->rqs[bitnr]; 223 438 224 - for (bit = find_first_bit(&bm->word, bm->depth); 225 - bit < bm->depth; 226 - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { 227 - rq = hctx->tags->rqs[off + bit]; 228 - if (rq->q == hctx->queue) 229 - fn(hctx, rq, data, reserved); 230 - } 231 - 232 - off += (1 << bt->bits_per_word); 233 - } 439 + if (rq->q == hctx->queue) 440 + iter_data->fn(hctx, rq, iter_data->data, reserved); 441 + return true; 234 442 } 235 443 236 - static void bt_tags_for_each(struct blk_mq_tags *tags, 237 - struct blk_mq_bitmap_tags *bt, unsigned int off, 238 - busy_tag_iter_fn *fn, void *data, bool reserved) 444 + static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, 445 + busy_iter_fn *fn, void *data, bool reserved) 239 446 { 447 + struct bt_iter_data iter_data = { 448 + .hctx = hctx, 449 + .fn = fn, 450 + .data = data, 451 + .reserved = reserved, 452 + }; 453 + 454 + sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); 455 + } 456 + 457 + struct bt_tags_iter_data { 458 + struct blk_mq_tags *tags; 459 + busy_tag_iter_fn *fn; 460 + void *data; 461 + bool reserved; 462 + }; 463 + 464 + static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) 465 + { 466 + struct bt_tags_iter_data *iter_data = data; 467 + struct blk_mq_tags *tags = iter_data->tags; 468 + bool reserved = iter_data->reserved; 240 469 struct request *rq; 241 - int bit, i; 242 470 243 - if (!tags->rqs) 244 - return; 245 - for (i = 0; i < bt->map_nr; i++) { 246 - struct blk_align_bitmap *bm = &bt->map[i]; 471 + if (!reserved) 472 
+ bitnr += tags->nr_reserved_tags; 473 + rq = tags->rqs[bitnr]; 247 474 248 - for (bit = find_first_bit(&bm->word, bm->depth); 249 - bit < bm->depth; 250 - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { 251 - rq = tags->rqs[off + bit]; 252 - fn(rq, data, reserved); 253 - } 475 + iter_data->fn(rq, iter_data->data, reserved); 476 + return true; 477 + } 254 478 255 - off += (1 << bt->bits_per_word); 256 - } 479 + static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, 480 + busy_tag_iter_fn *fn, void *data, bool reserved) 481 + { 482 + struct bt_tags_iter_data iter_data = { 483 + .tags = tags, 484 + .fn = fn, 485 + .data = data, 486 + .reserved = reserved, 487 + }; 488 + 489 + if (tags->rqs) 490 + sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); 257 491 } 258 492 259 493 static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, 260 494 busy_tag_iter_fn *fn, void *priv) 261 495 { 262 496 if (tags->nr_reserved_tags) 263 - bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); 264 - bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, 265 - false); 497 + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true); 498 + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false); 266 499 } 267 500 268 501 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, ··· 346 529 continue; 347 530 348 531 if (tags->nr_reserved_tags) 349 - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); 350 - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, 351 - false); 532 + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); 533 + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); 352 534 } 353 535 354 536 } 355 537 356 - static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) 538 + static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) 357 539 { 358 - unsigned int i, used; 359 - 360 - for (i = 0, used = 0; i < bt->map_nr; i++) { 361 - struct blk_align_bitmap *bm = &bt->map[i]; 362 - 363 - used += bitmap_weight(&bm->word, bm->depth); 364 - } 365 - 366 - return bt->depth - used; 540 + return bt->sb.depth - sbitmap_weight(&bt->sb); 367 541 } 368 542 369 - static void bt_update_count(struct blk_mq_bitmap_tags *bt, 370 - unsigned int depth) 543 + static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, 544 + bool round_robin, int node) 371 545 { 372 - unsigned int tags_per_word = 1U << bt->bits_per_word; 373 - unsigned int map_depth = depth; 374 - 375 - if (depth) { 376 - int i; 377 - 378 - for (i = 0; i < bt->map_nr; i++) { 379 - bt->map[i].depth = min(map_depth, tags_per_word); 380 - map_depth -= bt->map[i].depth; 381 - } 382 - } 383 - 384 - bt->wake_cnt = BT_WAIT_BATCH; 385 - if (bt->wake_cnt > depth / BT_WAIT_QUEUES) 386 - bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); 387 - 388 - bt->depth = depth; 389 - } 390 - 391 - static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, 392 - int node, bool reserved) 393 - { 394 - int i; 395 - 396 - bt->bits_per_word = ilog2(BITS_PER_LONG); 397 - 398 - /* 399 - * Depth can be zero for reserved tags, that's not a failure 400 - * condition. 401 - */ 402 - if (depth) { 403 - unsigned int nr, tags_per_word; 404 - 405 - tags_per_word = (1 << bt->bits_per_word); 406 - 407 - /* 408 - * If the tag space is small, shrink the number of tags 409 - * per word so we spread over a few cachelines, at least. 410 - * If less than 4 tags, just forget about it, it's not 411 - * going to work optimally anyway. 
412 - */ 413 - if (depth >= 4) { 414 - while (tags_per_word * 4 > depth) { 415 - bt->bits_per_word--; 416 - tags_per_word = (1 << bt->bits_per_word); 417 - } 418 - } 419 - 420 - nr = ALIGN(depth, tags_per_word) / tags_per_word; 421 - bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), 422 - GFP_KERNEL, node); 423 - if (!bt->map) 424 - return -ENOMEM; 425 - 426 - bt->map_nr = nr; 427 - } 428 - 429 - bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); 430 - if (!bt->bs) { 431 - kfree(bt->map); 432 - bt->map = NULL; 433 - return -ENOMEM; 434 - } 435 - 436 - bt_update_count(bt, depth); 437 - 438 - for (i = 0; i < BT_WAIT_QUEUES; i++) { 439 - init_waitqueue_head(&bt->bs[i].wait); 440 - atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); 441 - } 442 - 443 - return 0; 444 - } 445 - 446 - static void bt_free(struct blk_mq_bitmap_tags *bt) 447 - { 448 - kfree(bt->map); 449 - kfree(bt->bs); 546 + return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, 547 + node); 450 548 } 451 549 452 550 static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, 453 551 int node, int alloc_policy) 454 552 { 455 553 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; 554 + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; 456 555 457 - tags->alloc_policy = alloc_policy; 458 - 459 - if (bt_alloc(&tags->bitmap_tags, depth, node, false)) 460 - goto enomem; 461 - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) 462 - goto enomem; 556 + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) 557 + goto free_tags; 558 + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, 559 + node)) 560 + goto free_bitmap_tags; 463 561 464 562 return tags; 465 - enomem: 466 - bt_free(&tags->bitmap_tags); 563 + free_bitmap_tags: 564 + sbitmap_queue_free(&tags->bitmap_tags); 565 + free_tags: 467 566 kfree(tags); 468 567 return NULL; 469 568 } ··· 412 679 413 680 void blk_mq_free_tags(struct blk_mq_tags *tags) 414 681 { 415 - bt_free(&tags->bitmap_tags); 416 - bt_free(&tags->breserved_tags); 682 + sbitmap_queue_free(&tags->bitmap_tags); 683 + sbitmap_queue_free(&tags->breserved_tags); 417 684 free_cpumask_var(tags->cpumask); 418 685 kfree(tags); 419 - } 420 - 421 - void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) 422 - { 423 - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; 424 - 425 - *tag = prandom_u32() % depth; 426 686 } 427 687 428 688 int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) ··· 428 702 * Don't need (or can't) update reserved tags here, they remain 429 703 * static and should never need resizing. 430 704 */ 431 - bt_update_count(&tags->bitmap_tags, tdepth); 705 + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); 706 + 432 707 blk_mq_tag_wakeup_all(tags, false); 433 708 return 0; 434 709 } ··· 473 746 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " 474 747 "bits_per_word=%u\n", 475 748 tags->nr_tags, tags->nr_reserved_tags, 476 - tags->bitmap_tags.bits_per_word); 749 + 1U << tags->bitmap_tags.sb.shift); 477 750 478 751 free = bt_unused_tags(&tags->bitmap_tags); 479 752 res = bt_unused_tags(&tags->breserved_tags);
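All of the hand-rolled blk_align_bitmap machinery above collapses into calls to the generic sbitmap_queue API now living in lib/sbitmap.c, which is what the MAINTAINERS and Kconfig hunks wire up. Since the library is meant to be generally available, here is a minimal standalone sketch of the same init/get/clear/free cycle, using the allocator parameters seen in bt_alloc() above (the surrounding module context is hypothetical):

    #include <linux/sbitmap.h>
    #include <linux/gfp.h>
    #include <linux/smp.h>

    static struct sbitmap_queue example_sbq;

    static int example_init(unsigned int depth, int node)
    {
            /* shift == -1 lets the library pick a cacheline-friendly word size */
            return sbitmap_queue_init_node(&example_sbq, depth, -1, false,
                                           GFP_KERNEL, node);
    }

    static int example_get(void)
    {
            /* returns a free bit number, or -1 if the map is currently exhausted */
            return __sbitmap_queue_get(&example_sbq);
    }

    static void example_put(int nr)
    {
            /* clears the bit and wakes any waiters sleeping on the queue */
            sbitmap_queue_clear(&example_sbq, nr, raw_smp_processor_id());
    }

    static void example_exit(void)
    {
            sbitmap_queue_free(&example_sbq);
    }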
+12 -30
block/blk-mq-tag.h
··· 3 3 4 4 #include "blk-mq.h" 5 5 6 - enum { 7 - BT_WAIT_QUEUES = 8, 8 - BT_WAIT_BATCH = 8, 9 - }; 10 - 11 - struct bt_wait_state { 12 - atomic_t wait_cnt; 13 - wait_queue_head_t wait; 14 - } ____cacheline_aligned_in_smp; 15 - 16 - #define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) 17 - #define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) 18 - 19 - struct blk_mq_bitmap_tags { 20 - unsigned int depth; 21 - unsigned int wake_cnt; 22 - unsigned int bits_per_word; 23 - 24 - unsigned int map_nr; 25 - struct blk_align_bitmap *map; 26 - 27 - atomic_t wake_index; 28 - struct bt_wait_state *bs; 29 - }; 30 - 31 6 /* 32 7 * Tag address space map. 33 8 */ ··· 12 37 13 38 atomic_t active_queues; 14 39 15 - struct blk_mq_bitmap_tags bitmap_tags; 16 - struct blk_mq_bitmap_tags breserved_tags; 40 + struct sbitmap_queue bitmap_tags; 41 + struct sbitmap_queue breserved_tags; 17 42 18 43 struct request **rqs; 19 44 struct list_head page_list; 20 45 21 - int alloc_policy; 22 46 cpumask_var_t cpumask; 23 47 }; 24 48 ··· 26 52 extern void blk_mq_free_tags(struct blk_mq_tags *tags); 27 53 28 54 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); 29 - extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); 55 + extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 56 + unsigned int tag); 30 57 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 31 58 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); 32 - extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); 33 59 extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); 34 60 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); 35 61 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, 36 62 void *priv); 63 + 64 + static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, 65 + struct blk_mq_hw_ctx *hctx) 66 + { 67 + if (!hctx) 68 + return &bt->ws[0]; 69 + return sbq_wait_ptr(bt, &hctx->wait_index); 70 + } 37 71 38 72 enum { 39 73 BLK_MQ_TAG_CACHE_MIN = 1,
+63 -120
block/blk-mq.c
··· 22 22 #include <linux/sched/sysctl.h> 23 23 #include <linux/delay.h> 24 24 #include <linux/crash_dump.h> 25 + #include <linux/prefetch.h> 25 26 26 27 #include <trace/events/block.h> 27 28 ··· 34 33 static DEFINE_MUTEX(all_q_mutex); 35 34 static LIST_HEAD(all_q_list); 36 35 37 - static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 38 - 39 36 /* 40 37 * Check if any of the ctx's have pending work in this hardware queue 41 38 */ 42 39 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 43 40 { 44 - unsigned int i; 45 - 46 - for (i = 0; i < hctx->ctx_map.size; i++) 47 - if (hctx->ctx_map.map[i].word) 48 - return true; 49 - 50 - return false; 41 + return sbitmap_any_bit_set(&hctx->ctx_map); 51 42 } 52 - 53 - static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, 54 - struct blk_mq_ctx *ctx) 55 - { 56 - return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; 57 - } 58 - 59 - #define CTX_TO_BIT(hctx, ctx) \ 60 - ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) 61 43 62 44 /* 63 45 * Mark this ctx as having pending work in this hardware queue ··· 48 64 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 49 65 struct blk_mq_ctx *ctx) 50 66 { 51 - struct blk_align_bitmap *bm = get_bm(hctx, ctx); 52 - 53 - if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) 54 - set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 67 + if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) 68 + sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); 55 69 } 56 70 57 71 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 58 72 struct blk_mq_ctx *ctx) 59 73 { 60 - struct blk_align_bitmap *bm = get_bm(hctx, ctx); 61 - 62 - clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 74 + sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); 63 75 } 64 76 65 77 void blk_mq_freeze_queue_start(struct request_queue *q) ··· 226 246 ctx = blk_mq_get_ctx(q); 227 247 hctx = q->mq_ops->map_queue(q, ctx->cpu); 228 248 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); 229 - 230 249 rq = __blk_mq_alloc_request(&alloc_data, rw, 0); 231 - if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { 232 - __blk_mq_run_hw_queue(hctx); 233 - blk_mq_put_ctx(ctx); 234 - 235 - ctx = blk_mq_get_ctx(q); 236 - hctx = q->mq_ops->map_queue(q, ctx->cpu); 237 - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); 238 - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); 239 - ctx = alloc_data.ctx; 240 - } 241 250 blk_mq_put_ctx(ctx); 251 + 242 252 if (!rq) { 243 253 blk_queue_exit(q); 244 254 return ERR_PTR(-EWOULDBLOCK); ··· 303 333 rq->cmd_flags = 0; 304 334 305 335 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 306 - blk_mq_put_tag(hctx, tag, &ctx->last_tag); 336 + blk_mq_put_tag(hctx, ctx, tag); 307 337 blk_queue_exit(q); 308 338 } 309 339 ··· 483 513 static void blk_mq_requeue_work(struct work_struct *work) 484 514 { 485 515 struct request_queue *q = 486 - container_of(work, struct request_queue, requeue_work); 516 + container_of(work, struct request_queue, requeue_work.work); 487 517 LIST_HEAD(rq_list); 488 518 struct request *rq, *next; 489 519 unsigned long flags; ··· 538 568 539 569 void blk_mq_cancel_requeue_work(struct request_queue *q) 540 570 { 541 - cancel_work_sync(&q->requeue_work); 571 + cancel_delayed_work_sync(&q->requeue_work); 542 572 } 543 573 EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); 544 574 545 575 void blk_mq_kick_requeue_list(struct request_queue *q) 546 576 { 547 - kblockd_schedule_work(&q->requeue_work); 577 + kblockd_schedule_delayed_work(&q->requeue_work, 0); 548 578 } 
549 579 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 580 + 581 + void blk_mq_delay_kick_requeue_list(struct request_queue *q, 582 + unsigned long msecs) 583 + { 584 + kblockd_schedule_delayed_work(&q->requeue_work, 585 + msecs_to_jiffies(msecs)); 586 + } 587 + EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 550 588 551 589 void blk_mq_abort_requeue_list(struct request_queue *q) 552 590 { ··· 578 600 579 601 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 580 602 { 581 - if (tag < tags->nr_tags) 603 + if (tag < tags->nr_tags) { 604 + prefetch(tags->rqs[tag]); 582 605 return tags->rqs[tag]; 606 + } 583 607 584 608 return NULL; 585 609 } ··· 736 756 return false; 737 757 } 738 758 759 + struct flush_busy_ctx_data { 760 + struct blk_mq_hw_ctx *hctx; 761 + struct list_head *list; 762 + }; 763 + 764 + static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) 765 + { 766 + struct flush_busy_ctx_data *flush_data = data; 767 + struct blk_mq_hw_ctx *hctx = flush_data->hctx; 768 + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 769 + 770 + sbitmap_clear_bit(sb, bitnr); 771 + spin_lock(&ctx->lock); 772 + list_splice_tail_init(&ctx->rq_list, flush_data->list); 773 + spin_unlock(&ctx->lock); 774 + return true; 775 + } 776 + 739 777 /* 740 778 * Process software queues that have been marked busy, splicing them 741 779 * to the for-dispatch 742 780 */ 743 781 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 744 782 { 745 - struct blk_mq_ctx *ctx; 746 - int i; 783 + struct flush_busy_ctx_data data = { 784 + .hctx = hctx, 785 + .list = list, 786 + }; 747 787 748 - for (i = 0; i < hctx->ctx_map.size; i++) { 749 - struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; 750 - unsigned int off, bit; 788 + sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 789 + } 751 790 752 - if (!bm->word) 753 - continue; 791 + static inline unsigned int queued_to_index(unsigned int queued) 792 + { 793 + if (!queued) 794 + return 0; 754 795 755 - bit = 0; 756 - off = i * hctx->ctx_map.bits_per_word; 757 - do { 758 - bit = find_next_bit(&bm->word, bm->depth, bit); 759 - if (bit >= bm->depth) 760 - break; 761 - 762 - ctx = hctx->ctxs[bit + off]; 763 - clear_bit(bit, &bm->word); 764 - spin_lock(&ctx->lock); 765 - list_splice_tail_init(&ctx->rq_list, list); 766 - spin_unlock(&ctx->lock); 767 - 768 - bit++; 769 - } while (1); 770 - } 796 + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 771 797 } 772 798 773 799 /* ··· 864 878 dptr = &driver_list; 865 879 } 866 880 867 - if (!queued) 868 - hctx->dispatched[0]++; 869 - else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 870 - hctx->dispatched[ilog2(queued) + 1]++; 881 + hctx->dispatched[queued_to_index(queued)]++; 871 882 872 883 /* 873 884 * Any items that need requeuing? 
Stuff them into hctx->dispatch, ··· 920 937 !blk_mq_hw_queue_mapped(hctx))) 921 938 return; 922 939 923 - if (!async) { 940 + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { 924 941 int cpu = get_cpu(); 925 942 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 926 943 __blk_mq_run_hw_queue(hctx); ··· 931 948 put_cpu(); 932 949 } 933 950 934 - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 935 - &hctx->run_work, 0); 951 + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); 936 952 } 937 953 938 954 void blk_mq_run_hw_queues(struct request_queue *q, bool async) ··· 952 970 953 971 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 954 972 { 955 - cancel_delayed_work(&hctx->run_work); 973 + cancel_work(&hctx->run_work); 956 974 cancel_delayed_work(&hctx->delay_work); 957 975 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 958 976 } ··· 1005 1023 { 1006 1024 struct blk_mq_hw_ctx *hctx; 1007 1025 1008 - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 1026 + hctx = container_of(work, struct blk_mq_hw_ctx, run_work); 1009 1027 1010 1028 __blk_mq_run_hw_queue(hctx); 1011 1029 } ··· 1222 1240 op_flags |= REQ_SYNC; 1223 1241 1224 1242 trace_block_getrq(q, bio, op); 1225 - blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); 1243 + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); 1226 1244 rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); 1227 - if (unlikely(!rq)) { 1228 - __blk_mq_run_hw_queue(hctx); 1229 - blk_mq_put_ctx(ctx); 1230 - trace_block_sleeprq(q, bio, op); 1231 - 1232 - ctx = blk_mq_get_ctx(q); 1233 - hctx = q->mq_ops->map_queue(q, ctx->cpu); 1234 - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); 1235 - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); 1236 - ctx = alloc_data.ctx; 1237 - hctx = alloc_data.hctx; 1238 - } 1239 1245 1240 1246 hctx->queued++; 1241 1247 data->hctx = hctx; ··· 1576 1606 return NULL; 1577 1607 } 1578 1608 1579 - static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) 1580 - { 1581 - kfree(bitmap->map); 1582 - } 1583 - 1584 - static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) 1585 - { 1586 - unsigned int bpw = 8, total, num_maps, i; 1587 - 1588 - bitmap->bits_per_word = bpw; 1589 - 1590 - num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; 1591 - bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), 1592 - GFP_KERNEL, node); 1593 - if (!bitmap->map) 1594 - return -ENOMEM; 1595 - 1596 - total = nr_cpu_ids; 1597 - for (i = 0; i < num_maps; i++) { 1598 - bitmap->map[i].depth = min(total, bitmap->bits_per_word); 1599 - total -= bitmap->map[i].depth; 1600 - } 1601 - 1602 - return 0; 1603 - } 1604 - 1605 1609 /* 1606 1610 * 'cpu' is going away. 
splice any existing rq_list entries from this 1607 1611 * software queue to the hw queue dispatch list, and ensure that it ··· 1641 1697 1642 1698 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1643 1699 blk_free_flush_queue(hctx->fq); 1644 - blk_mq_free_bitmap(&hctx->ctx_map); 1700 + sbitmap_free(&hctx->ctx_map); 1645 1701 } 1646 1702 1647 1703 static void blk_mq_exit_hw_queues(struct request_queue *q, ··· 1678 1734 if (node == NUMA_NO_NODE) 1679 1735 node = hctx->numa_node = set->numa_node; 1680 1736 1681 - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 1737 + INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); 1682 1738 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); 1683 1739 spin_lock_init(&hctx->lock); 1684 1740 INIT_LIST_HEAD(&hctx->dispatch); ··· 1701 1757 if (!hctx->ctxs) 1702 1758 goto unregister_cpu_notifier; 1703 1759 1704 - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) 1760 + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, 1761 + node)) 1705 1762 goto free_ctxs; 1706 1763 1707 1764 hctx->nr_ctx = 0; ··· 1729 1784 if (set->ops->exit_hctx) 1730 1785 set->ops->exit_hctx(hctx, hctx_idx); 1731 1786 free_bitmap: 1732 - blk_mq_free_bitmap(&hctx->ctx_map); 1787 + sbitmap_free(&hctx->ctx_map); 1733 1788 free_ctxs: 1734 1789 kfree(hctx->ctxs); 1735 1790 unregister_cpu_notifier: ··· 1805 1860 mutex_unlock(&q->sysfs_lock); 1806 1861 1807 1862 queue_for_each_hw_ctx(q, hctx, i) { 1808 - struct blk_mq_ctxmap *map = &hctx->ctx_map; 1809 - 1810 1863 /* 1811 1864 * If no software queues are mapped to this hardware queue, 1812 1865 * disable it and free the request entries. ··· 1830 1887 * This is more accurate and more efficient than looping 1831 1888 * over all possibly mapped software queues. 1832 1889 */ 1833 - map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); 1890 + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 1834 1891 1835 1892 /* 1836 1893 * Initialize batch roundrobin counts ··· 2037 2094 2038 2095 q->sg_reserved_size = INT_MAX; 2039 2096 2040 - INIT_WORK(&q->requeue_work, blk_mq_requeue_work); 2097 + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); 2041 2098 INIT_LIST_HEAD(&q->requeue_list); 2042 2099 spin_lock_init(&q->requeue_lock); 2043 2100
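Switching q->requeue_work to a delayed work is what enables the new blk_mq_delay_kick_requeue_list() export: a driver can park a request on the requeue list and have kblockd re-run it a little later instead of immediately. A minimal, hypothetical error-path helper showing the intended pairing (the 100 ms delay is illustrative):

    #include <linux/blk-mq.h>

    /* Hypothetical: requeue a started request and retry it after a short delay. */
    static void example_requeue_with_delay(struct request *rq)
    {
            blk_mq_requeue_request(rq);                    /* move rq to q->requeue_list */
            blk_mq_delay_kick_requeue_list(rq->q, 100);    /* re-run the list in ~100 ms */
    }

The same hunk also adds BLK_MQ_F_BLOCKING, which tells blk-mq that this driver's ->queue_rq() may sleep, so the queue is always run from kblockd context rather than directly from the submitter; the nbd conversion further down relies on it.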
-11
block/blk-mq.h
··· 12 12 unsigned int cpu; 13 13 unsigned int index_hw; 14 14 15 - unsigned int last_tag ____cacheline_aligned_in_smp; 16 - 17 15 /* incremented at dispatch time */ 18 16 unsigned long rq_dispatched[2]; 19 17 unsigned long rq_merged; ··· 60 62 extern void blk_mq_rq_timed_out(struct request *req, bool reserved); 61 63 62 64 void blk_mq_release(struct request_queue *q); 63 - 64 - /* 65 - * Basic implementation of sparser bitmap, allowing the user to spread 66 - * the bits over more cachelines. 67 - */ 68 - struct blk_align_bitmap { 69 - unsigned long word; 70 - unsigned long depth; 71 - } ____cacheline_aligned_in_smp; 72 65 73 66 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, 74 67 unsigned int cpu)
+2 -2
block/blk-sysfs.c
··· 704 704 kobject_uevent(&q->kobj, KOBJ_ADD); 705 705 706 706 if (q->mq_ops) 707 - blk_mq_register_disk(disk); 707 + blk_mq_register_dev(dev, q); 708 708 709 709 if (!q->request_fn) 710 710 return 0; ··· 729 729 return; 730 730 731 731 if (q->mq_ops) 732 - blk_mq_unregister_disk(disk); 732 + blk_mq_unregister_dev(disk_to_dev(disk), q); 733 733 734 734 if (q->request_fn) 735 735 elv_unregister_queue(q);
+10 -3
block/cfq-iosched.c
··· 3042 3042 if (ktime_get_ns() < rq->fifo_time) 3043 3043 rq = NULL; 3044 3044 3045 - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 3046 3045 return rq; 3047 3046 } 3048 3047 ··· 3419 3420 { 3420 3421 unsigned int max_dispatch; 3421 3422 3423 + if (cfq_cfqq_must_dispatch(cfqq)) 3424 + return true; 3425 + 3422 3426 /* 3423 3427 * Drain async requests before we start sync IO 3424 3428 */ ··· 3513 3511 3514 3512 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 3515 3513 3514 + rq = cfq_check_fifo(cfqq); 3515 + if (rq) 3516 + cfq_mark_cfqq_must_dispatch(cfqq); 3517 + 3516 3518 if (!cfq_may_dispatch(cfqd, cfqq)) 3517 3519 return false; 3518 3520 3519 3521 /* 3520 3522 * follow expired path, else get first next available 3521 3523 */ 3522 - rq = cfq_check_fifo(cfqq); 3523 3524 if (!rq) 3524 3525 rq = cfqq->next_rq; 3526 + else 3527 + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 3525 3528 3526 3529 /* 3527 3530 * insert request into driver dispatch list ··· 3996 3989 * if the new request is sync, but the currently running queue is 3997 3990 * not, let the sync request have priority. 3998 3991 */ 3999 - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3992 + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) 4000 3993 return true; 4001 3994 4002 3995 /*
+1 -1
drivers/block/mtip32xx/mtip32xx.c
··· 3686 3686 return -ENODEV; 3687 3687 } 3688 3688 3689 - void mtip_block_release(struct gendisk *disk, fmode_t mode) 3689 + static void mtip_block_release(struct gendisk *disk, fmode_t mode) 3690 3690 { 3691 3691 } 3692 3692
+161 -250
drivers/block/nbd.c
··· 34 34 #include <linux/kthread.h> 35 35 #include <linux/types.h> 36 36 #include <linux/debugfs.h> 37 + #include <linux/blk-mq.h> 37 38 38 39 #include <asm/uaccess.h> 39 40 #include <asm/types.h> 40 41 41 42 #include <linux/nbd.h> 42 43 44 + #define NBD_TIMEDOUT 0 45 + #define NBD_DISCONNECT_REQUESTED 1 46 + 43 47 struct nbd_device { 44 48 u32 flags; 49 + unsigned long runtime_flags; 45 50 struct socket * sock; /* If == NULL, device is not ready, yet */ 46 51 int magic; 47 52 48 - spinlock_t queue_lock; 49 - struct list_head queue_head; /* Requests waiting result */ 50 - struct request *active_req; 51 - wait_queue_head_t active_wq; 52 - struct list_head waiting_queue; /* Requests to be sent */ 53 - wait_queue_head_t waiting_wq; 53 + struct blk_mq_tag_set tag_set; 54 54 55 55 struct mutex tx_lock; 56 56 struct gendisk *disk; 57 57 int blksize; 58 58 loff_t bytesize; 59 - int xmit_timeout; 60 - bool timedout; 61 - bool disconnect; /* a disconnect has been requested by user */ 62 59 63 - struct timer_list timeout_timer; 64 60 /* protects initialization and shutdown of the socket */ 65 61 spinlock_t sock_lock; 66 62 struct task_struct *task_recv; ··· 65 69 #if IS_ENABLED(CONFIG_DEBUG_FS) 66 70 struct dentry *dbg_dir; 67 71 #endif 72 + }; 73 + 74 + struct nbd_cmd { 75 + struct nbd_device *nbd; 76 + struct list_head list; 68 77 }; 69 78 70 79 #if IS_ENABLED(CONFIG_DEBUG_FS) ··· 83 82 static unsigned int nbds_max = 16; 84 83 static struct nbd_device *nbd_dev; 85 84 static int max_part; 86 - 87 - /* 88 - * Use just one lock (or at most 1 per NIC). Two arguments for this: 89 - * 1. Each NIC is essentially a synchronization point for all servers 90 - * accessed through that NIC so there's no need to have more locks 91 - * than NICs anyway. 92 - * 2. More locks lead to more "Dirty cache line bouncing" which will slow 93 - * down each lock to the point where they're actually slower than just 94 - * a single lock. 95 - * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this! 96 - */ 97 - static DEFINE_SPINLOCK(nbd_lock); 98 85 99 86 static inline struct device *nbd_to_dev(struct nbd_device *nbd) 100 87 { ··· 142 153 return 0; 143 154 } 144 155 145 - static void nbd_end_request(struct nbd_device *nbd, struct request *req) 156 + static void nbd_end_request(struct nbd_cmd *cmd) 146 157 { 158 + struct nbd_device *nbd = cmd->nbd; 159 + struct request *req = blk_mq_rq_from_pdu(cmd); 147 160 int error = req->errors ? -EIO : 0; 148 - struct request_queue *q = req->q; 149 - unsigned long flags; 150 161 151 - dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req, 162 + dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd, 152 163 error ? 
"failed" : "done"); 153 164 154 - spin_lock_irqsave(q->queue_lock, flags); 155 - __blk_end_request_all(req, error); 156 - spin_unlock_irqrestore(q->queue_lock, flags); 165 + blk_mq_complete_request(req, error); 157 166 } 158 167 159 168 /* ··· 159 172 */ 160 173 static void sock_shutdown(struct nbd_device *nbd) 161 174 { 162 - spin_lock_irq(&nbd->sock_lock); 175 + struct socket *sock; 176 + 177 + spin_lock(&nbd->sock_lock); 163 178 164 179 if (!nbd->sock) { 165 180 spin_unlock_irq(&nbd->sock_lock); 166 181 return; 167 182 } 168 183 184 + sock = nbd->sock; 169 185 dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); 170 - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); 171 - sockfd_put(nbd->sock); 172 186 nbd->sock = NULL; 173 - spin_unlock_irq(&nbd->sock_lock); 187 + spin_unlock(&nbd->sock_lock); 174 188 175 - del_timer(&nbd->timeout_timer); 189 + kernel_sock_shutdown(sock, SHUT_RDWR); 190 + sockfd_put(sock); 176 191 } 177 192 178 - static void nbd_xmit_timeout(unsigned long arg) 193 + static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, 194 + bool reserved) 179 195 { 180 - struct nbd_device *nbd = (struct nbd_device *)arg; 181 - unsigned long flags; 196 + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 197 + struct nbd_device *nbd = cmd->nbd; 198 + struct socket *sock = NULL; 182 199 183 - if (list_empty(&nbd->queue_head)) 184 - return; 200 + spin_lock(&nbd->sock_lock); 185 201 186 - spin_lock_irqsave(&nbd->sock_lock, flags); 202 + set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); 187 203 188 - nbd->timedout = true; 204 + if (nbd->sock) { 205 + sock = nbd->sock; 206 + get_file(sock->file); 207 + } 189 208 190 - if (nbd->sock) 191 - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); 209 + spin_unlock(&nbd->sock_lock); 210 + if (sock) { 211 + kernel_sock_shutdown(sock, SHUT_RDWR); 212 + sockfd_put(sock); 213 + } 192 214 193 - spin_unlock_irqrestore(&nbd->sock_lock, flags); 194 - 215 + req->errors++; 195 216 dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); 217 + return BLK_EH_HANDLED; 196 218 } 197 219 198 220 /* ··· 251 255 252 256 tsk_restore_flags(current, pflags, PF_MEMALLOC); 253 257 254 - if (!send && nbd->xmit_timeout) 255 - mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); 256 - 257 258 return result; 258 259 } 259 260 ··· 266 273 } 267 274 268 275 /* always call with the tx_lock held */ 269 - static int nbd_send_req(struct nbd_device *nbd, struct request *req) 276 + static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) 270 277 { 278 + struct request *req = blk_mq_rq_from_pdu(cmd); 271 279 int result, flags; 272 280 struct nbd_request request; 273 281 unsigned long size = blk_rq_bytes(req); ··· 292 298 request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); 293 299 request.len = htonl(size); 294 300 } 295 - memcpy(request.handle, &req, sizeof(req)); 301 + memcpy(request.handle, &req->tag, sizeof(req->tag)); 296 302 297 303 dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", 298 - req, nbdcmd_to_ascii(type), 304 + cmd, nbdcmd_to_ascii(type), 299 305 (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); 300 306 result = sock_xmit(nbd, 1, &request, sizeof(request), 301 307 (type == NBD_CMD_WRITE) ? 
MSG_MORE : 0); ··· 317 323 if (!rq_iter_last(bvec, iter)) 318 324 flags = MSG_MORE; 319 325 dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", 320 - req, bvec.bv_len); 326 + cmd, bvec.bv_len); 321 327 result = sock_send_bvec(nbd, &bvec, flags); 322 328 if (result <= 0) { 323 329 dev_err(disk_to_dev(nbd->disk), ··· 328 334 } 329 335 } 330 336 return 0; 331 - } 332 - 333 - static struct request *nbd_find_request(struct nbd_device *nbd, 334 - struct request *xreq) 335 - { 336 - struct request *req, *tmp; 337 - int err; 338 - 339 - err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); 340 - if (unlikely(err)) 341 - return ERR_PTR(err); 342 - 343 - spin_lock(&nbd->queue_lock); 344 - list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { 345 - if (req != xreq) 346 - continue; 347 - list_del_init(&req->queuelist); 348 - spin_unlock(&nbd->queue_lock); 349 - return req; 350 - } 351 - spin_unlock(&nbd->queue_lock); 352 - 353 - return ERR_PTR(-ENOENT); 354 337 } 355 338 356 339 static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) ··· 341 370 } 342 371 343 372 /* NULL returned = something went wrong, inform userspace */ 344 - static struct request *nbd_read_stat(struct nbd_device *nbd) 373 + static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) 345 374 { 346 375 int result; 347 376 struct nbd_reply reply; 348 - struct request *req; 377 + struct nbd_cmd *cmd; 378 + struct request *req = NULL; 379 + u16 hwq; 380 + int tag; 349 381 350 382 reply.magic = 0; 351 383 result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); ··· 364 390 return ERR_PTR(-EPROTO); 365 391 } 366 392 367 - req = nbd_find_request(nbd, *(struct request **)reply.handle); 368 - if (IS_ERR(req)) { 369 - result = PTR_ERR(req); 370 - if (result != -ENOENT) 371 - return ERR_PTR(result); 393 + memcpy(&tag, reply.handle, sizeof(int)); 372 394 373 - dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", 374 - reply.handle); 375 - return ERR_PTR(-EBADR); 395 + hwq = blk_mq_unique_tag_to_hwq(tag); 396 + if (hwq < nbd->tag_set.nr_hw_queues) 397 + req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq], 398 + blk_mq_unique_tag_to_tag(tag)); 399 + if (!req || !blk_mq_request_started(req)) { 400 + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n", 401 + tag, req); 402 + return ERR_PTR(-ENOENT); 376 403 } 404 + cmd = blk_mq_rq_to_pdu(req); 377 405 378 406 if (ntohl(reply.error)) { 379 407 dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", 380 408 ntohl(reply.error)); 381 409 req->errors++; 382 - return req; 410 + return cmd; 383 411 } 384 412 385 - dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); 413 + dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd); 386 414 if (rq_data_dir(req) != WRITE) { 387 415 struct req_iterator iter; 388 416 struct bio_vec bvec; ··· 395 419 dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", 396 420 result); 397 421 req->errors++; 398 - return req; 422 + return cmd; 399 423 } 400 424 dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", 401 - req, bvec.bv_len); 425 + cmd, bvec.bv_len); 402 426 } 403 427 } 404 - return req; 428 + return cmd; 405 429 } 406 430 407 431 static ssize_t pid_show(struct device *dev, ··· 420 444 421 445 static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) 422 446 { 423 - struct request *req; 447 + struct nbd_cmd *cmd; 424 448 int ret; 425 449 426 450 BUG_ON(nbd->magic != NBD_MAGIC); ··· 436 460 nbd_size_update(nbd, bdev); 437 461 
438 462 while (1) { 439 - req = nbd_read_stat(nbd); 440 - if (IS_ERR(req)) { 441 - ret = PTR_ERR(req); 463 + cmd = nbd_read_stat(nbd); 464 + if (IS_ERR(cmd)) { 465 + ret = PTR_ERR(cmd); 442 466 break; 443 467 } 444 468 445 - nbd_end_request(nbd, req); 469 + nbd_end_request(cmd); 446 470 } 447 471 448 472 nbd_size_clear(nbd, bdev); ··· 451 475 return ret; 452 476 } 453 477 478 + static void nbd_clear_req(struct request *req, void *data, bool reserved) 479 + { 480 + struct nbd_cmd *cmd; 481 + 482 + if (!blk_mq_request_started(req)) 483 + return; 484 + cmd = blk_mq_rq_to_pdu(req); 485 + req->errors++; 486 + nbd_end_request(cmd); 487 + } 488 + 454 489 static void nbd_clear_que(struct nbd_device *nbd) 455 490 { 456 - struct request *req; 457 - 458 491 BUG_ON(nbd->magic != NBD_MAGIC); 459 492 460 493 /* 461 494 * Because we have set nbd->sock to NULL under the tx_lock, all 462 - * modifications to the list must have completed by now. For 463 - * the same reason, the active_req must be NULL. 464 - * 465 - * As a consequence, we don't need to take the spin lock while 466 - * purging the list here. 495 + * modifications to the list must have completed by now. 467 496 */ 468 497 BUG_ON(nbd->sock); 469 - BUG_ON(nbd->active_req); 470 498 471 - while (!list_empty(&nbd->queue_head)) { 472 - req = list_entry(nbd->queue_head.next, struct request, 473 - queuelist); 474 - list_del_init(&req->queuelist); 475 - req->errors++; 476 - nbd_end_request(nbd, req); 477 - } 478 - 479 - while (!list_empty(&nbd->waiting_queue)) { 480 - req = list_entry(nbd->waiting_queue.next, struct request, 481 - queuelist); 482 - list_del_init(&req->queuelist); 483 - req->errors++; 484 - nbd_end_request(nbd, req); 485 - } 499 + blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); 486 500 dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); 487 501 } 488 502 489 503 490 - static void nbd_handle_req(struct nbd_device *nbd, struct request *req) 504 + static void nbd_handle_cmd(struct nbd_cmd *cmd) 491 505 { 506 + struct request *req = blk_mq_rq_from_pdu(cmd); 507 + struct nbd_device *nbd = cmd->nbd; 508 + 492 509 if (req->cmd_type != REQ_TYPE_FS) 493 510 goto error_out; 494 511 ··· 495 526 req->errors = 0; 496 527 497 528 mutex_lock(&nbd->tx_lock); 529 + nbd->task_send = current; 498 530 if (unlikely(!nbd->sock)) { 499 531 mutex_unlock(&nbd->tx_lock); 500 532 dev_err(disk_to_dev(nbd->disk), ··· 503 533 goto error_out; 504 534 } 505 535 506 - nbd->active_req = req; 507 - 508 - if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head)) 509 - mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); 510 - 511 - if (nbd_send_req(nbd, req) != 0) { 536 + if (nbd_send_cmd(nbd, cmd) != 0) { 512 537 dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); 513 538 req->errors++; 514 - nbd_end_request(nbd, req); 515 - } else { 516 - spin_lock(&nbd->queue_lock); 517 - list_add_tail(&req->queuelist, &nbd->queue_head); 518 - spin_unlock(&nbd->queue_lock); 539 + nbd_end_request(cmd); 519 540 } 520 541 521 - nbd->active_req = NULL; 542 + nbd->task_send = NULL; 522 543 mutex_unlock(&nbd->tx_lock); 523 - wake_up_all(&nbd->active_wq); 524 544 525 545 return; 526 546 527 547 error_out: 528 548 req->errors++; 529 - nbd_end_request(nbd, req); 549 + nbd_end_request(cmd); 530 550 } 531 551 532 - static int nbd_thread_send(void *data) 552 + static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, 553 + const struct blk_mq_queue_data *bd) 533 554 { 534 - struct nbd_device *nbd = data; 535 - struct request *req; 555 + struct nbd_cmd *cmd = 
blk_mq_rq_to_pdu(bd->rq); 536 556 537 - nbd->task_send = current; 538 - 539 - set_user_nice(current, MIN_NICE); 540 - while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { 541 - /* wait for something to do */ 542 - wait_event_interruptible(nbd->waiting_wq, 543 - kthread_should_stop() || 544 - !list_empty(&nbd->waiting_queue)); 545 - 546 - /* extract request */ 547 - if (list_empty(&nbd->waiting_queue)) 548 - continue; 549 - 550 - spin_lock_irq(&nbd->queue_lock); 551 - req = list_entry(nbd->waiting_queue.next, struct request, 552 - queuelist); 553 - list_del_init(&req->queuelist); 554 - spin_unlock_irq(&nbd->queue_lock); 555 - 556 - /* handle request */ 557 - nbd_handle_req(nbd, req); 558 - } 559 - 560 - nbd->task_send = NULL; 561 - 562 - return 0; 563 - } 564 - 565 - /* 566 - * We always wait for result of write, for now. It would be nice to make it optional 567 - * in future 568 - * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) 569 - * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } 570 - */ 571 - 572 - static void nbd_request_handler(struct request_queue *q) 573 - __releases(q->queue_lock) __acquires(q->queue_lock) 574 - { 575 - struct request *req; 576 - 577 - while ((req = blk_fetch_request(q)) != NULL) { 578 - struct nbd_device *nbd; 579 - 580 - spin_unlock_irq(q->queue_lock); 581 - 582 - nbd = req->rq_disk->private_data; 583 - 584 - BUG_ON(nbd->magic != NBD_MAGIC); 585 - 586 - dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n", 587 - req, req->cmd_type); 588 - 589 - if (unlikely(!nbd->sock)) { 590 - dev_err_ratelimited(disk_to_dev(nbd->disk), 591 - "Attempted send on closed socket\n"); 592 - req->errors++; 593 - nbd_end_request(nbd, req); 594 - spin_lock_irq(q->queue_lock); 595 - continue; 596 - } 597 - 598 - spin_lock_irq(&nbd->queue_lock); 599 - list_add_tail(&req->queuelist, &nbd->waiting_queue); 600 - spin_unlock_irq(&nbd->queue_lock); 601 - 602 - wake_up(&nbd->waiting_wq); 603 - 604 - spin_lock_irq(q->queue_lock); 605 - } 557 + blk_mq_start_request(bd->rq); 558 + nbd_handle_cmd(cmd); 559 + return BLK_MQ_RQ_QUEUE_OK; 606 560 } 607 561 608 562 static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) ··· 551 657 /* Reset all properties of an NBD device */ 552 658 static void nbd_reset(struct nbd_device *nbd) 553 659 { 554 - nbd->disconnect = false; 555 - nbd->timedout = false; 660 + nbd->runtime_flags = 0; 556 661 nbd->blksize = 1024; 557 662 nbd->bytesize = 0; 558 663 set_capacity(nbd->disk, 0); 559 664 nbd->flags = 0; 560 - nbd->xmit_timeout = 0; 665 + nbd->tag_set.timeout = 0; 561 666 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); 562 - del_timer_sync(&nbd->timeout_timer); 563 667 } 564 668 565 669 static void nbd_bdev_reset(struct block_device *bdev) ··· 592 700 { 593 701 switch (cmd) { 594 702 case NBD_DISCONNECT: { 595 - struct request sreq; 703 + struct request *sreq; 596 704 597 705 dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); 598 706 if (!nbd->sock) 599 707 return -EINVAL; 600 708 709 + sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0); 710 + if (!sreq) 711 + return -ENOMEM; 712 + 601 713 mutex_unlock(&nbd->tx_lock); 602 714 fsync_bdev(bdev); 603 715 mutex_lock(&nbd->tx_lock); 604 - blk_rq_init(NULL, &sreq); 605 - sreq.cmd_type = REQ_TYPE_DRV_PRIV; 716 + sreq->cmd_type = REQ_TYPE_DRV_PRIV; 606 717 607 718 /* Check again after getting mutex back. 
*/ 608 - if (!nbd->sock) 719 + if (!nbd->sock) { 720 + blk_mq_free_request(sreq); 609 721 return -EINVAL; 722 + } 610 723 611 - nbd->disconnect = true; 724 + set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags); 612 725 613 - nbd_send_req(nbd, &sreq); 726 + nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq)); 727 + blk_mq_free_request(sreq); 614 728 return 0; 615 729 } 616 730 617 731 case NBD_CLEAR_SOCK: 618 732 sock_shutdown(nbd); 619 733 nbd_clear_que(nbd); 620 - BUG_ON(!list_empty(&nbd->queue_head)); 621 - BUG_ON(!list_empty(&nbd->waiting_queue)); 622 734 kill_bdev(bdev); 623 735 return 0; 624 736 ··· 654 758 return nbd_size_set(nbd, bdev, nbd->blksize, arg); 655 759 656 760 case NBD_SET_TIMEOUT: 657 - nbd->xmit_timeout = arg * HZ; 658 - if (arg) 659 - mod_timer(&nbd->timeout_timer, 660 - jiffies + nbd->xmit_timeout); 661 - else 662 - del_timer_sync(&nbd->timeout_timer); 663 - 761 + nbd->tag_set.timeout = arg * HZ; 664 762 return 0; 665 763 666 764 case NBD_SET_FLAGS: ··· 662 772 return 0; 663 773 664 774 case NBD_DO_IT: { 665 - struct task_struct *thread; 666 775 int error; 667 776 668 777 if (nbd->task_recv) ··· 675 786 676 787 nbd_parse_flags(nbd, bdev); 677 788 678 - thread = kthread_run(nbd_thread_send, nbd, "%s", 679 - nbd_name(nbd)); 680 - if (IS_ERR(thread)) { 681 - mutex_lock(&nbd->tx_lock); 682 - nbd->task_recv = NULL; 683 - return PTR_ERR(thread); 684 - } 685 - 686 789 nbd_dev_dbg_init(nbd); 687 790 error = nbd_thread_recv(nbd, bdev); 688 791 nbd_dev_dbg_close(nbd); 689 - kthread_stop(thread); 690 792 691 793 mutex_lock(&nbd->tx_lock); 692 794 nbd->task_recv = NULL; ··· 687 807 kill_bdev(bdev); 688 808 nbd_bdev_reset(bdev); 689 809 690 - if (nbd->disconnect) /* user requested, ignore socket errors */ 810 + /* user requested, ignore socket errors */ 811 + if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) 691 812 error = 0; 692 - if (nbd->timedout) 813 + if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags)) 693 814 error = -ETIMEDOUT; 694 815 695 816 nbd_reset(nbd); ··· 706 825 return 0; 707 826 708 827 case NBD_PRINT_DEBUG: 709 - dev_info(disk_to_dev(nbd->disk), 710 - "next = %p, prev = %p, head = %p\n", 711 - nbd->queue_head.next, nbd->queue_head.prev, 712 - &nbd->queue_head); 828 + /* 829 + * For compatibility only, we no longer keep a list of 830 + * outstanding requests. 
831 + */ 713 832 return 0; 714 833 } 715 834 return -ENOTTY; ··· 816 935 817 936 debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); 818 937 debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); 819 - debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); 938 + debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); 820 939 debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); 821 940 debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); 822 941 ··· 868 987 869 988 #endif 870 989 990 + static int nbd_init_request(void *data, struct request *rq, 991 + unsigned int hctx_idx, unsigned int request_idx, 992 + unsigned int numa_node) 993 + { 994 + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); 995 + 996 + cmd->nbd = data; 997 + INIT_LIST_HEAD(&cmd->list); 998 + return 0; 999 + } 1000 + 1001 + static struct blk_mq_ops nbd_mq_ops = { 1002 + .queue_rq = nbd_queue_rq, 1003 + .map_queue = blk_mq_map_queue, 1004 + .init_request = nbd_init_request, 1005 + .timeout = nbd_xmit_timeout, 1006 + }; 1007 + 871 1008 /* 872 1009 * And here should be modules and kernel interface 873 1010 * (Just smiley confuses emacs :-) ··· 934 1035 if (!disk) 935 1036 goto out; 936 1037 nbd_dev[i].disk = disk; 1038 + 1039 + nbd_dev[i].tag_set.ops = &nbd_mq_ops; 1040 + nbd_dev[i].tag_set.nr_hw_queues = 1; 1041 + nbd_dev[i].tag_set.queue_depth = 128; 1042 + nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE; 1043 + nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd); 1044 + nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE | 1045 + BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; 1046 + nbd_dev[i].tag_set.driver_data = &nbd_dev[i]; 1047 + 1048 + err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set); 1049 + if (err) { 1050 + put_disk(disk); 1051 + goto out; 1052 + } 1053 + 937 1054 /* 938 1055 * The new linux 2.5 block layer implementation requires 939 1056 * every gendisk to have its very own request_queue struct. 940 1057 * These structs are big so we dynamically allocate them. 941 1058 */ 942 - disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock); 1059 + disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set); 943 1060 if (!disk->queue) { 1061 + blk_mq_free_tag_set(&nbd_dev[i].tag_set); 944 1062 put_disk(disk); 945 1063 goto out; 946 1064 } 1065 + 947 1066 /* 948 1067 * Tell the block layer that we are not a rotational device 949 1068 */ ··· 986 1069 for (i = 0; i < nbds_max; i++) { 987 1070 struct gendisk *disk = nbd_dev[i].disk; 988 1071 nbd_dev[i].magic = NBD_MAGIC; 989 - INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); 990 - spin_lock_init(&nbd_dev[i].queue_lock); 991 1072 spin_lock_init(&nbd_dev[i].sock_lock); 992 - INIT_LIST_HEAD(&nbd_dev[i].queue_head); 993 1073 mutex_init(&nbd_dev[i].tx_lock); 994 - init_timer(&nbd_dev[i].timeout_timer); 995 - nbd_dev[i].timeout_timer.function = nbd_xmit_timeout; 996 - nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; 997 - init_waitqueue_head(&nbd_dev[i].active_wq); 998 - init_waitqueue_head(&nbd_dev[i].waiting_wq); 999 1074 disk->major = NBD_MAJOR; 1000 1075 disk->first_minor = i << part_shift; 1001 1076 disk->fops = &nbd_fops; ··· 1000 1091 return 0; 1001 1092 out: 1002 1093 while (i--) { 1094 + blk_mq_free_tag_set(&nbd_dev[i].tag_set); 1003 1095 blk_cleanup_queue(nbd_dev[i].disk->queue); 1004 1096 put_disk(nbd_dev[i].disk); 1005 1097 } ··· 1020 1110 if (disk) { 1021 1111 del_gendisk(disk); 1022 1112 blk_cleanup_queue(disk->queue); 1113 + blk_mq_free_tag_set(&nbd_dev[i].tag_set); 1023 1114 put_disk(disk); 1024 1115 } 1025 1116 }
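Note: the nbd hunks above drop the driver's private request list and sender kthread in favour of the stock blk-mq pattern: per-request driver state lives in the tag set's command pdu, ->queue_rq() submits, and the tag set's timeout replaces the private timer. A minimal sketch of that registration pattern, with placeholder names and an illustrative queue depth (not taken from the diff):

    #include <linux/blk-mq.h>

    struct foo_dev {
            struct blk_mq_tag_set tag_set;
            struct request_queue *queue;
    };

    struct foo_cmd {                        /* lives in the request pdu */
            struct foo_dev *dev;
    };

    static int foo_queue_rq(struct blk_mq_hw_ctx *hctx,
                            const struct blk_mq_queue_data *bd)
    {
            struct foo_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);

            blk_mq_start_request(bd->rq);
            /* ... hand cmd to the hardware or transport here ... */
            return BLK_MQ_RQ_QUEUE_OK;
    }

    static int foo_init_request(void *data, struct request *rq,
                                unsigned int hctx_idx, unsigned int rq_idx,
                                unsigned int numa_node)
    {
            struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);

            cmd->dev = data;                /* tag_set.driver_data */
            return 0;
    }

    static struct blk_mq_ops foo_mq_ops = {
            .queue_rq     = foo_queue_rq,
            .map_queue    = blk_mq_map_queue,
            .init_request = foo_init_request,
    };

    static int foo_setup_queue(struct foo_dev *foo)
    {
            int err;

            foo->tag_set.ops = &foo_mq_ops;
            foo->tag_set.nr_hw_queues = 1;
            foo->tag_set.queue_depth = 128;         /* illustrative */
            foo->tag_set.numa_node = NUMA_NO_NODE;
            foo->tag_set.cmd_size = sizeof(struct foo_cmd);
            /* BLK_MQ_F_BLOCKING because, like nbd, ->queue_rq() may sleep */
            foo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
            foo->tag_set.driver_data = foo;

            err = blk_mq_alloc_tag_set(&foo->tag_set);
            if (err)
                    return err;

            foo->queue = blk_mq_init_queue(&foo->tag_set);
            if (IS_ERR(foo->queue)) {
                    blk_mq_free_tag_set(&foo->tag_set);
                    return PTR_ERR(foo->queue);
            }
            return 0;
    }

Teardown is the mirror image seen at the end of the hunk: blk_cleanup_queue() followed by blk_mq_free_tag_set().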
+79 -49
drivers/block/null_blk.c
··· 34 34 unsigned int index; 35 35 struct request_queue *q; 36 36 struct gendisk *disk; 37 + struct nvm_dev *ndev; 37 38 struct blk_mq_tag_set tag_set; 38 39 struct hrtimer timer; 39 40 unsigned int queue_depth; ··· 415 414 kfree(nullb->queues); 416 415 } 417 416 418 - static void null_del_dev(struct nullb *nullb) 419 - { 420 - list_del_init(&nullb->list); 421 - 422 - if (use_lightnvm) 423 - nvm_unregister(nullb->disk_name); 424 - else 425 - del_gendisk(nullb->disk); 426 - blk_cleanup_queue(nullb->q); 427 - if (queue_mode == NULL_Q_MQ) 428 - blk_mq_free_tag_set(&nullb->tag_set); 429 - if (!use_lightnvm) 430 - put_disk(nullb->disk); 431 - cleanup_queues(nullb); 432 - kfree(nullb); 433 - } 434 - 435 417 #ifdef CONFIG_NVM 436 418 437 419 static void null_lnvm_end_io(struct request *rq, int error) ··· 548 564 /* Simulate nvme protocol restriction */ 549 565 .max_phys_sect = 64, 550 566 }; 567 + 568 + static int null_nvm_register(struct nullb *nullb) 569 + { 570 + struct nvm_dev *dev; 571 + int rv; 572 + 573 + dev = nvm_alloc_dev(0); 574 + if (!dev) 575 + return -ENOMEM; 576 + 577 + dev->q = nullb->q; 578 + memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN); 579 + dev->ops = &null_lnvm_dev_ops; 580 + 581 + rv = nvm_register(dev); 582 + if (rv) { 583 + kfree(dev); 584 + return rv; 585 + } 586 + nullb->ndev = dev; 587 + return 0; 588 + } 589 + 590 + static void null_nvm_unregister(struct nullb *nullb) 591 + { 592 + nvm_unregister(nullb->ndev); 593 + } 551 594 #else 552 - static struct nvm_dev_ops null_lnvm_dev_ops; 595 + static int null_nvm_register(struct nullb *nullb) 596 + { 597 + return -EINVAL; 598 + } 599 + static void null_nvm_unregister(struct nullb *nullb) {} 553 600 #endif /* CONFIG_NVM */ 601 + 602 + static void null_del_dev(struct nullb *nullb) 603 + { 604 + list_del_init(&nullb->list); 605 + 606 + if (use_lightnvm) 607 + null_nvm_unregister(nullb); 608 + else 609 + del_gendisk(nullb->disk); 610 + blk_cleanup_queue(nullb->q); 611 + if (queue_mode == NULL_Q_MQ) 612 + blk_mq_free_tag_set(&nullb->tag_set); 613 + if (!use_lightnvm) 614 + put_disk(nullb->disk); 615 + cleanup_queues(nullb); 616 + kfree(nullb); 617 + } 554 618 555 619 static int null_open(struct block_device *bdev, fmode_t mode) 556 620 { ··· 672 640 return 0; 673 641 } 674 642 675 - static int null_add_dev(void) 643 + static int null_gendisk_register(struct nullb *nullb) 676 644 { 677 645 struct gendisk *disk; 678 - struct nullb *nullb; 679 646 sector_t size; 647 + 648 + disk = nullb->disk = alloc_disk_node(1, home_node); 649 + if (!disk) 650 + return -ENOMEM; 651 + size = gb * 1024 * 1024 * 1024ULL; 652 + set_capacity(disk, size >> 9); 653 + 654 + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; 655 + disk->major = null_major; 656 + disk->first_minor = nullb->index; 657 + disk->fops = &null_fops; 658 + disk->private_data = nullb; 659 + disk->queue = nullb->q; 660 + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); 661 + 662 + add_disk(disk); 663 + return 0; 664 + } 665 + 666 + static int null_add_dev(void) 667 + { 668 + struct nullb *nullb; 680 669 int rv; 681 670 682 671 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); ··· 769 716 770 717 sprintf(nullb->disk_name, "nullb%d", nullb->index); 771 718 772 - if (use_lightnvm) { 773 - rv = nvm_register(nullb->q, nullb->disk_name, 774 - &null_lnvm_dev_ops); 775 - if (rv) 776 - goto out_cleanup_blk_queue; 777 - goto done; 778 - } 719 + if (use_lightnvm) 720 + rv = null_nvm_register(nullb); 721 + else 722 + rv = 
null_gendisk_register(nullb); 779 723 780 - disk = nullb->disk = alloc_disk_node(1, home_node); 781 - if (!disk) { 782 - rv = -ENOMEM; 783 - goto out_cleanup_lightnvm; 784 - } 785 - size = gb * 1024 * 1024 * 1024ULL; 786 - set_capacity(disk, size >> 9); 724 + if (rv) 725 + goto out_cleanup_blk_queue; 787 726 788 - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; 789 - disk->major = null_major; 790 - disk->first_minor = nullb->index; 791 - disk->fops = &null_fops; 792 - disk->private_data = nullb; 793 - disk->queue = nullb->q; 794 - strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); 795 - 796 - add_disk(disk); 797 - 798 - done: 799 727 mutex_lock(&lock); 800 728 list_add_tail(&nullb->list, &nullb_list); 801 729 mutex_unlock(&lock); 802 730 803 731 return 0; 804 - 805 - out_cleanup_lightnvm: 806 - if (use_lightnvm) 807 - nvm_unregister(nullb->disk_name); 808 732 out_cleanup_blk_queue: 809 733 blk_cleanup_queue(nullb->q); 810 734 out_cleanup_tags:
+1 -1
drivers/lightnvm/Kconfig
··· 4 4 5 5 menuconfig NVM 6 6 bool "Open-Channel SSD target support" 7 - depends on BLOCK 7 + depends on BLOCK && HAS_DMA 8 8 help 9 9 Say Y here to get to enable Open-channel SSDs. 10 10
+1 -1
drivers/lightnvm/Makefile
··· 2 2 # Makefile for Open-Channel SSDs. 3 3 # 4 4 5 - obj-$(CONFIG_NVM) := core.o sysblk.o 5 + obj-$(CONFIG_NVM) := core.o sysblk.o sysfs.o 6 6 obj-$(CONFIG_NVM_GENNVM) += gennvm.o 7 7 obj-$(CONFIG_NVM_RRPC) += rrpc.o
+24 -33
drivers/lightnvm/core.c
··· 27 27 #include <linux/lightnvm.h> 28 28 #include <linux/sched/sysctl.h> 29 29 30 + #include "lightnvm.h" 31 + 30 32 static LIST_HEAD(nvm_tgt_types); 31 33 static DECLARE_RWSEM(nvm_tgtt_lock); 32 34 static LIST_HEAD(nvm_mgrs); ··· 583 581 mutex_init(&dev->mlock); 584 582 spin_lock_init(&dev->lock); 585 583 584 + blk_queue_logical_block_size(dev->q, dev->sec_size); 585 + 586 586 return 0; 587 587 err_fmtype: 588 588 kfree(dev->lun_map); ··· 600 596 dev->mt = NULL; 601 597 } 602 598 603 - static void nvm_free(struct nvm_dev *dev) 599 + void nvm_free(struct nvm_dev *dev) 604 600 { 605 601 if (!dev) 606 602 return; 607 603 608 604 nvm_free_mgr(dev); 609 605 606 + if (dev->dma_pool) 607 + dev->ops->destroy_dma_pool(dev->dma_pool); 608 + 610 609 kfree(dev->lptbl); 611 610 kfree(dev->lun_map); 611 + kfree(dev); 612 612 } 613 613 614 614 static int nvm_init(struct nvm_dev *dev) ··· 659 651 660 652 static void nvm_exit(struct nvm_dev *dev) 661 653 { 662 - if (dev->dma_pool) 663 - dev->ops->destroy_dma_pool(dev->dma_pool); 664 - nvm_free(dev); 665 - 666 - pr_info("nvm: successfully unloaded\n"); 654 + nvm_sysfs_unregister_dev(dev); 667 655 } 668 656 669 - int nvm_register(struct request_queue *q, char *disk_name, 670 - struct nvm_dev_ops *ops) 657 + struct nvm_dev *nvm_alloc_dev(int node) 671 658 { 672 - struct nvm_dev *dev; 659 + return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); 660 + } 661 + EXPORT_SYMBOL(nvm_alloc_dev); 662 + 663 + int nvm_register(struct nvm_dev *dev) 664 + { 673 665 int ret; 674 - 675 - if (!ops->identity) 676 - return -EINVAL; 677 - 678 - dev = kzalloc(sizeof(struct nvm_dev), GFP_KERNEL); 679 - if (!dev) 680 - return -ENOMEM; 681 - 682 - dev->q = q; 683 - dev->ops = ops; 684 - strncpy(dev->name, disk_name, DISK_NAME_LEN); 685 666 686 667 ret = nvm_init(dev); 687 668 if (ret) ··· 691 694 } 692 695 } 693 696 697 + ret = nvm_sysfs_register_dev(dev); 698 + if (ret) 699 + goto err_ppalist; 700 + 694 701 if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { 695 702 ret = nvm_get_sysblock(dev, &dev->sb); 696 703 if (!ret) ··· 711 710 up_write(&nvm_lock); 712 711 713 712 return 0; 713 + err_ppalist: 714 + dev->ops->destroy_dma_pool(dev->dma_pool); 714 715 err_init: 715 716 kfree(dev->lun_map); 716 - kfree(dev); 717 717 return ret; 718 718 } 719 719 EXPORT_SYMBOL(nvm_register); 720 720 721 - void nvm_unregister(char *disk_name) 721 + void nvm_unregister(struct nvm_dev *dev) 722 722 { 723 - struct nvm_dev *dev; 724 - 725 723 down_write(&nvm_lock); 726 - dev = nvm_find_nvm_dev(disk_name); 727 - if (!dev) { 728 - pr_err("nvm: could not find device %s to unregister\n", 729 - disk_name); 730 - up_write(&nvm_lock); 731 - return; 732 - } 733 - 734 724 list_del(&dev->devices); 735 725 up_write(&nvm_lock); 736 726 737 727 nvm_exit(dev); 738 - kfree(dev); 739 728 } 740 729 EXPORT_SYMBOL(nvm_unregister); 741 730
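Note: the core.c hunk above inverts ownership of the nvm_dev. Instead of nvm_register(q, name, ops) allocating it internally, a driver now calls nvm_alloc_dev(), fills in the queue, name and ops, and hands the structure to nvm_register(); the LightNVM core then takes over its lifetime and releases it through nvm_free(). A caller-side sketch with placeholder driver types, mirroring the null_blk and nvme conversions elsewhere in this series:

    #include <linux/genhd.h>                /* DISK_NAME_LEN */
    #include <linux/lightnvm.h>
    #include <linux/slab.h>

    struct foo_dev {
            struct request_queue *queue;
            char disk_name[DISK_NAME_LEN];
            struct nvm_dev *ndev;
    };

    static struct nvm_dev_ops foo_lnvm_dev_ops;     /* driver fills this in */

    static int foo_nvm_register(struct foo_dev *foo)
    {
            struct nvm_dev *ndev;
            int ret;

            ndev = nvm_alloc_dev(0);        /* NUMA node 0, illustrative */
            if (!ndev)
                    return -ENOMEM;

            ndev->q = foo->queue;
            memcpy(ndev->name, foo->disk_name, DISK_NAME_LEN);
            ndev->ops = &foo_lnvm_dev_ops;

            ret = nvm_register(ndev);
            if (ret) {
                    kfree(ndev);            /* not yet owned by the core */
                    return ret;
            }
            foo->ndev = ndev;
            return 0;
    }

    static void foo_nvm_unregister(struct foo_dev *foo)
    {
            nvm_unregister(foo->ndev);      /* core releases the nvm_dev */
    }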
+35
drivers/lightnvm/lightnvm.h
··· 1 + /* 2 + * Copyright (C) 2016 CNEX Labs. All rights reserved. 3 + * Initial release: Matias Bjorling <matias@cnexlabs.com> 4 + * 5 + * This program is free software; you can redistribute it and/or 6 + * modify it under the terms of the GNU General Public License version 7 + * 2 as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope that it will be useful, but 10 + * WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + * 14 + * You should have received a copy of the GNU General Public License 15 + * along with this program; see the file COPYING. If not, write to 16 + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, 17 + * USA. 18 + * 19 + */ 20 + 21 + #ifndef LIGHTNVM_H 22 + #define LIGHTNVM_H 23 + 24 + #include <linux/lightnvm.h> 25 + 26 + /* core -> sysfs.c */ 27 + int __must_check nvm_sysfs_register_dev(struct nvm_dev *); 28 + void nvm_sysfs_unregister_dev(struct nvm_dev *); 29 + int nvm_sysfs_register(void); 30 + void nvm_sysfs_unregister(void); 31 + 32 + /* sysfs > core */ 33 + void nvm_free(struct nvm_dev *); 34 + 35 + #endif
+198
drivers/lightnvm/sysfs.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/lightnvm.h> 3 + #include <linux/miscdevice.h> 4 + #include <linux/kobject.h> 5 + #include <linux/blk-mq.h> 6 + 7 + #include "lightnvm.h" 8 + 9 + static ssize_t nvm_dev_attr_show(struct device *dev, 10 + struct device_attribute *dattr, char *page) 11 + { 12 + struct nvm_dev *ndev = container_of(dev, struct nvm_dev, dev); 13 + struct nvm_id *id = &ndev->identity; 14 + struct nvm_id_group *grp = &id->groups[0]; 15 + struct attribute *attr = &dattr->attr; 16 + 17 + if (strcmp(attr->name, "version") == 0) { 18 + return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id); 19 + } else if (strcmp(attr->name, "vendor_opcode") == 0) { 20 + return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt); 21 + } else if (strcmp(attr->name, "capabilities") == 0) { 22 + return scnprintf(page, PAGE_SIZE, "%u\n", id->cap); 23 + } else if (strcmp(attr->name, "device_mode") == 0) { 24 + return scnprintf(page, PAGE_SIZE, "%u\n", id->dom); 25 + } else if (strcmp(attr->name, "media_manager") == 0) { 26 + if (!ndev->mt) 27 + return scnprintf(page, PAGE_SIZE, "%s\n", "none"); 28 + return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name); 29 + } else if (strcmp(attr->name, "ppa_format") == 0) { 30 + return scnprintf(page, PAGE_SIZE, 31 + "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", 32 + id->ppaf.ch_offset, id->ppaf.ch_len, 33 + id->ppaf.lun_offset, id->ppaf.lun_len, 34 + id->ppaf.pln_offset, id->ppaf.pln_len, 35 + id->ppaf.blk_offset, id->ppaf.blk_len, 36 + id->ppaf.pg_offset, id->ppaf.pg_len, 37 + id->ppaf.sect_offset, id->ppaf.sect_len); 38 + } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ 39 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype); 40 + } else if (strcmp(attr->name, "flash_media_type") == 0) { 41 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype); 42 + } else if (strcmp(attr->name, "num_channels") == 0) { 43 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch); 44 + } else if (strcmp(attr->name, "num_luns") == 0) { 45 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun); 46 + } else if (strcmp(attr->name, "num_planes") == 0) { 47 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); 48 + } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ 49 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); 50 + } else if (strcmp(attr->name, "num_pages") == 0) { 51 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); 52 + } else if (strcmp(attr->name, "page_size") == 0) { 53 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz); 54 + } else if (strcmp(attr->name, "hw_sector_size") == 0) { 55 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs); 56 + } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ 57 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos); 58 + } else if (strcmp(attr->name, "read_typ") == 0) { 59 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt); 60 + } else if (strcmp(attr->name, "read_max") == 0) { 61 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm); 62 + } else if (strcmp(attr->name, "prog_typ") == 0) { 63 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt); 64 + } else if (strcmp(attr->name, "prog_max") == 0) { 65 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm); 66 + } else if (strcmp(attr->name, "erase_typ") == 0) { 67 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet); 68 + } else if (strcmp(attr->name, "erase_max") == 0) { 69 + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem); 70 + } else if (strcmp(attr->name, "multiplane_modes") == 
0) { 71 + return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos); 72 + } else if (strcmp(attr->name, "media_capabilities") == 0) { 73 + return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap); 74 + } else if (strcmp(attr->name, "max_phys_secs") == 0) { 75 + return scnprintf(page, PAGE_SIZE, "%u\n", 76 + ndev->ops->max_phys_sect); 77 + } else { 78 + return scnprintf(page, 79 + PAGE_SIZE, 80 + "Unhandled attr(%s) in `nvm_dev_attr_show`\n", 81 + attr->name); 82 + } 83 + } 84 + 85 + #define NVM_DEV_ATTR_RO(_name) \ 86 + DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) 87 + 88 + static NVM_DEV_ATTR_RO(version); 89 + static NVM_DEV_ATTR_RO(vendor_opcode); 90 + static NVM_DEV_ATTR_RO(capabilities); 91 + static NVM_DEV_ATTR_RO(device_mode); 92 + static NVM_DEV_ATTR_RO(ppa_format); 93 + static NVM_DEV_ATTR_RO(media_manager); 94 + 95 + static NVM_DEV_ATTR_RO(media_type); 96 + static NVM_DEV_ATTR_RO(flash_media_type); 97 + static NVM_DEV_ATTR_RO(num_channels); 98 + static NVM_DEV_ATTR_RO(num_luns); 99 + static NVM_DEV_ATTR_RO(num_planes); 100 + static NVM_DEV_ATTR_RO(num_blocks); 101 + static NVM_DEV_ATTR_RO(num_pages); 102 + static NVM_DEV_ATTR_RO(page_size); 103 + static NVM_DEV_ATTR_RO(hw_sector_size); 104 + static NVM_DEV_ATTR_RO(oob_sector_size); 105 + static NVM_DEV_ATTR_RO(read_typ); 106 + static NVM_DEV_ATTR_RO(read_max); 107 + static NVM_DEV_ATTR_RO(prog_typ); 108 + static NVM_DEV_ATTR_RO(prog_max); 109 + static NVM_DEV_ATTR_RO(erase_typ); 110 + static NVM_DEV_ATTR_RO(erase_max); 111 + static NVM_DEV_ATTR_RO(multiplane_modes); 112 + static NVM_DEV_ATTR_RO(media_capabilities); 113 + static NVM_DEV_ATTR_RO(max_phys_secs); 114 + 115 + #define NVM_DEV_ATTR(_name) (dev_attr_##_name##) 116 + 117 + static struct attribute *nvm_dev_attrs[] = { 118 + &dev_attr_version.attr, 119 + &dev_attr_vendor_opcode.attr, 120 + &dev_attr_capabilities.attr, 121 + &dev_attr_device_mode.attr, 122 + &dev_attr_media_manager.attr, 123 + 124 + &dev_attr_ppa_format.attr, 125 + &dev_attr_media_type.attr, 126 + &dev_attr_flash_media_type.attr, 127 + &dev_attr_num_channels.attr, 128 + &dev_attr_num_luns.attr, 129 + &dev_attr_num_planes.attr, 130 + &dev_attr_num_blocks.attr, 131 + &dev_attr_num_pages.attr, 132 + &dev_attr_page_size.attr, 133 + &dev_attr_hw_sector_size.attr, 134 + &dev_attr_oob_sector_size.attr, 135 + &dev_attr_read_typ.attr, 136 + &dev_attr_read_max.attr, 137 + &dev_attr_prog_typ.attr, 138 + &dev_attr_prog_max.attr, 139 + &dev_attr_erase_typ.attr, 140 + &dev_attr_erase_max.attr, 141 + &dev_attr_multiplane_modes.attr, 142 + &dev_attr_media_capabilities.attr, 143 + &dev_attr_max_phys_secs.attr, 144 + NULL, 145 + }; 146 + 147 + static struct attribute_group nvm_dev_attr_group = { 148 + .name = "lightnvm", 149 + .attrs = nvm_dev_attrs, 150 + }; 151 + 152 + static const struct attribute_group *nvm_dev_attr_groups[] = { 153 + &nvm_dev_attr_group, 154 + NULL, 155 + }; 156 + 157 + static void nvm_dev_release(struct device *device) 158 + { 159 + struct nvm_dev *dev = container_of(device, struct nvm_dev, dev); 160 + struct request_queue *q = dev->q; 161 + 162 + pr_debug("nvm/sysfs: `nvm_dev_release`\n"); 163 + 164 + blk_mq_unregister_dev(device, q); 165 + 166 + nvm_free(dev); 167 + } 168 + 169 + static struct device_type nvm_type = { 170 + .name = "lightnvm", 171 + .groups = nvm_dev_attr_groups, 172 + .release = nvm_dev_release, 173 + }; 174 + 175 + int nvm_sysfs_register_dev(struct nvm_dev *dev) 176 + { 177 + int ret; 178 + 179 + if (!dev->parent_dev) 180 + return 0; 181 + 182 + dev->dev.parent = 
dev->parent_dev; 183 + dev_set_name(&dev->dev, "%s", dev->name); 184 + dev->dev.type = &nvm_type; 185 + device_initialize(&dev->dev); 186 + ret = device_add(&dev->dev); 187 + 188 + if (!ret) 189 + blk_mq_register_dev(&dev->dev, dev->q); 190 + 191 + return ret; 192 + } 193 + 194 + void nvm_sysfs_unregister_dev(struct nvm_dev *dev) 195 + { 196 + if (dev && dev->parent_dev) 197 + kobject_put(&dev->dev.kobj); 198 + }
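Note: the attributes above are exposed as a read-only group named "lightnvm" on a device whose parent is the driver-supplied parent_dev, so for an NVMe-backed open-channel device the geometry presumably becomes readable somewhere along the lines of /sys/class/nvme/nvme0/nvme0n1/lightnvm/{num_channels,num_luns,page_size,...}; the exact path is an assumption here and depends on how the parent controller device is registered.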
+1 -5
drivers/md/bcache/btree.c
··· 361 361 static void btree_node_write_done(struct closure *cl) 362 362 { 363 363 struct btree *b = container_of(cl, struct btree, io); 364 - struct bio_vec *bv; 365 - int n; 366 364 367 - bio_for_each_segment_all(bv, b->bio, n) 368 - __free_page(bv->bv_page); 369 - 365 + bio_free_pages(b->bio); 370 366 __btree_node_write_done(cl); 371 367 } 372 368
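Note: this and the following bcache, dm and raid1 hunks are mechanical conversions to the bio_free_pages() helper declared later in this series (include/linux/bio.h). Judging from the loops being removed, the helper is presumably equivalent to:

    #include <linux/bio.h>
    #include <linux/gfp.h>

    /* Sketch of what the removed open-coded loops did: free each segment's page. */
    static void example_bio_free_pages(struct bio *bio)
    {
            struct bio_vec *bvec;
            int i;

            bio_for_each_segment_all(bvec, bio, i)
                    __free_page(bvec->bv_page);
    }

so callers such as btree_node_write_done() above shrink to a single bio_free_pages(b->bio) call.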
+2 -4
drivers/md/bcache/debug.c
··· 107 107 { 108 108 char name[BDEVNAME_SIZE]; 109 109 struct bio *check; 110 - struct bio_vec bv, *bv2; 110 + struct bio_vec bv; 111 111 struct bvec_iter iter; 112 - int i; 113 112 114 113 check = bio_clone(bio, GFP_NOIO); 115 114 if (!check) ··· 135 136 kunmap_atomic(p1); 136 137 } 137 138 138 - bio_for_each_segment_all(bv2, check, i) 139 - __free_page(bv2->bv_page); 139 + bio_free_pages(check); 140 140 out_put: 141 141 bio_put(check); 142 142 }
+1 -4
drivers/md/bcache/movinggc.c
··· 44 44 { 45 45 struct moving_io *io = container_of(cl, struct moving_io, cl); 46 46 struct bio *bio = &io->bio.bio; 47 - struct bio_vec *bv; 48 - int i; 49 47 50 - bio_for_each_segment_all(bv, bio, i) 51 - __free_page(bv->bv_page); 48 + bio_free_pages(bio); 52 49 53 50 if (io->op.replace_collision) 54 51 trace_bcache_gc_copy_collision(&io->w->key);
+2 -7
drivers/md/bcache/request.c
··· 694 694 if (s->iop.replace_collision) 695 695 bch_mark_cache_miss_collision(s->iop.c, s->d); 696 696 697 - if (s->iop.bio) { 698 - int i; 699 - struct bio_vec *bv; 700 - 701 - bio_for_each_segment_all(bv, s->iop.bio, i) 702 - __free_page(bv->bv_page); 703 - } 697 + if (s->iop.bio) 698 + bio_free_pages(s->iop.bio); 704 699 705 700 cached_dev_bio_complete(cl); 706 701 }
+1 -4
drivers/md/bcache/writeback.c
··· 128 128 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 129 129 struct keybuf_key *w = io->bio.bi_private; 130 130 struct cached_dev *dc = io->dc; 131 - struct bio_vec *bv; 132 - int i; 133 131 134 - bio_for_each_segment_all(bv, &io->bio, i) 135 - __free_page(bv->bv_page); 132 + bio_free_pages(&io->bio); 136 133 137 134 /* This is kind of a dumb way of signalling errors. */ 138 135 if (KEY_DIRTY(&w->key)) {
+1 -1
drivers/md/dm-crypt.c
··· 1136 1136 clone->bi_private = io; 1137 1137 clone->bi_end_io = crypt_endio; 1138 1138 clone->bi_bdev = cc->dev->bdev; 1139 - bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf); 1139 + bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio)); 1140 1140 } 1141 1141 1142 1142 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
+1 -5
drivers/md/dm-log-writes.c
··· 149 149 static void log_end_io(struct bio *bio) 150 150 { 151 151 struct log_writes_c *lc = bio->bi_private; 152 - struct bio_vec *bvec; 153 - int i; 154 152 155 153 if (bio->bi_error) { 156 154 unsigned long flags; ··· 159 161 spin_unlock_irqrestore(&lc->blocks_lock, flags); 160 162 } 161 163 162 - bio_for_each_segment_all(bvec, bio, i) 163 - __free_page(bvec->bv_page); 164 - 164 + bio_free_pages(bio); 165 165 put_io_block(lc); 166 166 bio_put(bio); 167 167 }
+1 -1
drivers/md/dm-rq.c
··· 955 955 dm_init_md_queue(md); 956 956 957 957 /* backfill 'mq' sysfs registration normally done in blk_register_queue */ 958 - blk_mq_register_disk(md->disk); 958 + blk_mq_register_dev(disk_to_dev(md->disk), q); 959 959 960 960 return 0; 961 961
+2 -6
drivers/md/raid1.c
··· 145 145 return r1_bio; 146 146 147 147 out_free_pages: 148 - while (--j >= 0) { 149 - struct bio_vec *bv; 150 - 151 - bio_for_each_segment_all(bv, r1_bio->bios[j], i) 152 - __free_page(bv->bv_page); 153 - } 148 + while (--j >= 0) 149 + bio_free_pages(r1_bio->bios[j]); 154 150 155 151 out_free_bio: 156 152 while (++j < pi->raid_disks)
+90 -65
drivers/nvme/host/core.c
··· 156 156 { 157 157 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); 158 158 159 - if (ns->type == NVME_NS_LIGHTNVM) 160 - nvme_nvm_unregister(ns->queue, ns->disk->disk_name); 159 + if (ns->ndev) 160 + nvme_nvm_unregister(ns); 161 161 162 - spin_lock(&dev_list_lock); 163 - ns->disk->private_data = NULL; 164 - spin_unlock(&dev_list_lock); 162 + if (ns->disk) { 163 + spin_lock(&dev_list_lock); 164 + ns->disk->private_data = NULL; 165 + spin_unlock(&dev_list_lock); 166 + } 165 167 166 168 put_disk(ns->disk); 167 169 ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); ··· 599 597 } 600 598 601 599 int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, 602 - dma_addr_t dma_addr, u32 *result) 600 + void *buffer, size_t buflen, u32 *result) 603 601 { 604 602 struct nvme_command c; 605 603 struct nvme_completion cqe; ··· 608 606 memset(&c, 0, sizeof(c)); 609 607 c.features.opcode = nvme_admin_get_features; 610 608 c.features.nsid = cpu_to_le32(nsid); 611 - c.features.dptr.prp1 = cpu_to_le64(dma_addr); 612 609 c.features.fid = cpu_to_le32(fid); 613 610 614 - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, 611 + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, buffer, buflen, 0, 615 612 NVME_QID_ANY, 0, 0); 616 613 if (ret >= 0 && result) 617 614 *result = le32_to_cpu(cqe.result); ··· 618 617 } 619 618 620 619 int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, 621 - dma_addr_t dma_addr, u32 *result) 620 + void *buffer, size_t buflen, u32 *result) 622 621 { 623 622 struct nvme_command c; 624 623 struct nvme_completion cqe; ··· 626 625 627 626 memset(&c, 0, sizeof(c)); 628 627 c.features.opcode = nvme_admin_set_features; 629 - c.features.dptr.prp1 = cpu_to_le64(dma_addr); 630 628 c.features.fid = cpu_to_le32(fid); 631 629 c.features.dword11 = cpu_to_le32(dword11); 632 630 633 - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, 634 - NVME_QID_ANY, 0, 0); 631 + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, 632 + buffer, buflen, 0, NVME_QID_ANY, 0, 0); 635 633 if (ret >= 0 && result) 636 634 *result = le32_to_cpu(cqe.result); 637 635 return ret; ··· 664 664 u32 result; 665 665 int status, nr_io_queues; 666 666 667 - status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, 667 + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, 668 668 &result); 669 669 if (status < 0) 670 670 return status; ··· 888 888 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); 889 889 } 890 890 891 - static int nvme_revalidate_disk(struct gendisk *disk) 891 + static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) 892 892 { 893 - struct nvme_ns *ns = disk->private_data; 894 - struct nvme_id_ns *id; 895 - u8 lbaf, pi_type; 896 - u16 old_ms; 897 - unsigned short bs; 898 - 899 - if (test_bit(NVME_NS_DEAD, &ns->flags)) { 900 - set_capacity(disk, 0); 901 - return -ENODEV; 902 - } 903 - if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { 904 - dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", 905 - __func__); 906 - return -ENODEV; 907 - } 908 - if (id->ncap == 0) { 909 - kfree(id); 893 + if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { 894 + dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__); 910 895 return -ENODEV; 911 896 } 912 897 913 - if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { 914 - if (nvme_nvm_register(ns->queue, disk->disk_name)) { 915 - dev_warn(disk_to_dev(ns->disk), 916 - "%s: LightNVM init failure\n", __func__); 917 - 
kfree(id); 918 - return -ENODEV; 919 - } 920 - ns->type = NVME_NS_LIGHTNVM; 898 + if ((*id)->ncap == 0) { 899 + kfree(*id); 900 + return -ENODEV; 921 901 } 922 902 923 903 if (ns->ctrl->vs >= NVME_VS(1, 1)) 924 - memcpy(ns->eui, id->eui64, sizeof(ns->eui)); 904 + memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); 925 905 if (ns->ctrl->vs >= NVME_VS(1, 2)) 926 - memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); 906 + memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); 907 + 908 + return 0; 909 + } 910 + 911 + static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) 912 + { 913 + struct nvme_ns *ns = disk->private_data; 914 + u8 lbaf, pi_type; 915 + u16 old_ms; 916 + unsigned short bs; 927 917 928 918 old_ms = ns->ms; 929 919 lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; ··· 952 962 if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) 953 963 nvme_config_discard(ns); 954 964 blk_mq_unfreeze_queue(disk->queue); 965 + } 955 966 967 + static int nvme_revalidate_disk(struct gendisk *disk) 968 + { 969 + struct nvme_ns *ns = disk->private_data; 970 + struct nvme_id_ns *id = NULL; 971 + int ret; 972 + 973 + if (test_bit(NVME_NS_DEAD, &ns->flags)) { 974 + set_capacity(disk, 0); 975 + return -ENODEV; 976 + } 977 + 978 + ret = nvme_revalidate_ns(ns, &id); 979 + if (ret) 980 + return ret; 981 + 982 + __nvme_revalidate_disk(disk, id); 956 983 kfree(id); 984 + 957 985 return 0; 958 986 } 959 987 ··· 1433 1425 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, 1434 1426 char *buf) 1435 1427 { 1436 - struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1428 + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 1437 1429 struct nvme_ctrl *ctrl = ns->ctrl; 1438 1430 int serial_len = sizeof(ctrl->serial); 1439 1431 int model_len = sizeof(ctrl->model); ··· 1457 1449 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, 1458 1450 char *buf) 1459 1451 { 1460 - struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1452 + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 1461 1453 return sprintf(buf, "%pU\n", ns->uuid); 1462 1454 } 1463 1455 static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); ··· 1465 1457 static ssize_t eui_show(struct device *dev, struct device_attribute *attr, 1466 1458 char *buf) 1467 1459 { 1468 - struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1460 + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 1469 1461 return sprintf(buf, "%8phd\n", ns->eui); 1470 1462 } 1471 1463 static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); ··· 1473 1465 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, 1474 1466 char *buf) 1475 1467 { 1476 - struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1468 + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 1477 1469 return sprintf(buf, "%d\n", ns->ns_id); 1478 1470 } 1479 1471 static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); ··· 1490 1482 struct attribute *a, int n) 1491 1483 { 1492 1484 struct device *dev = container_of(kobj, struct device, kobj); 1493 - struct nvme_ns *ns = dev_to_disk(dev)->private_data; 1485 + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 1494 1486 1495 1487 if (a == &dev_attr_uuid.attr) { 1496 1488 if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) ··· 1650 1642 { 1651 1643 struct nvme_ns *ns; 1652 1644 struct gendisk *disk; 1645 + struct nvme_id_ns *id; 1646 + char disk_name[DISK_NAME_LEN]; 1653 1647 int node = dev_to_node(ctrl->dev); 1654 1648 1655 1649 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); ··· 1669 1659 ns->queue->queuedata = ns; 1670 1660 ns->ctrl = ctrl; 1671 
1661 1672 - disk = alloc_disk_node(0, node); 1673 - if (!disk) 1674 - goto out_free_queue; 1675 - 1676 1662 kref_init(&ns->kref); 1677 1663 ns->ns_id = nsid; 1678 - ns->disk = disk; 1679 1664 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ 1680 - 1681 1665 1682 1666 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1683 1667 nvme_set_queue_limits(ctrl, ns->queue); 1684 1668 1685 - disk->fops = &nvme_fops; 1686 - disk->private_data = ns; 1687 - disk->queue = ns->queue; 1688 - disk->flags = GENHD_FL_EXT_DEVT; 1689 - sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); 1669 + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); 1690 1670 1691 - if (nvme_revalidate_disk(ns->disk)) 1692 - goto out_free_disk; 1671 + if (nvme_revalidate_ns(ns, &id)) 1672 + goto out_free_queue; 1673 + 1674 + if (nvme_nvm_ns_supported(ns, id)) { 1675 + if (nvme_nvm_register(ns, disk_name, node, 1676 + &nvme_ns_attr_group)) { 1677 + dev_warn(ctrl->dev, "%s: LightNVM init failure\n", 1678 + __func__); 1679 + goto out_free_id; 1680 + } 1681 + } else { 1682 + disk = alloc_disk_node(0, node); 1683 + if (!disk) 1684 + goto out_free_id; 1685 + 1686 + disk->fops = &nvme_fops; 1687 + disk->private_data = ns; 1688 + disk->queue = ns->queue; 1689 + disk->flags = GENHD_FL_EXT_DEVT; 1690 + memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); 1691 + ns->disk = disk; 1692 + 1693 + __nvme_revalidate_disk(disk, id); 1694 + } 1693 1695 1694 1696 mutex_lock(&ctrl->namespaces_mutex); 1695 1697 list_add_tail(&ns->list, &ctrl->namespaces); 1696 1698 mutex_unlock(&ctrl->namespaces_mutex); 1697 1699 1698 1700 kref_get(&ctrl->kref); 1699 - if (ns->type == NVME_NS_LIGHTNVM) 1701 + 1702 + kfree(id); 1703 + 1704 + if (ns->ndev) 1700 1705 return; 1701 1706 1702 1707 device_add_disk(ctrl->device, ns->disk); ··· 1720 1695 pr_warn("%s: failed to create sysfs group for identification\n", 1721 1696 ns->disk->disk_name); 1722 1697 return; 1723 - out_free_disk: 1724 - kfree(disk); 1698 + out_free_id: 1699 + kfree(id); 1725 1700 out_free_queue: 1726 1701 blk_cleanup_queue(ns->queue); 1727 1702 out_release_instance: ··· 1735 1710 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) 1736 1711 return; 1737 1712 1738 - if (ns->disk->flags & GENHD_FL_UP) { 1713 + if (ns->disk && ns->disk->flags & GENHD_FL_UP) { 1739 1714 if (blk_get_integrity(ns->disk)) 1740 1715 blk_integrity_unregister(ns->disk); 1741 1716 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, ··· 1758 1733 1759 1734 ns = nvme_find_get_ns(ctrl, nsid); 1760 1735 if (ns) { 1761 - if (revalidate_disk(ns->disk)) 1736 + if (ns->disk && revalidate_disk(ns->disk)) 1762 1737 nvme_ns_remove(ns); 1763 1738 nvme_put_ns(ns); 1764 1739 } else ··· 2063 2038 * Revalidating a dead namespace sets capacity to 0. This will 2064 2039 * end buffered writers dirtying pages that can't be synced. 2065 2040 */ 2066 - if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) 2041 + if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags)) 2067 2042 revalidate_disk(ns->disk); 2068 2043 2069 2044 blk_set_queue_dying(ns->queue);
+23 -2
drivers/nvme/host/fabrics.c
··· 111 111 */ 112 112 int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) 113 113 { 114 - return snprintf(buf, size, "traddr=%s,trsvcid=%s\n", 115 - ctrl->opts->traddr, ctrl->opts->trsvcid); 114 + int len = 0; 115 + 116 + if (ctrl->opts->mask & NVMF_OPT_TRADDR) 117 + len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr); 118 + if (ctrl->opts->mask & NVMF_OPT_TRSVCID) 119 + len += snprintf(buf + len, size - len, "%strsvcid=%s", 120 + (len) ? "," : "", ctrl->opts->trsvcid); 121 + if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) 122 + len += snprintf(buf + len, size - len, "%shost_traddr=%s", 123 + (len) ? "," : "", ctrl->opts->host_traddr); 124 + len += snprintf(buf + len, size - len, "\n"); 125 + 126 + return len; 116 127 } 117 128 EXPORT_SYMBOL_GPL(nvmf_get_address); 118 129 ··· 530 519 { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, 531 520 { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, 532 521 { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, 522 + { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, 533 523 { NVMF_OPT_ERR, NULL } 534 524 }; 535 525 ··· 687 675 } 688 676 opts->reconnect_delay = token; 689 677 break; 678 + case NVMF_OPT_HOST_TRADDR: 679 + p = match_strdup(args); 680 + if (!p) { 681 + ret = -ENOMEM; 682 + goto out; 683 + } 684 + opts->host_traddr = p; 685 + break; 690 686 default: 691 687 pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", 692 688 p); ··· 761 741 kfree(opts->traddr); 762 742 kfree(opts->trsvcid); 763 743 kfree(opts->subsysnqn); 744 + kfree(opts->host_traddr); 764 745 kfree(opts); 765 746 } 766 747 EXPORT_SYMBOL_GPL(nvmf_free_options);
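Note: with the rework above, nvmf_get_address() only emits the options that were actually supplied on connect, appending the new host_traddr when present. For a controller created with all three options set, the returned buffer would look something like "traddr=192.168.1.10,trsvcid=4420,host_traddr=192.168.1.20" followed by a newline (addresses are made up for illustration); with only traddr given it degenerates to a single "traddr=..." entry.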
+8 -3
drivers/nvme/host/fabrics.h
··· 52 52 NVMF_OPT_KATO = 1 << 7, 53 53 NVMF_OPT_HOSTNQN = 1 << 8, 54 54 NVMF_OPT_RECONNECT_DELAY = 1 << 9, 55 + NVMF_OPT_HOST_TRADDR = 1 << 10, 55 56 }; 56 57 57 58 /** ··· 65 64 * being added. 66 65 * @subsysnqn: Hold the fully qualified NQN subystem name (format defined 67 66 * in the NVMe specification, "NVMe Qualified Names"). 68 - * @traddr: network address that will be used by the host to communicate 69 - * to the added NVMe controller. 70 - * @trsvcid: network port used for host-controller communication. 67 + * @traddr: The transport-specific TRADDR field for a port on the 68 + * subsystem which is adding a controller. 69 + * @trsvcid: The transport-specific TRSVCID field for a port on the 70 + * subsystem which is adding a controller. 71 + * @host_traddr: A transport-specific field identifying the NVME host port 72 + * to use for the connection to the controller. 71 73 * @queue_size: Number of IO queue elements. 72 74 * @nr_io_queues: Number of controller IO queues that will be established. 73 75 * @reconnect_delay: Time between two consecutive reconnect attempts. ··· 84 80 char *subsysnqn; 85 81 char *traddr; 86 82 char *trsvcid; 83 + char *host_traddr; 87 84 size_t queue_size; 88 85 unsigned int nr_io_queues; 89 86 unsigned int reconnect_delay;
+28 -5
drivers/nvme/host/lightnvm.c
··· 475 475 476 476 if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) 477 477 c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, 478 - rqd->bio->bi_iter.bi_sector)); 478 + rqd->bio->bi_iter.bi_sector)); 479 479 } 480 480 481 481 static void nvme_nvm_end_io(struct request *rq, int error) ··· 592 592 .max_phys_sect = 64, 593 593 }; 594 594 595 - int nvme_nvm_register(struct request_queue *q, char *disk_name) 595 + int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node, 596 + const struct attribute_group *attrs) 596 597 { 597 - return nvm_register(q, disk_name, &nvme_nvm_dev_ops); 598 + struct request_queue *q = ns->queue; 599 + struct nvm_dev *dev; 600 + int ret; 601 + 602 + dev = nvm_alloc_dev(node); 603 + if (!dev) 604 + return -ENOMEM; 605 + 606 + dev->q = q; 607 + memcpy(dev->name, disk_name, DISK_NAME_LEN); 608 + dev->ops = &nvme_nvm_dev_ops; 609 + dev->parent_dev = ns->ctrl->device; 610 + dev->private_data = ns; 611 + ns->ndev = dev; 612 + 613 + ret = nvm_register(dev); 614 + 615 + ns->lba_shift = ilog2(dev->sec_size) - 9; 616 + 617 + if (sysfs_create_group(&dev->dev.kobj, attrs)) 618 + pr_warn("%s: failed to create sysfs group for identification\n", 619 + disk_name); 620 + return ret; 598 621 } 599 622 600 - void nvme_nvm_unregister(struct request_queue *q, char *disk_name) 623 + void nvme_nvm_unregister(struct nvme_ns *ns) 601 624 { 602 - nvm_unregister(disk_name); 625 + nvm_unregister(ns->ndev); 603 626 } 604 627 605 628 /* move to shared place when used in multiple places. */
+23 -7
drivers/nvme/host/nvme.h
··· 18 18 #include <linux/pci.h> 19 19 #include <linux/kref.h> 20 20 #include <linux/blk-mq.h> 21 + #include <linux/lightnvm.h> 21 22 22 23 enum { 23 24 /* ··· 155 154 struct nvme_ctrl *ctrl; 156 155 struct request_queue *queue; 157 156 struct gendisk *disk; 157 + struct nvm_dev *ndev; 158 158 struct kref kref; 159 159 int instance; 160 160 ··· 167 165 u16 ms; 168 166 bool ext; 169 167 u8 pi_type; 170 - int type; 171 168 unsigned long flags; 172 169 173 170 #define NVME_NS_REMOVING 0 ··· 293 292 struct nvme_id_ns **id); 294 293 int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); 295 294 int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, 296 - dma_addr_t dma_addr, u32 *result); 295 + void *buffer, size_t buflen, u32 *result); 297 296 int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, 298 - dma_addr_t dma_addr, u32 *result); 297 + void *buffer, size_t buflen, u32 *result); 299 298 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); 300 299 void nvme_start_keep_alive(struct nvme_ctrl *ctrl); 301 300 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); ··· 308 307 309 308 #ifdef CONFIG_NVM 310 309 int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); 311 - int nvme_nvm_register(struct request_queue *q, char *disk_name); 312 - void nvme_nvm_unregister(struct request_queue *q, char *disk_name); 310 + int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node, 311 + const struct attribute_group *attrs); 312 + void nvme_nvm_unregister(struct nvme_ns *ns); 313 + 314 + static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) 315 + { 316 + if (dev->type->devnode) 317 + return dev_to_disk(dev)->private_data; 318 + 319 + return (container_of(dev, struct nvm_dev, dev))->private_data; 320 + } 313 321 #else 314 - static inline int nvme_nvm_register(struct request_queue *q, char *disk_name) 322 + static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, 323 + int node, 324 + const struct attribute_group *attrs) 315 325 { 316 326 return 0; 317 327 } 318 328 319 - static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {}; 329 + static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; 320 330 321 331 static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) 322 332 { 323 333 return 0; 334 + } 335 + static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) 336 + { 337 + return dev_to_disk(dev)->private_data; 324 338 } 325 339 #endif /* CONFIG_NVM */ 326 340
+6 -74
drivers/nvme/host/scsi.c
··· 72 72 #define ALL_LUNS_RETURNED 0x02 73 73 #define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 74 74 #define RESTRICTED_LUNS_RETURNED 0x00 75 - #define NVME_POWER_STATE_START_VALID 0x00 76 - #define NVME_POWER_STATE_ACTIVE 0x01 77 - #define NVME_POWER_STATE_IDLE 0x02 78 - #define NVME_POWER_STATE_STANDBY 0x03 79 - #define NVME_POWER_STATE_LU_CONTROL 0x07 80 - #define POWER_STATE_0 0 81 - #define POWER_STATE_1 1 82 - #define POWER_STATE_2 2 83 - #define POWER_STATE_3 3 84 75 #define DOWNLOAD_SAVE_ACTIVATE 0x05 85 76 #define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E 86 77 #define ACTIVATE_DEFERRED_MICROCODE 0x0F ··· 906 915 kfree(smart_log); 907 916 908 917 /* Get Features for Temp Threshold */ 909 - res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, 918 + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0, 910 919 &feature_resp); 911 920 if (res != NVME_SC_SUCCESS) 912 921 temp_c_thresh = LOG_TEMP_UNKNOWN; ··· 1039 1048 if (len < MODE_PAGE_CACHING_LEN) 1040 1049 return -EINVAL; 1041 1050 1042 - nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, 1051 + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, NULL, 0, 1043 1052 &feature_resp); 1044 1053 res = nvme_trans_status_code(hdr, nvme_sc); 1045 1054 if (res) ··· 1220 1229 1221 1230 /* Start Stop Unit Helper Functions */ 1222 1231 1223 - static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, 1224 - u8 pc, u8 pcmod, u8 start) 1225 - { 1226 - int res; 1227 - int nvme_sc; 1228 - struct nvme_id_ctrl *id_ctrl; 1229 - int lowest_pow_st; /* max npss = lowest power consumption */ 1230 - unsigned ps_desired = 0; 1231 - 1232 - nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); 1233 - res = nvme_trans_status_code(hdr, nvme_sc); 1234 - if (res) 1235 - return res; 1236 - 1237 - lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); 1238 - kfree(id_ctrl); 1239 - 1240 - switch (pc) { 1241 - case NVME_POWER_STATE_START_VALID: 1242 - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ 1243 - if (pcmod == 0 && start == 0x1) 1244 - ps_desired = POWER_STATE_0; 1245 - if (pcmod == 0 && start == 0x0) 1246 - ps_desired = lowest_pow_st; 1247 - break; 1248 - case NVME_POWER_STATE_ACTIVE: 1249 - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ 1250 - if (pcmod == 0) 1251 - ps_desired = POWER_STATE_0; 1252 - break; 1253 - case NVME_POWER_STATE_IDLE: 1254 - /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ 1255 - if (pcmod == 0x0) 1256 - ps_desired = POWER_STATE_1; 1257 - else if (pcmod == 0x1) 1258 - ps_desired = POWER_STATE_2; 1259 - else if (pcmod == 0x2) 1260 - ps_desired = POWER_STATE_3; 1261 - break; 1262 - case NVME_POWER_STATE_STANDBY: 1263 - /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ 1264 - if (pcmod == 0x0) 1265 - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); 1266 - else if (pcmod == 0x1) 1267 - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); 1268 - break; 1269 - case NVME_POWER_STATE_LU_CONTROL: 1270 - default: 1271 - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, 1272 - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, 1273 - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); 1274 - break; 1275 - } 1276 - nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, 1277 - NULL); 1278 - return nvme_trans_status_code(hdr, nvme_sc); 1279 - } 1280 - 1281 1232 static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, 1282 1233 u8 buffer_id) 1283 1234 { ··· 1328 1395 case MODE_PAGE_CACHING: 1329 1396 dword11 
= ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); 1330 1397 nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 1331 - dword11, 0, NULL); 1398 + dword11, NULL, 0, NULL); 1332 1399 res = nvme_trans_status_code(hdr, nvme_sc); 1333 1400 break; 1334 1401 case MODE_PAGE_CONTROL: ··· 2168 2235 static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, 2169 2236 u8 *cmd) 2170 2237 { 2171 - u8 immed, pcmod, pc, no_flush, start; 2238 + u8 immed, pcmod, no_flush, start; 2172 2239 2173 2240 immed = cmd[1] & 0x01; 2174 2241 pcmod = cmd[3] & 0x0f; 2175 - pc = (cmd[4] & 0xf0) >> 4; 2176 2242 no_flush = cmd[4] & 0x04; 2177 2243 start = cmd[4] & 0x01; 2178 2244 ··· 2186 2254 if (res) 2187 2255 return res; 2188 2256 } 2189 - /* Setup the expected power state transition */ 2190 - return nvme_trans_power_state(ns, hdr, pc, pcmod, start); 2257 + 2258 + return 0; 2191 2259 } 2192 2260 } 2193 2261
+88
drivers/nvme/target/admin-cmd.c
··· 14 14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15 15 #include <linux/module.h> 16 16 #include <generated/utsrelease.h> 17 + #include <asm/unaligned.h> 17 18 #include "nvmet.h" 18 19 19 20 u32 nvmet_get_log_page_len(struct nvme_command *cmd) ··· 30 29 return len; 31 30 } 32 31 32 + static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, 33 + struct nvme_smart_log *slog) 34 + { 35 + u16 status; 36 + struct nvmet_ns *ns; 37 + u64 host_reads, host_writes, data_units_read, data_units_written; 38 + 39 + status = NVME_SC_SUCCESS; 40 + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); 41 + if (!ns) { 42 + status = NVME_SC_INVALID_NS; 43 + pr_err("nvmet : Counld not find namespace id : %d\n", 44 + le32_to_cpu(req->cmd->get_log_page.nsid)); 45 + goto out; 46 + } 47 + 48 + host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); 49 + data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); 50 + host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); 51 + data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]); 52 + 53 + put_unaligned_le64(host_reads, &slog->host_reads[0]); 54 + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); 55 + put_unaligned_le64(host_writes, &slog->host_writes[0]); 56 + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); 57 + nvmet_put_namespace(ns); 58 + out: 59 + return status; 60 + } 61 + 62 + static u16 nvmet_get_smart_log_all(struct nvmet_req *req, 63 + struct nvme_smart_log *slog) 64 + { 65 + u16 status; 66 + u64 host_reads = 0, host_writes = 0; 67 + u64 data_units_read = 0, data_units_written = 0; 68 + struct nvmet_ns *ns; 69 + struct nvmet_ctrl *ctrl; 70 + 71 + status = NVME_SC_SUCCESS; 72 + ctrl = req->sq->ctrl; 73 + 74 + rcu_read_lock(); 75 + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { 76 + host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); 77 + data_units_read += 78 + part_stat_read(ns->bdev->bd_part, sectors[READ]); 79 + host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); 80 + data_units_written += 81 + part_stat_read(ns->bdev->bd_part, sectors[WRITE]); 82 + 83 + } 84 + rcu_read_unlock(); 85 + 86 + put_unaligned_le64(host_reads, &slog->host_reads[0]); 87 + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); 88 + put_unaligned_le64(host_writes, &slog->host_writes[0]); 89 + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); 90 + 91 + return status; 92 + } 93 + 94 + static u16 nvmet_get_smart_log(struct nvmet_req *req, 95 + struct nvme_smart_log *slog) 96 + { 97 + u16 status; 98 + 99 + WARN_ON(req == NULL || slog == NULL); 100 + if (req->cmd->get_log_page.nsid == 0xFFFFFFFF) 101 + status = nvmet_get_smart_log_all(req, slog); 102 + else 103 + status = nvmet_get_smart_log_nsid(req, slog); 104 + return status; 105 + } 106 + 33 107 static void nvmet_execute_get_log_page(struct nvmet_req *req) 34 108 { 109 + struct nvme_smart_log *smart_log; 35 110 size_t data_len = nvmet_get_log_page_len(req->cmd); 36 111 void *buf; 37 112 u16 status = 0; ··· 136 59 * available (e.g. units or commands read/written) those aren't 137 60 * persistent over power loss. 
138 61 */ 62 + if (data_len != sizeof(*smart_log)) { 63 + status = NVME_SC_INTERNAL; 64 + goto err; 65 + } 66 + smart_log = buf; 67 + status = nvmet_get_smart_log(req, smart_log); 68 + if (status) { 69 + memset(buf, '\0', data_len); 70 + goto err; 71 + } 139 72 break; 140 73 case 0x03: 141 74 /* ··· 160 73 161 74 status = nvmet_copy_to_sgl(req, 0, buf, data_len); 162 75 76 + err: 163 77 kfree(buf); 164 78 out: 165 79 nvmet_req_complete(req, status);
+2 -1
drivers/nvme/target/io-cmd.c
··· 58 58 59 59 if (req->cmd->rw.opcode == nvme_cmd_write) { 60 60 op = REQ_OP_WRITE; 61 + op_flags = WRITE_ODIRECT; 61 62 if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) 62 63 op_flags |= REQ_FUA; 63 64 } else { ··· 206 205 return 0; 207 206 case nvme_cmd_dsm: 208 207 req->execute = nvmet_execute_dsm; 209 - req->data_len = le32_to_cpu(cmd->dsm.nr) * 208 + req->data_len = le32_to_cpu(cmd->dsm.nr + 1) * 210 209 sizeof(struct nvme_dsm_range); 211 210 return 0; 212 211 default:
+1 -1
fs/befs/linuxvfs.c
··· 789 789 * Will be set to real fs blocksize later. 790 790 * 791 791 * Linux 2.4.10 and later refuse to read blocks smaller than 792 - * the hardsect size for the device. But we also need to read at 792 + * the logical block size for the device. But we also need to read at 793 793 * least 1k to get the second 512 bytes of the volume. 794 794 * -WD 10-26-01 795 795 */
+3 -15
fs/block_dev.c
··· 180 180 struct file *file = iocb->ki_filp; 181 181 struct inode *inode = bdev_file_inode(file); 182 182 183 - if (IS_DAX(inode)) 184 - return dax_do_io(iocb, inode, iter, blkdev_get_block, 185 - NULL, DIO_SKIP_DIO_COUNT); 186 183 return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, 187 184 blkdev_get_block, NULL, NULL, 188 185 DIO_SKIP_DIO_COUNT); ··· 299 302 error = sb->s_op->thaw_super(sb); 300 303 else 301 304 error = thaw_super(sb); 302 - if (error) { 305 + if (error) 303 306 bdev->bd_fsfreeze_count++; 304 - mutex_unlock(&bdev->bd_fsfreeze_mutex); 305 - return error; 306 - } 307 307 out: 308 308 mutex_unlock(&bdev->bd_fsfreeze_mutex); 309 - return 0; 309 + return error; 310 310 } 311 311 EXPORT_SYMBOL(thaw_bdev); 312 312 ··· 1269 1275 bdev->bd_disk = disk; 1270 1276 bdev->bd_queue = disk->queue; 1271 1277 bdev->bd_contains = bdev; 1272 - bdev->bd_inode->i_flags = 0; 1273 1278 1274 1279 if (!partno) { 1275 1280 ret = -ENXIO; ··· 1296 1303 } 1297 1304 } 1298 1305 1299 - if (!ret) { 1306 + if (!ret) 1300 1307 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1301 - if (!bdev_dax_capable(bdev)) 1302 - bdev->bd_inode->i_flags &= ~S_DAX; 1303 - } 1304 1308 1305 1309 /* 1306 1310 * If the device is invalidated, rescan partition ··· 1332 1342 goto out_clear; 1333 1343 } 1334 1344 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1335 - if (!bdev_dax_capable(bdev)) 1336 - bdev->bd_inode->i_flags &= ~S_DAX; 1337 1345 } 1338 1346 } else { 1339 1347 if (bdev->bd_contains == bdev) {
+3 -2
fs/btrfs/inode.c
··· 8412 8412 if (!bio) 8413 8413 return -ENOMEM; 8414 8414 8415 - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); 8415 + bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); 8416 8416 bio->bi_private = dip; 8417 8417 bio->bi_end_io = btrfs_end_dio_bio; 8418 8418 btrfs_io_bio(bio)->logical = file_offset; ··· 8450 8450 start_sector, GFP_NOFS); 8451 8451 if (!bio) 8452 8452 goto out_err; 8453 - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); 8453 + bio_set_op_attrs(bio, bio_op(orig_bio), 8454 + bio_flags(orig_bio)); 8454 8455 bio->bi_private = dip; 8455 8456 bio->bi_end_io = btrfs_end_dio_bio; 8456 8457 btrfs_io_bio(bio)->logical = file_offset;
+1 -2
include/linux/bio.h
··· 1 1 /* 2 - * 2.5 block I/O model 3 - * 4 2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 5 3 * 6 4 * This program is free software; you can redistribute it and/or modify ··· 459 461 460 462 extern void bio_copy_data(struct bio *dst, struct bio *src); 461 463 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); 464 + extern void bio_free_pages(struct bio *bio); 462 465 463 466 extern struct bio *bio_copy_user_iov(struct request_queue *, 464 467 struct rq_map_data *,
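bio_free_pages() is now exported alongside bio_alloc_pages(), so modules no longer need to open-code the page-freeing loop. A hedged sketch of the loop it replaces (the helper name is illustrative):

#include <linux/bio.h>
#include <linux/gfp.h>

static void put_bio_pages_sketch(struct bio *bio)
{
        struct bio_vec *bvec;
        int i;

        /* open-coded equivalent of the newly exported bio_free_pages(bio) */
        bio_for_each_segment_all(bvec, bio, i)
                __free_page(bvec->bv_page);
}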
+1 -1
include/linux/blk-cgroup.h
··· 45 45 spinlock_t lock; 46 46 47 47 struct radix_tree_root blkg_tree; 48 - struct blkcg_gq *blkg_hint; 48 + struct blkcg_gq __rcu *blkg_hint; 49 49 struct hlist_head blkg_list; 50 50 51 51 struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
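The __rcu annotation on blkg_hint documents, and lets sparse check, the access pattern blk-cgroup already uses: the hint is read with rcu_dereference() under rcu_read_lock() and updated with rcu_assign_pointer(). A minimal reader-side sketch (the helper name is illustrative):

#include <linux/rcupdate.h>
#include <linux/blk-cgroup.h>

static struct blkcg_gq *blkg_hint_sketch(struct blkcg *blkcg)
{
        WARN_ON_ONCE(!rcu_read_lock_held());

        /* sparse now warns if the __rcu pointer is dereferenced directly */
        return rcu_dereference(blkcg->blkg_hint);
}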
+13 -15
include/linux/blk-mq.h
··· 2 2 #define BLK_MQ_H 3 3 4 4 #include <linux/blkdev.h> 5 + #include <linux/sbitmap.h> 5 6 6 7 struct blk_mq_tags; 7 8 struct blk_flush_queue; ··· 13 12 int (*notify)(void *data, unsigned long action, unsigned int cpu); 14 13 }; 15 14 16 - struct blk_mq_ctxmap { 17 - unsigned int size; 18 - unsigned int bits_per_word; 19 - struct blk_align_bitmap *map; 20 - }; 21 - 22 15 struct blk_mq_hw_ctx { 23 16 struct { 24 17 spinlock_t lock; 25 18 struct list_head dispatch; 19 + unsigned long state; /* BLK_MQ_S_* flags */ 26 20 } ____cacheline_aligned_in_smp; 27 21 28 - unsigned long state; /* BLK_MQ_S_* flags */ 29 - struct delayed_work run_work; 30 - struct delayed_work delay_work; 22 + struct work_struct run_work; 31 23 cpumask_var_t cpumask; 32 24 int next_cpu; 33 25 int next_cpu_batch; ··· 32 38 33 39 void *driver_data; 34 40 35 - struct blk_mq_ctxmap ctx_map; 41 + struct sbitmap ctx_map; 36 42 37 - unsigned int nr_ctx; 38 43 struct blk_mq_ctx **ctxs; 44 + unsigned int nr_ctx; 39 45 40 46 atomic_t wait_index; 41 47 ··· 43 49 44 50 unsigned long queued; 45 51 unsigned long run; 46 - #define BLK_MQ_MAX_DISPATCH_ORDER 10 52 + #define BLK_MQ_MAX_DISPATCH_ORDER 7 47 53 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; 48 54 49 55 unsigned int numa_node; ··· 51 57 52 58 atomic_t nr_active; 53 59 60 + struct delayed_work delay_work; 61 + 54 62 struct blk_mq_cpu_notifier cpu_notifier; 55 63 struct kobject kobj; 56 64 65 + unsigned long poll_considered; 57 66 unsigned long poll_invoked; 58 67 unsigned long poll_success; 59 68 }; ··· 155 158 BLK_MQ_F_TAG_SHARED = 1 << 1, 156 159 BLK_MQ_F_SG_MERGE = 1 << 2, 157 160 BLK_MQ_F_DEFER_ISSUE = 1 << 4, 161 + BLK_MQ_F_BLOCKING = 1 << 5, 158 162 BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, 159 163 BLK_MQ_F_ALLOC_POLICY_BITS = 1, 160 164 ··· 176 178 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); 177 179 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 178 180 struct request_queue *q); 179 - int blk_mq_register_disk(struct gendisk *); 180 - void blk_mq_unregister_disk(struct gendisk *); 181 + int blk_mq_register_dev(struct device *, struct request_queue *); 182 + void blk_mq_unregister_dev(struct device *, struct request_queue *); 181 183 182 184 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); 183 185 void blk_mq_free_tag_set(struct blk_mq_tag_set *set); ··· 219 221 } 220 222 221 223 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); 222 - struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); 223 224 224 225 int blk_mq_request_started(struct request *rq); 225 226 void blk_mq_start_request(struct request *rq); ··· 229 232 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); 230 233 void blk_mq_cancel_requeue_work(struct request_queue *q); 231 234 void blk_mq_kick_requeue_list(struct request_queue *q); 235 + void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); 232 236 void blk_mq_abort_requeue_list(struct request_queue *q); 233 237 void blk_mq_complete_request(struct request *rq, int error); 234 238
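Two of the additions above are driver-visible: the per-hctx ctx_map is now a generic sbitmap, and BLK_MQ_F_BLOCKING lets a driver declare that its ->queue_rq() may sleep. A hedged sketch of a tag-set setup requesting the blocking behaviour; the sizes and helper name are illustrative:

#include <linux/blk-mq.h>

static int blocking_tagset_sketch(struct blk_mq_tag_set *set,
                                  struct blk_mq_ops *ops)
{
        set->ops = ops;
        set->nr_hw_queues = 1;
        set->queue_depth = 128;
        set->numa_node = NUMA_NO_NODE;
        set->cmd_size = 0;
        set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;

        return blk_mq_alloc_tag_set(set);
}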
+14 -7
include/linux/blk_types.h
··· 16 16 struct io_context; 17 17 struct cgroup_subsys_state; 18 18 typedef void (bio_end_io_t) (struct bio *); 19 - typedef void (bio_destructor_t) (struct bio *); 20 19 21 20 #ifdef CONFIG_BLOCK 22 21 /* ··· 88 89 struct bio_vec bi_inline_vecs[0]; 89 90 }; 90 91 91 - #define BIO_OP_SHIFT (8 * sizeof(unsigned int) - REQ_OP_BITS) 92 + #define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) 93 + #define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) 92 94 #define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) 93 95 94 - #define bio_set_op_attrs(bio, op, op_flags) do { \ 95 - WARN_ON(op >= (1 << REQ_OP_BITS)); \ 96 - (bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1); \ 97 - (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \ 98 - (bio)->bi_opf |= op_flags; \ 96 + #define bio_set_op_attrs(bio, op, op_flags) do { \ 97 + if (__builtin_constant_p(op)) \ 98 + BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ 99 + else \ 100 + WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ 101 + if (__builtin_constant_p(op_flags)) \ 102 + BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ 103 + else \ 104 + WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ 105 + (bio)->bi_opf = bio_flags(bio); \ 106 + (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ 107 + (bio)->bi_opf |= (op_flags); \ 99 108 } while (0) 100 109 101 110 #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
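The reworked bio_set_op_attrs() keeps the REQ_OP_* value in the top bits of bi_opf and now catches op or flag values that would overflow their fields at build time when they are compile-time constants. The companion bio_flags() accessor is what the btrfs change above switches to, so cloning code copies only the flag bits rather than the whole bi_opf. A hedged two-line sketch (the helper name is illustrative):

#include <linux/blk_types.h>

static void clone_op_sketch(struct bio *dst, struct bio *src)
{
        /* split op and flags; passing src->bi_opf as flags would re-shift the op */
        bio_set_op_attrs(dst, bio_op(src), bio_flags(src));
}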
+2 -2
include/linux/blkdev.h
··· 449 449 450 450 struct list_head requeue_list; 451 451 spinlock_t requeue_lock; 452 - struct work_struct requeue_work; 452 + struct delayed_work requeue_work; 453 453 454 454 struct mutex sysfs_lock; 455 455 ··· 1440 1440 return bio_will_gap(req->q, bio, req->bio); 1441 1441 } 1442 1442 1443 - struct work_struct; 1444 1443 int kblockd_schedule_work(struct work_struct *work); 1444 + int kblockd_schedule_work_on(int cpu, struct work_struct *work); 1445 1445 int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); 1446 1446 int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); 1447 1447
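requeue_work becoming a delayed_work is what backs the blk_mq_delay_kick_requeue_list() declaration shown earlier, and kblockd_schedule_work_on() gives blk-mq a way to run the now non-delayed run_work on a particular CPU. A hedged sketch of the delayed requeue kick as a stacking driver might use it, assuming the second argument is in milliseconds as the name suggests (the value and helper name are illustrative):

#include <linux/blk-mq.h>

static void requeue_later_sketch(struct request_queue *q)
{
        /*
         * Requests already placed back on q->requeue_list are re-dispatched
         * after roughly 100ms instead of immediately.
         */
        blk_mq_delay_kick_requeue_list(q, 100);
}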
-1
include/linux/ioprio.h
··· 7 7 /* 8 8 * Gives us 8 prio classes with 13-bits of data for each class 9 9 */ 10 - #define IOPRIO_BITS (16) 11 10 #define IOPRIO_CLASS_SHIFT (13) 12 11 #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) 13 12
+12 -6
include/linux/lightnvm.h
··· 352 352 353 353 /* Backend device */ 354 354 struct request_queue *q; 355 + struct device dev; 356 + struct device *parent_dev; 355 357 char name[DISK_NAME_LEN]; 358 + void *private_data; 356 359 357 360 struct mutex mlock; 358 361 spinlock_t lock; ··· 527 524 unsigned long); 528 525 extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); 529 526 530 - extern int nvm_register(struct request_queue *, char *, 531 - struct nvm_dev_ops *); 532 - extern void nvm_unregister(char *); 527 + extern struct nvm_dev *nvm_alloc_dev(int); 528 + extern int nvm_register(struct nvm_dev *); 529 + extern void nvm_unregister(struct nvm_dev *); 533 530 534 531 void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); 535 532 ··· 578 575 #else /* CONFIG_NVM */ 579 576 struct nvm_dev_ops; 580 577 581 - static inline int nvm_register(struct request_queue *q, char *disk_name, 582 - struct nvm_dev_ops *ops) 578 + static inline struct nvm_dev *nvm_alloc_dev(int node) 579 + { 580 + return ERR_PTR(-EINVAL); 581 + } 582 + static inline int nvm_register(struct nvm_dev *dev) 583 583 { 584 584 return -EINVAL; 585 585 } 586 - static inline void nvm_unregister(char *disk_name) {} 586 + static inline void nvm_unregister(struct nvm_dev *dev) {} 587 587 #endif /* CONFIG_NVM */ 588 588 #endif /* LIGHTNVM.H */
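The lightnvm registration interface now hands the driver an explicit nvm_dev whose lifetime it controls, instead of keying everything off a disk name string. A hedged sketch of the new flow, loosely following the NVMe host glue; the device name and helper name are illustrative and error handling is trimmed:

#include <linux/lightnvm.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/err.h>

static struct nvm_dev *lightnvm_register_sketch(struct request_queue *q,
                                                struct device *parent,
                                                struct nvm_dev_ops *ops)
{
        struct nvm_dev *ndev = nvm_alloc_dev(dev_to_node(parent));

        if (!ndev)
                return ERR_PTR(-ENOMEM);

        ndev->q = q;
        ndev->ops = ops;
        ndev->parent_dev = parent;
        strlcpy(ndev->name, "nvme0n1", DISK_NAME_LEN); /* illustrative name */

        if (nvm_register(ndev)) {
                kfree(ndev);
                return ERR_PTR(-ENODEV);
        }

        return ndev;
}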
+373
include/linux/sbitmap.h
··· 1 + /* 2 + * Fast and scalable bitmaps. 3 + * 4 + * Copyright (C) 2016 Facebook 5 + * Copyright (C) 2013-2014 Jens Axboe 6 + * 7 + * This program is free software; you can redistribute it and/or 8 + * modify it under the terms of the GNU General Public 9 + * License v2 as published by the Free Software Foundation. 10 + * 11 + * This program is distributed in the hope that it will be useful, 12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 + * General Public License for more details. 15 + * 16 + * You should have received a copy of the GNU General Public License 17 + * along with this program. If not, see <https://www.gnu.org/licenses/>. 18 + */ 19 + 20 + #ifndef __LINUX_SCALE_BITMAP_H 21 + #define __LINUX_SCALE_BITMAP_H 22 + 23 + #include <linux/kernel.h> 24 + #include <linux/slab.h> 25 + 26 + /** 27 + * struct sbitmap_word - Word in a &struct sbitmap. 28 + */ 29 + struct sbitmap_word { 30 + /** 31 + * @word: The bitmap word itself. 32 + */ 33 + unsigned long word; 34 + 35 + /** 36 + * @depth: Number of bits being used in @word. 37 + */ 38 + unsigned long depth; 39 + } ____cacheline_aligned_in_smp; 40 + 41 + /** 42 + * struct sbitmap - Scalable bitmap. 43 + * 44 + * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This 45 + * trades off higher memory usage for better scalability. 46 + */ 47 + struct sbitmap { 48 + /** 49 + * @depth: Number of bits used in the whole bitmap. 50 + */ 51 + unsigned int depth; 52 + 53 + /** 54 + * @shift: log2(number of bits used per word) 55 + */ 56 + unsigned int shift; 57 + 58 + /** 59 + * @map_nr: Number of words (cachelines) being used for the bitmap. 60 + */ 61 + unsigned int map_nr; 62 + 63 + /** 64 + * @map: Allocated bitmap. 65 + */ 66 + struct sbitmap_word *map; 67 + }; 68 + 69 + #define SBQ_WAIT_QUEUES 8 70 + #define SBQ_WAKE_BATCH 8 71 + 72 + /** 73 + * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. 74 + */ 75 + struct sbq_wait_state { 76 + /** 77 + * @wait_cnt: Number of frees remaining before we wake up. 78 + */ 79 + atomic_t wait_cnt; 80 + 81 + /** 82 + * @wait: Wait queue. 83 + */ 84 + wait_queue_head_t wait; 85 + } ____cacheline_aligned_in_smp; 86 + 87 + /** 88 + * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free 89 + * bits. 90 + * 91 + * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to 92 + * avoid contention on the wait queue spinlock. This ensures that we don't hit a 93 + * scalability wall when we run out of free bits and have to start putting tasks 94 + * to sleep. 95 + */ 96 + struct sbitmap_queue { 97 + /** 98 + * @sb: Scalable bitmap. 99 + */ 100 + struct sbitmap sb; 101 + 102 + /* 103 + * @alloc_hint: Cache of last successfully allocated or freed bit. 104 + * 105 + * This is per-cpu, which allows multiple users to stick to different 106 + * cachelines until the map is exhausted. 107 + */ 108 + unsigned int __percpu *alloc_hint; 109 + 110 + /** 111 + * @wake_batch: Number of bits which must be freed before we wake up any 112 + * waiters. 113 + */ 114 + unsigned int wake_batch; 115 + 116 + /** 117 + * @wake_index: Next wait queue in @ws to wake up. 118 + */ 119 + atomic_t wake_index; 120 + 121 + /** 122 + * @ws: Wait queues. 123 + */ 124 + struct sbq_wait_state *ws; 125 + 126 + /** 127 + * @round_robin: Allocate bits in strict round-robin order. 
128 + */ 129 + bool round_robin; 130 + }; 131 + 132 + /** 133 + * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. 134 + * @sb: Bitmap to initialize. 135 + * @depth: Number of bits to allocate. 136 + * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if 137 + * given, a good default is chosen. 138 + * @flags: Allocation flags. 139 + * @node: Memory node to allocate on. 140 + * 141 + * Return: Zero on success or negative errno on failure. 142 + */ 143 + int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, 144 + gfp_t flags, int node); 145 + 146 + /** 147 + * sbitmap_free() - Free memory used by a &struct sbitmap. 148 + * @sb: Bitmap to free. 149 + */ 150 + static inline void sbitmap_free(struct sbitmap *sb) 151 + { 152 + kfree(sb->map); 153 + sb->map = NULL; 154 + } 155 + 156 + /** 157 + * sbitmap_resize() - Resize a &struct sbitmap. 158 + * @sb: Bitmap to resize. 159 + * @depth: New number of bits to resize to. 160 + * 161 + * Doesn't reallocate anything. It's up to the caller to ensure that the new 162 + * depth doesn't exceed the depth that the sb was initialized with. 163 + */ 164 + void sbitmap_resize(struct sbitmap *sb, unsigned int depth); 165 + 166 + /** 167 + * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. 168 + * @sb: Bitmap to allocate from. 169 + * @alloc_hint: Hint for where to start searching for a free bit. 170 + * @round_robin: If true, be stricter about allocation order; always allocate 171 + * starting from the last allocated bit. This is less efficient 172 + * than the default behavior (false). 173 + * 174 + * Return: Non-negative allocated bit number if successful, -1 otherwise. 175 + */ 176 + int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); 177 + 178 + /** 179 + * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. 180 + * @sb: Bitmap to check. 181 + * 182 + * Return: true if any bit in the bitmap is set, false otherwise. 183 + */ 184 + bool sbitmap_any_bit_set(const struct sbitmap *sb); 185 + 186 + /** 187 + * sbitmap_any_bit_clear() - Check for an unset bit in a &struct 188 + * sbitmap. 189 + * @sb: Bitmap to check. 190 + * 191 + * Return: true if any bit in the bitmap is clear, false otherwise. 192 + */ 193 + bool sbitmap_any_bit_clear(const struct sbitmap *sb); 194 + 195 + typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); 196 + 197 + /** 198 + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. 199 + * @sb: Bitmap to iterate over. 200 + * @fn: Callback. Should return true to continue or false to break early. 201 + * @data: Pointer to pass to callback. 202 + * 203 + * This is inline even though it's non-trivial so that the function calls to the 204 + * callback will hopefully get optimized away. 
205 + */ 206 + static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, 207 + void *data) 208 + { 209 + unsigned int i; 210 + 211 + for (i = 0; i < sb->map_nr; i++) { 212 + struct sbitmap_word *word = &sb->map[i]; 213 + unsigned int off, nr; 214 + 215 + if (!word->word) 216 + continue; 217 + 218 + nr = 0; 219 + off = i << sb->shift; 220 + while (1) { 221 + nr = find_next_bit(&word->word, word->depth, nr); 222 + if (nr >= word->depth) 223 + break; 224 + 225 + if (!fn(sb, off + nr, data)) 226 + return; 227 + 228 + nr++; 229 + } 230 + } 231 + } 232 + 233 + #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) 234 + #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) 235 + 236 + static inline unsigned long *__sbitmap_word(struct sbitmap *sb, 237 + unsigned int bitnr) 238 + { 239 + return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; 240 + } 241 + 242 + /* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ 243 + 244 + static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) 245 + { 246 + set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); 247 + } 248 + 249 + static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) 250 + { 251 + clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); 252 + } 253 + 254 + static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) 255 + { 256 + return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); 257 + } 258 + 259 + unsigned int sbitmap_weight(const struct sbitmap *sb); 260 + 261 + /** 262 + * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific 263 + * memory node. 264 + * @sbq: Bitmap queue to initialize. 265 + * @depth: See sbitmap_init_node(). 266 + * @shift: See sbitmap_init_node(). 267 + * @round_robin: See sbitmap_get(). 268 + * @flags: Allocation flags. 269 + * @node: Memory node to allocate on. 270 + * 271 + * Return: Zero on success or negative errno on failure. 272 + */ 273 + int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, 274 + int shift, bool round_robin, gfp_t flags, int node); 275 + 276 + /** 277 + * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. 278 + * 279 + * @sbq: Bitmap queue to free. 280 + */ 281 + static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) 282 + { 283 + kfree(sbq->ws); 284 + free_percpu(sbq->alloc_hint); 285 + sbitmap_free(&sbq->sb); 286 + } 287 + 288 + /** 289 + * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. 290 + * @sbq: Bitmap queue to resize. 291 + * @depth: New number of bits to resize to. 292 + * 293 + * Like sbitmap_resize(), this doesn't reallocate anything. It has to do 294 + * some extra work on the &struct sbitmap_queue, so it's not safe to just 295 + * resize the underlying &struct sbitmap. 296 + */ 297 + void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); 298 + 299 + /** 300 + * __sbitmap_queue_get() - Try to allocate a free bit from a &struct 301 + * sbitmap_queue with preemption already disabled. 302 + * @sbq: Bitmap queue to allocate from. 303 + * 304 + * Return: Non-negative allocated bit number if successful, -1 otherwise. 305 + */ 306 + int __sbitmap_queue_get(struct sbitmap_queue *sbq); 307 + 308 + /** 309 + * sbitmap_queue_get() - Try to allocate a free bit from a &struct 310 + * sbitmap_queue. 311 + * @sbq: Bitmap queue to allocate from. 
312 + * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to 313 + * sbitmap_queue_clear()). 314 + * 315 + * Return: Non-negative allocated bit number if successful, -1 otherwise. 316 + */ 317 + static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, 318 + unsigned int *cpu) 319 + { 320 + int nr; 321 + 322 + *cpu = get_cpu(); 323 + nr = __sbitmap_queue_get(sbq); 324 + put_cpu(); 325 + return nr; 326 + } 327 + 328 + /** 329 + * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a 330 + * &struct sbitmap_queue. 331 + * @sbq: Bitmap to free from. 332 + * @nr: Bit number to free. 333 + * @cpu: CPU the bit was allocated on. 334 + */ 335 + void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, 336 + unsigned int cpu); 337 + 338 + static inline int sbq_index_inc(int index) 339 + { 340 + return (index + 1) & (SBQ_WAIT_QUEUES - 1); 341 + } 342 + 343 + static inline void sbq_index_atomic_inc(atomic_t *index) 344 + { 345 + int old = atomic_read(index); 346 + int new = sbq_index_inc(old); 347 + atomic_cmpxchg(index, old, new); 348 + } 349 + 350 + /** 351 + * sbq_wait_ptr() - Get the next wait queue to use for a &struct 352 + * sbitmap_queue. 353 + * @sbq: Bitmap queue to wait on. 354 + * @wait_index: A counter per "user" of @sbq. 355 + */ 356 + static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, 357 + atomic_t *wait_index) 358 + { 359 + struct sbq_wait_state *ws; 360 + 361 + ws = &sbq->ws[atomic_read(wait_index)]; 362 + sbq_index_atomic_inc(wait_index); 363 + return ws; 364 + } 365 + 366 + /** 367 + * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct 368 + * sbitmap_queue. 369 + * @sbq: Bitmap queue to wake up. 370 + */ 371 + void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); 372 + 373 + #endif /* __LINUX_SCALE_BITMAP_H */
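A hedged usage sketch of the sbitmap_queue side of the new API, roughly the shape in which blk-mq-tag now consumes it; the depth and helper names are illustrative:

#include <linux/sbitmap.h>

static int sbq_setup_sketch(struct sbitmap_queue *sbq, unsigned int depth)
{
        /* shift < 0 lets sbitmap pick a sensible bits-per-word value */
        return sbitmap_queue_init_node(sbq, depth, -1, false,
                                       GFP_KERNEL, NUMA_NO_NODE);
}

static int sbq_alloc_free_sketch(struct sbitmap_queue *sbq)
{
        unsigned int cpu;
        int tag;

        tag = sbitmap_queue_get(sbq, &cpu);
        if (tag < 0)
                return -EBUSY;  /* a real user would wait on sbq_wait_ptr() */

        /* ... tag is exclusively owned here ... */

        sbitmap_queue_clear(sbq, tag, cpu);
        return 0;
}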
+1
include/linux/workqueue.h
··· 442 442 int execute_in_process_context(work_func_t fn, struct execute_work *); 443 443 444 444 extern bool flush_work(struct work_struct *work); 445 + extern bool cancel_work(struct work_struct *work); 445 446 extern bool cancel_work_sync(struct work_struct *work); 446 447 447 448 extern bool flush_delayed_work(struct delayed_work *dwork);
+26 -14
kernel/workqueue.c
··· 2974 2974 } 2975 2975 EXPORT_SYMBOL(flush_delayed_work); 2976 2976 2977 + static bool __cancel_work(struct work_struct *work, bool is_dwork) 2978 + { 2979 + unsigned long flags; 2980 + int ret; 2981 + 2982 + do { 2983 + ret = try_to_grab_pending(work, is_dwork, &flags); 2984 + } while (unlikely(ret == -EAGAIN)); 2985 + 2986 + if (unlikely(ret < 0)) 2987 + return false; 2988 + 2989 + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); 2990 + local_irq_restore(flags); 2991 + return ret; 2992 + } 2993 + 2994 + /* 2995 + * See cancel_delayed_work() 2996 + */ 2997 + bool cancel_work(struct work_struct *work) 2998 + { 2999 + return __cancel_work(work, false); 3000 + } 3001 + 2977 3002 /** 2978 3003 * cancel_delayed_work - cancel a delayed work 2979 3004 * @dwork: delayed_work to cancel ··· 3017 2992 */ 3018 2993 bool cancel_delayed_work(struct delayed_work *dwork) 3019 2994 { 3020 - unsigned long flags; 3021 - int ret; 3022 - 3023 - do { 3024 - ret = try_to_grab_pending(&dwork->work, true, &flags); 3025 - } while (unlikely(ret == -EAGAIN)); 3026 - 3027 - if (unlikely(ret < 0)) 3028 - return false; 3029 - 3030 - set_work_pool_and_clear_pending(&dwork->work, 3031 - get_work_pool_id(&dwork->work)); 3032 - local_irq_restore(flags); 3033 - return ret; 2995 + return __cancel_work(&dwork->work, true); 3034 2996 } 3035 2997 EXPORT_SYMBOL(cancel_delayed_work); 3036 2998
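cancel_work() is the non-sleeping counterpart to cancel_work_sync(), added because blk-mq's run_work is now a plain work_struct (see the blk-mq.h change above) and stopping a hardware queue wants to cancel a pending run without waiting for it to finish. A hedged sketch of that intended use:

#include <linux/blk-mq.h>
#include <linux/workqueue.h>

static void stop_hw_queue_sketch(struct blk_mq_hw_ctx *hctx)
{
        cancel_work(&hctx->run_work);
        cancel_delayed_work(&hctx->delay_work);
        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}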
+3
lib/Kconfig
··· 550 550 bool 551 551 select STACKTRACE 552 552 553 + config SBITMAP 554 + bool 555 + 553 556 endmenu
+2
lib/Makefile
··· 227 227 obj-$(CONFIG_UBSAN) += ubsan.o 228 228 229 229 UBSAN_SANITIZE_ubsan.o := n 230 + 231 + obj-$(CONFIG_SBITMAP) += sbitmap.o
+347
lib/sbitmap.c
··· 1 + /* 2 + * Copyright (C) 2016 Facebook 3 + * Copyright (C) 2013-2014 Jens Axboe 4 + * 5 + * This program is free software; you can redistribute it and/or 6 + * modify it under the terms of the GNU General Public 7 + * License v2 as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope that it will be useful, 10 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + * 14 + * You should have received a copy of the GNU General Public License 15 + * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 + */ 17 + 18 + #include <linux/random.h> 19 + #include <linux/sbitmap.h> 20 + 21 + int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, 22 + gfp_t flags, int node) 23 + { 24 + unsigned int bits_per_word; 25 + unsigned int i; 26 + 27 + if (shift < 0) { 28 + shift = ilog2(BITS_PER_LONG); 29 + /* 30 + * If the bitmap is small, shrink the number of bits per word so 31 + * we spread over a few cachelines, at least. If less than 4 32 + * bits, just forget about it, it's not going to work optimally 33 + * anyway. 34 + */ 35 + if (depth >= 4) { 36 + while ((4U << shift) > depth) 37 + shift--; 38 + } 39 + } 40 + bits_per_word = 1U << shift; 41 + if (bits_per_word > BITS_PER_LONG) 42 + return -EINVAL; 43 + 44 + sb->shift = shift; 45 + sb->depth = depth; 46 + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); 47 + 48 + if (depth == 0) { 49 + sb->map = NULL; 50 + return 0; 51 + } 52 + 53 + sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); 54 + if (!sb->map) 55 + return -ENOMEM; 56 + 57 + for (i = 0; i < sb->map_nr; i++) { 58 + sb->map[i].depth = min(depth, bits_per_word); 59 + depth -= sb->map[i].depth; 60 + } 61 + return 0; 62 + } 63 + EXPORT_SYMBOL_GPL(sbitmap_init_node); 64 + 65 + void sbitmap_resize(struct sbitmap *sb, unsigned int depth) 66 + { 67 + unsigned int bits_per_word = 1U << sb->shift; 68 + unsigned int i; 69 + 70 + sb->depth = depth; 71 + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); 72 + 73 + for (i = 0; i < sb->map_nr; i++) { 74 + sb->map[i].depth = min(depth, bits_per_word); 75 + depth -= sb->map[i].depth; 76 + } 77 + } 78 + EXPORT_SYMBOL_GPL(sbitmap_resize); 79 + 80 + static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, 81 + bool wrap) 82 + { 83 + unsigned int orig_hint = hint; 84 + int nr; 85 + 86 + while (1) { 87 + nr = find_next_zero_bit(&word->word, word->depth, hint); 88 + if (unlikely(nr >= word->depth)) { 89 + /* 90 + * We started with an offset, and we didn't reset the 91 + * offset to 0 in a failure case, so start from 0 to 92 + * exhaust the map. 93 + */ 94 + if (orig_hint && hint && wrap) { 95 + hint = orig_hint = 0; 96 + continue; 97 + } 98 + return -1; 99 + } 100 + 101 + if (!test_and_set_bit(nr, &word->word)) 102 + break; 103 + 104 + hint = nr + 1; 105 + if (hint >= word->depth - 1) 106 + hint = 0; 107 + } 108 + 109 + return nr; 110 + } 111 + 112 + int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) 113 + { 114 + unsigned int i, index; 115 + int nr = -1; 116 + 117 + index = SB_NR_TO_INDEX(sb, alloc_hint); 118 + 119 + for (i = 0; i < sb->map_nr; i++) { 120 + nr = __sbitmap_get_word(&sb->map[index], 121 + SB_NR_TO_BIT(sb, alloc_hint), 122 + !round_robin); 123 + if (nr != -1) { 124 + nr += index << sb->shift; 125 + break; 126 + } 127 + 128 + /* Jump to next index. 
*/ 129 + index++; 130 + alloc_hint = index << sb->shift; 131 + 132 + if (index >= sb->map_nr) { 133 + index = 0; 134 + alloc_hint = 0; 135 + } 136 + } 137 + 138 + return nr; 139 + } 140 + EXPORT_SYMBOL_GPL(sbitmap_get); 141 + 142 + bool sbitmap_any_bit_set(const struct sbitmap *sb) 143 + { 144 + unsigned int i; 145 + 146 + for (i = 0; i < sb->map_nr; i++) { 147 + if (sb->map[i].word) 148 + return true; 149 + } 150 + return false; 151 + } 152 + EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); 153 + 154 + bool sbitmap_any_bit_clear(const struct sbitmap *sb) 155 + { 156 + unsigned int i; 157 + 158 + for (i = 0; i < sb->map_nr; i++) { 159 + const struct sbitmap_word *word = &sb->map[i]; 160 + unsigned long ret; 161 + 162 + ret = find_first_zero_bit(&word->word, word->depth); 163 + if (ret < word->depth) 164 + return true; 165 + } 166 + return false; 167 + } 168 + EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); 169 + 170 + unsigned int sbitmap_weight(const struct sbitmap *sb) 171 + { 172 + unsigned int i, weight = 0; 173 + 174 + for (i = 0; i < sb->map_nr; i++) { 175 + const struct sbitmap_word *word = &sb->map[i]; 176 + 177 + weight += bitmap_weight(&word->word, word->depth); 178 + } 179 + return weight; 180 + } 181 + EXPORT_SYMBOL_GPL(sbitmap_weight); 182 + 183 + static unsigned int sbq_calc_wake_batch(unsigned int depth) 184 + { 185 + unsigned int wake_batch; 186 + 187 + /* 188 + * For each batch, we wake up one queue. We need to make sure that our 189 + * batch size is small enough that the full depth of the bitmap is 190 + * enough to wake up all of the queues. 191 + */ 192 + wake_batch = SBQ_WAKE_BATCH; 193 + if (wake_batch > depth / SBQ_WAIT_QUEUES) 194 + wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); 195 + 196 + return wake_batch; 197 + } 198 + 199 + int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, 200 + int shift, bool round_robin, gfp_t flags, int node) 201 + { 202 + int ret; 203 + int i; 204 + 205 + ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node); 206 + if (ret) 207 + return ret; 208 + 209 + sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags); 210 + if (!sbq->alloc_hint) { 211 + sbitmap_free(&sbq->sb); 212 + return -ENOMEM; 213 + } 214 + 215 + if (depth && !round_robin) { 216 + for_each_possible_cpu(i) 217 + *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth; 218 + } 219 + 220 + sbq->wake_batch = sbq_calc_wake_batch(depth); 221 + atomic_set(&sbq->wake_index, 0); 222 + 223 + sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); 224 + if (!sbq->ws) { 225 + free_percpu(sbq->alloc_hint); 226 + sbitmap_free(&sbq->sb); 227 + return -ENOMEM; 228 + } 229 + 230 + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 231 + init_waitqueue_head(&sbq->ws[i].wait); 232 + atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); 233 + } 234 + 235 + sbq->round_robin = round_robin; 236 + return 0; 237 + } 238 + EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); 239 + 240 + void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) 241 + { 242 + sbq->wake_batch = sbq_calc_wake_batch(depth); 243 + sbitmap_resize(&sbq->sb, depth); 244 + } 245 + EXPORT_SYMBOL_GPL(sbitmap_queue_resize); 246 + 247 + int __sbitmap_queue_get(struct sbitmap_queue *sbq) 248 + { 249 + unsigned int hint, depth; 250 + int nr; 251 + 252 + hint = this_cpu_read(*sbq->alloc_hint); 253 + depth = READ_ONCE(sbq->sb.depth); 254 + if (unlikely(hint >= depth)) { 255 + hint = depth ? 
prandom_u32() % depth : 0; 256 + this_cpu_write(*sbq->alloc_hint, hint); 257 + } 258 + nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); 259 + 260 + if (nr == -1) { 261 + /* If the map is full, a hint won't do us much good. */ 262 + this_cpu_write(*sbq->alloc_hint, 0); 263 + } else if (nr == hint || unlikely(sbq->round_robin)) { 264 + /* Only update the hint if we used it. */ 265 + hint = nr + 1; 266 + if (hint >= depth - 1) 267 + hint = 0; 268 + this_cpu_write(*sbq->alloc_hint, hint); 269 + } 270 + 271 + return nr; 272 + } 273 + EXPORT_SYMBOL_GPL(__sbitmap_queue_get); 274 + 275 + static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) 276 + { 277 + int i, wake_index; 278 + 279 + wake_index = atomic_read(&sbq->wake_index); 280 + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 281 + struct sbq_wait_state *ws = &sbq->ws[wake_index]; 282 + 283 + if (waitqueue_active(&ws->wait)) { 284 + int o = atomic_read(&sbq->wake_index); 285 + 286 + if (wake_index != o) 287 + atomic_cmpxchg(&sbq->wake_index, o, wake_index); 288 + return ws; 289 + } 290 + 291 + wake_index = sbq_index_inc(wake_index); 292 + } 293 + 294 + return NULL; 295 + } 296 + 297 + static void sbq_wake_up(struct sbitmap_queue *sbq) 298 + { 299 + struct sbq_wait_state *ws; 300 + int wait_cnt; 301 + 302 + /* Ensure that the wait list checks occur after clear_bit(). */ 303 + smp_mb(); 304 + 305 + ws = sbq_wake_ptr(sbq); 306 + if (!ws) 307 + return; 308 + 309 + wait_cnt = atomic_dec_return(&ws->wait_cnt); 310 + if (unlikely(wait_cnt < 0)) 311 + wait_cnt = atomic_inc_return(&ws->wait_cnt); 312 + if (wait_cnt == 0) { 313 + atomic_add(sbq->wake_batch, &ws->wait_cnt); 314 + sbq_index_atomic_inc(&sbq->wake_index); 315 + wake_up(&ws->wait); 316 + } 317 + } 318 + 319 + void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, 320 + unsigned int cpu) 321 + { 322 + sbitmap_clear_bit(&sbq->sb, nr); 323 + sbq_wake_up(sbq); 324 + if (likely(!sbq->round_robin && nr < sbq->sb.depth)) 325 + *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; 326 + } 327 + EXPORT_SYMBOL_GPL(sbitmap_queue_clear); 328 + 329 + void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) 330 + { 331 + int i, wake_index; 332 + 333 + /* 334 + * Make sure all changes prior to this are visible from other CPUs. 335 + */ 336 + smp_mb(); 337 + wake_index = atomic_read(&sbq->wake_index); 338 + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 339 + struct sbq_wait_state *ws = &sbq->ws[wake_index]; 340 + 341 + if (waitqueue_active(&ws->wait)) 342 + wake_up(&ws->wait); 343 + 344 + wake_index = sbq_index_inc(wake_index); 345 + } 346 + } 347 + EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);
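For completeness, a hedged sketch of the plain (non-queue) sbitmap interface, which is the form blk-mq now uses for hctx->ctx_map; the callback and helper names are illustrative:

#include <linux/sbitmap.h>

static bool report_bit_sketch(struct sbitmap *sb, unsigned int bitnr, void *data)
{
        pr_info("bit %u is set\n", bitnr);
        return true;    /* keep iterating */
}

static int plain_sbitmap_sketch(void)
{
        struct sbitmap sb;
        int ret;

        ret = sbitmap_init_node(&sb, 64, -1, GFP_KERNEL, NUMA_NO_NODE);
        if (ret)
                return ret;

        sbitmap_set_bit(&sb, 3);
        if (sbitmap_any_bit_set(&sb))
                sbitmap_for_each_set(&sb, report_bit_sketch, NULL);

        sbitmap_free(&sb);
        return 0;
}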