Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.15-rc8, 2058 lines, 49 kB
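
The file below is the null_blk test block driver. Its configfs interface (nullb_group_make_item(), the NULLB_DEVICE_ATTR() attributes, and the power attribute defined further down) is driven from userspace; here is a minimal sketch of that, assuming configfs is mounted at the conventional /sys/kernel/config and using a made-up device name test0 (both assumptions, not taken from the file):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_attr(const char *dir, const char *attr, const char *val)
{
	char path[256];
	int fd;

	snprintf(path, sizeof(path), "%s/%s", dir, attr);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* assumed configfs mount point and device name */
	const char *dir = "/sys/kernel/config/nullb/test0";

	/* mkdir triggers nullb_group_make_item() -> null_alloc_dev() */
	if (mkdir(dir, 0755) && errno != EEXIST)
		return 1;

	/* attribute files correspond to the NULLB_DEVICE_ATTR() entries */
	write_attr(dir, "memory_backed", "1");
	write_attr(dir, "size", "1024");	/* device size in MB */

	/* writing 1 to "power" ends up in null_add_dev() */
	return write_attr(dir, "power", "1") ? 1 : 0;
}

Writing "1" to power is what eventually calls null_add_dev() and registers the nullb<index> disk.
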
/*
 * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
 * Shaohua Li <shli@fb.com>
 */
#include <linux/module.h>

#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/hrtimer.h>
#include <linux/lightnvm.h>
#include <linux/configfs.h>
#include <linux/badblocks.h>

#define SECTOR_SHIFT		9
#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
#define SECTOR_SIZE		(1 << SECTOR_SHIFT)
#define SECTOR_MASK		(PAGE_SECTORS - 1)

#define FREE_BATCH		16

#define TICKS_PER_SEC		50ULL
#define TIMER_INTERVAL		(NSEC_PER_SEC / TICKS_PER_SEC)

static inline u64 mb_per_tick(int mbps)
{
	return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
}

struct nullb_cmd {
	struct list_head list;
	struct llist_node ll_list;
	struct __call_single_data csd;
	struct request *rq;
	struct bio *bio;
	unsigned int tag;
	blk_status_t error;
	struct nullb_queue *nq;
	struct hrtimer timer;
};

struct nullb_queue {
	unsigned long *tag_map;
	wait_queue_head_t wait;
	unsigned int queue_depth;
	struct nullb_device *dev;

	struct nullb_cmd *cmds;
};

/*
 * Status flags for nullb_device.
 *
 * CONFIGURED:	Device has been configured and turned on. Cannot reconfigure.
 * UP:		Device is currently on and visible in userspace.
 * THROTTLED:	Device is being throttled.
 * CACHE:	Device is using a write-back cache.
 */
enum nullb_device_flags {
	NULLB_DEV_FL_CONFIGURED	= 0,
	NULLB_DEV_FL_UP		= 1,
	NULLB_DEV_FL_THROTTLED	= 2,
	NULLB_DEV_FL_CACHE	= 3,
};

/*
 * nullb_page is a page in memory for nullb devices.
 *
 * @page:	The page holding the data.
 * @bitmap:	The bitmap represents which sector in the page has data.
 *		Each bit represents one block size. For example, sector 8
 *		will use the 7th bit
 * The highest 2 bits of bitmap are for special purpose. LOCK means the cache
 * page is being flushing to storage. FREE means the cache page is freed and
 * should be skipped from flushing to storage. Please see
 * null_make_cache_space
 */
struct nullb_page {
	struct page *page;
	unsigned long bitmap;
};
#define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1)
#define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2)

struct nullb_device {
	struct nullb *nullb;
	struct config_item item;
	struct radix_tree_root data;	/* data stored in the disk */
	struct radix_tree_root cache;	/* disk cache data */
	unsigned long flags;		/* device flags */
	unsigned int curr_cache;
	struct badblocks badblocks;

	unsigned long size;		/* device size in MB */
	unsigned long completion_nsec;	/* time in ns to complete a request */
	unsigned long cache_size;	/* disk cache size in MB */
	unsigned int submit_queues;	/* number of submission queues */
	unsigned int home_node;		/* home node for the device */
	unsigned int queue_mode;	/* block interface */
	unsigned int blocksize;		/* block size */
	unsigned int irqmode;		/* IRQ completion handler */
	unsigned int hw_queue_depth;	/* queue depth */
	unsigned int index;		/* index of the disk, only valid with a disk */
	unsigned int mbps;		/* Bandwidth throttle cap (in MB/s) */
	bool use_lightnvm;		/* register as a LightNVM device */
	bool blocking;			/* blocking blk-mq device */
	bool use_per_node_hctx;		/* use per-node allocation for hardware context */
	bool power;			/* power on/off the device */
	bool memory_backed;		/* if data is stored in memory */
	bool discard;			/* if support discard */
};

struct nullb {
	struct nullb_device *dev;
	struct list_head list;
	unsigned int index;
	struct request_queue *q;
	struct gendisk *disk;
	struct nvm_dev *ndev;
	struct blk_mq_tag_set *tag_set;
	struct blk_mq_tag_set __tag_set;
	unsigned int queue_depth;
	atomic_long_t cur_bytes;
	struct hrtimer bw_timer;
	unsigned long cache_flush_pos;
	spinlock_t lock;

	struct nullb_queue *queues;
	unsigned int nr_queues;
	char disk_name[DISK_NAME_LEN];
};

static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
static DEFINE_IDA(nullb_indexes);
static struct kmem_cache *ppa_cache;
static struct blk_mq_tag_set tag_set;

enum {
	NULL_IRQ_NONE		= 0,
	NULL_IRQ_SOFTIRQ	= 1,
	NULL_IRQ_TIMER		= 2,
};

enum {
	NULL_Q_BIO		= 0,
	NULL_Q_RQ		= 1,
	NULL_Q_MQ		= 2,
};

static int g_no_sched;
module_param_named(no_sched, g_no_sched, int, S_IRUGO);
MODULE_PARM_DESC(no_sched, "No io scheduler");

static int g_submit_queues = 1;
module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");

static int g_home_node = NUMA_NO_NODE;
module_param_named(home_node, g_home_node, int, S_IRUGO);
MODULE_PARM_DESC(home_node, "Home node for the device");

static int g_queue_mode = NULL_Q_MQ;

static int null_param_store_val(const char *str, int *val, int min, int max)
{
	int ret, new_val;

	ret = kstrtoint(str, 10, &new_val);
	if (ret)
		return -EINVAL;

	if (new_val < min || new_val > max)
		return -EINVAL;

	*val = new_val;
	return 0;
}

static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
{
	return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
}

static const struct kernel_param_ops null_queue_mode_param_ops = {
	.set	= null_set_queue_mode,
	.get	= param_get_int,
};

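/*
 * Illustrative note (not part of the original source): a worked example of
 * the bandwidth-throttle bookkeeping defined above. The per-tick refill is
 * mb_per_tick(mbps) = (1 << 20) / TICKS_PER_SEC * mbps bytes, credited every
 * TIMER_INTERVAL = NSEC_PER_SEC / 50 = 20 ms. With mbps = 100 this gives
 * (1048576 / 50) * 100 = 20971 * 100 = 2,097,100 bytes per tick, i.e. about
 * 100 MB written per second across the 50 ticks. nullb_bwtimer_fn() later in
 * this file restores nullb->cur_bytes to this budget on each tick (and lets
 * the timer lapse once a full tick passes with nothing consumed), while
 * null_handle_cmd() debits blk_rq_bytes(rq) from it for every request.
 */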
196device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO); 197MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); 198 199static int g_gb = 250; 200module_param_named(gb, g_gb, int, S_IRUGO); 201MODULE_PARM_DESC(gb, "Size in GB"); 202 203static int g_bs = 512; 204module_param_named(bs, g_bs, int, S_IRUGO); 205MODULE_PARM_DESC(bs, "Block size (in bytes)"); 206 207static int nr_devices = 1; 208module_param(nr_devices, int, S_IRUGO); 209MODULE_PARM_DESC(nr_devices, "Number of devices to register"); 210 211static bool g_use_lightnvm; 212module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO); 213MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device"); 214 215static bool g_blocking; 216module_param_named(blocking, g_blocking, bool, S_IRUGO); 217MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); 218 219static bool shared_tags; 220module_param(shared_tags, bool, S_IRUGO); 221MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); 222 223static int g_irqmode = NULL_IRQ_SOFTIRQ; 224 225static int null_set_irqmode(const char *str, const struct kernel_param *kp) 226{ 227 return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, 228 NULL_IRQ_TIMER); 229} 230 231static const struct kernel_param_ops null_irqmode_param_ops = { 232 .set = null_set_irqmode, 233 .get = param_get_int, 234}; 235 236device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO); 237MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); 238 239static unsigned long g_completion_nsec = 10000; 240module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO); 241MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); 242 243static int g_hw_queue_depth = 64; 244module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO); 245MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); 246 247static bool g_use_per_node_hctx; 248module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO); 249MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); 250 251static struct nullb_device *null_alloc_dev(void); 252static void null_free_dev(struct nullb_device *dev); 253static void null_del_dev(struct nullb *nullb); 254static int null_add_dev(struct nullb_device *dev); 255static void null_free_device_storage(struct nullb_device *dev, bool is_cache); 256 257static inline struct nullb_device *to_nullb_device(struct config_item *item) 258{ 259 return item ? 
container_of(item, struct nullb_device, item) : NULL; 260} 261 262static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) 263{ 264 return snprintf(page, PAGE_SIZE, "%u\n", val); 265} 266 267static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, 268 char *page) 269{ 270 return snprintf(page, PAGE_SIZE, "%lu\n", val); 271} 272 273static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) 274{ 275 return snprintf(page, PAGE_SIZE, "%u\n", val); 276} 277 278static ssize_t nullb_device_uint_attr_store(unsigned int *val, 279 const char *page, size_t count) 280{ 281 unsigned int tmp; 282 int result; 283 284 result = kstrtouint(page, 0, &tmp); 285 if (result) 286 return result; 287 288 *val = tmp; 289 return count; 290} 291 292static ssize_t nullb_device_ulong_attr_store(unsigned long *val, 293 const char *page, size_t count) 294{ 295 int result; 296 unsigned long tmp; 297 298 result = kstrtoul(page, 0, &tmp); 299 if (result) 300 return result; 301 302 *val = tmp; 303 return count; 304} 305 306static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, 307 size_t count) 308{ 309 bool tmp; 310 int result; 311 312 result = kstrtobool(page, &tmp); 313 if (result) 314 return result; 315 316 *val = tmp; 317 return count; 318} 319 320/* The following macro should only be used with TYPE = {uint, ulong, bool}. */ 321#define NULLB_DEVICE_ATTR(NAME, TYPE) \ 322static ssize_t \ 323nullb_device_##NAME##_show(struct config_item *item, char *page) \ 324{ \ 325 return nullb_device_##TYPE##_attr_show( \ 326 to_nullb_device(item)->NAME, page); \ 327} \ 328static ssize_t \ 329nullb_device_##NAME##_store(struct config_item *item, const char *page, \ 330 size_t count) \ 331{ \ 332 if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags)) \ 333 return -EBUSY; \ 334 return nullb_device_##TYPE##_attr_store( \ 335 &to_nullb_device(item)->NAME, page, count); \ 336} \ 337CONFIGFS_ATTR(nullb_device_, NAME); 338 339NULLB_DEVICE_ATTR(size, ulong); 340NULLB_DEVICE_ATTR(completion_nsec, ulong); 341NULLB_DEVICE_ATTR(submit_queues, uint); 342NULLB_DEVICE_ATTR(home_node, uint); 343NULLB_DEVICE_ATTR(queue_mode, uint); 344NULLB_DEVICE_ATTR(blocksize, uint); 345NULLB_DEVICE_ATTR(irqmode, uint); 346NULLB_DEVICE_ATTR(hw_queue_depth, uint); 347NULLB_DEVICE_ATTR(index, uint); 348NULLB_DEVICE_ATTR(use_lightnvm, bool); 349NULLB_DEVICE_ATTR(blocking, bool); 350NULLB_DEVICE_ATTR(use_per_node_hctx, bool); 351NULLB_DEVICE_ATTR(memory_backed, bool); 352NULLB_DEVICE_ATTR(discard, bool); 353NULLB_DEVICE_ATTR(mbps, uint); 354NULLB_DEVICE_ATTR(cache_size, ulong); 355 356static ssize_t nullb_device_power_show(struct config_item *item, char *page) 357{ 358 return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); 359} 360 361static ssize_t nullb_device_power_store(struct config_item *item, 362 const char *page, size_t count) 363{ 364 struct nullb_device *dev = to_nullb_device(item); 365 bool newp = false; 366 ssize_t ret; 367 368 ret = nullb_device_bool_attr_store(&newp, page, count); 369 if (ret < 0) 370 return ret; 371 372 if (!dev->power && newp) { 373 if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) 374 return count; 375 if (null_add_dev(dev)) { 376 clear_bit(NULLB_DEV_FL_UP, &dev->flags); 377 return -ENOMEM; 378 } 379 380 set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); 381 dev->power = newp; 382 } else if (dev->power && !newp) { 383 mutex_lock(&lock); 384 dev->power = newp; 385 null_del_dev(dev->nullb); 386 mutex_unlock(&lock); 387 
clear_bit(NULLB_DEV_FL_UP, &dev->flags); 388 } 389 390 return count; 391} 392 393CONFIGFS_ATTR(nullb_device_, power); 394 395static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) 396{ 397 struct nullb_device *t_dev = to_nullb_device(item); 398 399 return badblocks_show(&t_dev->badblocks, page, 0); 400} 401 402static ssize_t nullb_device_badblocks_store(struct config_item *item, 403 const char *page, size_t count) 404{ 405 struct nullb_device *t_dev = to_nullb_device(item); 406 char *orig, *buf, *tmp; 407 u64 start, end; 408 int ret; 409 410 orig = kstrndup(page, count, GFP_KERNEL); 411 if (!orig) 412 return -ENOMEM; 413 414 buf = strstrip(orig); 415 416 ret = -EINVAL; 417 if (buf[0] != '+' && buf[0] != '-') 418 goto out; 419 tmp = strchr(&buf[1], '-'); 420 if (!tmp) 421 goto out; 422 *tmp = '\0'; 423 ret = kstrtoull(buf + 1, 0, &start); 424 if (ret) 425 goto out; 426 ret = kstrtoull(tmp + 1, 0, &end); 427 if (ret) 428 goto out; 429 ret = -EINVAL; 430 if (start > end) 431 goto out; 432 /* enable badblocks */ 433 cmpxchg(&t_dev->badblocks.shift, -1, 0); 434 if (buf[0] == '+') 435 ret = badblocks_set(&t_dev->badblocks, start, 436 end - start + 1, 1); 437 else 438 ret = badblocks_clear(&t_dev->badblocks, start, 439 end - start + 1); 440 if (ret == 0) 441 ret = count; 442out: 443 kfree(orig); 444 return ret; 445} 446CONFIGFS_ATTR(nullb_device_, badblocks); 447 448static struct configfs_attribute *nullb_device_attrs[] = { 449 &nullb_device_attr_size, 450 &nullb_device_attr_completion_nsec, 451 &nullb_device_attr_submit_queues, 452 &nullb_device_attr_home_node, 453 &nullb_device_attr_queue_mode, 454 &nullb_device_attr_blocksize, 455 &nullb_device_attr_irqmode, 456 &nullb_device_attr_hw_queue_depth, 457 &nullb_device_attr_index, 458 &nullb_device_attr_use_lightnvm, 459 &nullb_device_attr_blocking, 460 &nullb_device_attr_use_per_node_hctx, 461 &nullb_device_attr_power, 462 &nullb_device_attr_memory_backed, 463 &nullb_device_attr_discard, 464 &nullb_device_attr_mbps, 465 &nullb_device_attr_cache_size, 466 &nullb_device_attr_badblocks, 467 NULL, 468}; 469 470static void nullb_device_release(struct config_item *item) 471{ 472 struct nullb_device *dev = to_nullb_device(item); 473 474 null_free_device_storage(dev, false); 475 null_free_dev(dev); 476} 477 478static struct configfs_item_operations nullb_device_ops = { 479 .release = nullb_device_release, 480}; 481 482static const struct config_item_type nullb_device_type = { 483 .ct_item_ops = &nullb_device_ops, 484 .ct_attrs = nullb_device_attrs, 485 .ct_owner = THIS_MODULE, 486}; 487 488static struct 489config_item *nullb_group_make_item(struct config_group *group, const char *name) 490{ 491 struct nullb_device *dev; 492 493 dev = null_alloc_dev(); 494 if (!dev) 495 return ERR_PTR(-ENOMEM); 496 497 config_item_init_type_name(&dev->item, name, &nullb_device_type); 498 499 return &dev->item; 500} 501 502static void 503nullb_group_drop_item(struct config_group *group, struct config_item *item) 504{ 505 struct nullb_device *dev = to_nullb_device(item); 506 507 if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { 508 mutex_lock(&lock); 509 dev->power = false; 510 null_del_dev(dev->nullb); 511 mutex_unlock(&lock); 512 } 513 514 config_item_put(item); 515} 516 517static ssize_t memb_group_features_show(struct config_item *item, char *page) 518{ 519 return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n"); 520} 521 522CONFIGFS_ATTR_RO(memb_group_, features); 523 524static struct configfs_attribute 
*nullb_group_attrs[] = { 525 &memb_group_attr_features, 526 NULL, 527}; 528 529static struct configfs_group_operations nullb_group_ops = { 530 .make_item = nullb_group_make_item, 531 .drop_item = nullb_group_drop_item, 532}; 533 534static const struct config_item_type nullb_group_type = { 535 .ct_group_ops = &nullb_group_ops, 536 .ct_attrs = nullb_group_attrs, 537 .ct_owner = THIS_MODULE, 538}; 539 540static struct configfs_subsystem nullb_subsys = { 541 .su_group = { 542 .cg_item = { 543 .ci_namebuf = "nullb", 544 .ci_type = &nullb_group_type, 545 }, 546 }, 547}; 548 549static inline int null_cache_active(struct nullb *nullb) 550{ 551 return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); 552} 553 554static struct nullb_device *null_alloc_dev(void) 555{ 556 struct nullb_device *dev; 557 558 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 559 if (!dev) 560 return NULL; 561 INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); 562 INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); 563 if (badblocks_init(&dev->badblocks, 0)) { 564 kfree(dev); 565 return NULL; 566 } 567 568 dev->size = g_gb * 1024; 569 dev->completion_nsec = g_completion_nsec; 570 dev->submit_queues = g_submit_queues; 571 dev->home_node = g_home_node; 572 dev->queue_mode = g_queue_mode; 573 dev->blocksize = g_bs; 574 dev->irqmode = g_irqmode; 575 dev->hw_queue_depth = g_hw_queue_depth; 576 dev->use_lightnvm = g_use_lightnvm; 577 dev->blocking = g_blocking; 578 dev->use_per_node_hctx = g_use_per_node_hctx; 579 return dev; 580} 581 582static void null_free_dev(struct nullb_device *dev) 583{ 584 if (!dev) 585 return; 586 587 badblocks_exit(&dev->badblocks); 588 kfree(dev); 589} 590 591static void put_tag(struct nullb_queue *nq, unsigned int tag) 592{ 593 clear_bit_unlock(tag, nq->tag_map); 594 595 if (waitqueue_active(&nq->wait)) 596 wake_up(&nq->wait); 597} 598 599static unsigned int get_tag(struct nullb_queue *nq) 600{ 601 unsigned int tag; 602 603 do { 604 tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); 605 if (tag >= nq->queue_depth) 606 return -1U; 607 } while (test_and_set_bit_lock(tag, nq->tag_map)); 608 609 return tag; 610} 611 612static void free_cmd(struct nullb_cmd *cmd) 613{ 614 put_tag(cmd->nq, cmd->tag); 615} 616 617static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); 618 619static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) 620{ 621 struct nullb_cmd *cmd; 622 unsigned int tag; 623 624 tag = get_tag(nq); 625 if (tag != -1U) { 626 cmd = &nq->cmds[tag]; 627 cmd->tag = tag; 628 cmd->nq = nq; 629 if (nq->dev->irqmode == NULL_IRQ_TIMER) { 630 hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, 631 HRTIMER_MODE_REL); 632 cmd->timer.function = null_cmd_timer_expired; 633 } 634 return cmd; 635 } 636 637 return NULL; 638} 639 640static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) 641{ 642 struct nullb_cmd *cmd; 643 DEFINE_WAIT(wait); 644 645 cmd = __alloc_cmd(nq); 646 if (cmd || !can_wait) 647 return cmd; 648 649 do { 650 prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); 651 cmd = __alloc_cmd(nq); 652 if (cmd) 653 break; 654 655 io_schedule(); 656 } while (1); 657 658 finish_wait(&nq->wait, &wait); 659 return cmd; 660} 661 662static void end_cmd(struct nullb_cmd *cmd) 663{ 664 struct request_queue *q = NULL; 665 int queue_mode = cmd->nq->dev->queue_mode; 666 667 if (cmd->rq) 668 q = cmd->rq->q; 669 670 switch (queue_mode) { 671 case NULL_Q_MQ: 672 blk_mq_end_request(cmd->rq, cmd->error); 673 return; 674 case NULL_Q_RQ: 675 INIT_LIST_HEAD(&cmd->rq->queuelist); 676 blk_end_request_all(cmd->rq, 
cmd->error); 677 break; 678 case NULL_Q_BIO: 679 cmd->bio->bi_status = cmd->error; 680 bio_endio(cmd->bio); 681 break; 682 } 683 684 free_cmd(cmd); 685 686 /* Restart queue if needed, as we are freeing a tag */ 687 if (queue_mode == NULL_Q_RQ && blk_queue_stopped(q)) { 688 unsigned long flags; 689 690 spin_lock_irqsave(q->queue_lock, flags); 691 blk_start_queue_async(q); 692 spin_unlock_irqrestore(q->queue_lock, flags); 693 } 694} 695 696static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) 697{ 698 end_cmd(container_of(timer, struct nullb_cmd, timer)); 699 700 return HRTIMER_NORESTART; 701} 702 703static void null_cmd_end_timer(struct nullb_cmd *cmd) 704{ 705 ktime_t kt = cmd->nq->dev->completion_nsec; 706 707 hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); 708} 709 710static void null_softirq_done_fn(struct request *rq) 711{ 712 struct nullb *nullb = rq->q->queuedata; 713 714 if (nullb->dev->queue_mode == NULL_Q_MQ) 715 end_cmd(blk_mq_rq_to_pdu(rq)); 716 else 717 end_cmd(rq->special); 718} 719 720static struct nullb_page *null_alloc_page(gfp_t gfp_flags) 721{ 722 struct nullb_page *t_page; 723 724 t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); 725 if (!t_page) 726 goto out; 727 728 t_page->page = alloc_pages(gfp_flags, 0); 729 if (!t_page->page) 730 goto out_freepage; 731 732 t_page->bitmap = 0; 733 return t_page; 734out_freepage: 735 kfree(t_page); 736out: 737 return NULL; 738} 739 740static void null_free_page(struct nullb_page *t_page) 741{ 742 __set_bit(NULLB_PAGE_FREE, &t_page->bitmap); 743 if (test_bit(NULLB_PAGE_LOCK, &t_page->bitmap)) 744 return; 745 __free_page(t_page->page); 746 kfree(t_page); 747} 748 749static void null_free_sector(struct nullb *nullb, sector_t sector, 750 bool is_cache) 751{ 752 unsigned int sector_bit; 753 u64 idx; 754 struct nullb_page *t_page, *ret; 755 struct radix_tree_root *root; 756 757 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 758 idx = sector >> PAGE_SECTORS_SHIFT; 759 sector_bit = (sector & SECTOR_MASK); 760 761 t_page = radix_tree_lookup(root, idx); 762 if (t_page) { 763 __clear_bit(sector_bit, &t_page->bitmap); 764 765 if (!t_page->bitmap) { 766 ret = radix_tree_delete_item(root, idx, t_page); 767 WARN_ON(ret != t_page); 768 null_free_page(ret); 769 if (is_cache) 770 nullb->dev->curr_cache -= PAGE_SIZE; 771 } 772 } 773} 774 775static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, 776 struct nullb_page *t_page, bool is_cache) 777{ 778 struct radix_tree_root *root; 779 780 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 781 782 if (radix_tree_insert(root, idx, t_page)) { 783 null_free_page(t_page); 784 t_page = radix_tree_lookup(root, idx); 785 WARN_ON(!t_page || t_page->page->index != idx); 786 } else if (is_cache) 787 nullb->dev->curr_cache += PAGE_SIZE; 788 789 return t_page; 790} 791 792static void null_free_device_storage(struct nullb_device *dev, bool is_cache) 793{ 794 unsigned long pos = 0; 795 int nr_pages; 796 struct nullb_page *ret, *t_pages[FREE_BATCH]; 797 struct radix_tree_root *root; 798 799 root = is_cache ? 
&dev->cache : &dev->data; 800 801 do { 802 int i; 803 804 nr_pages = radix_tree_gang_lookup(root, 805 (void **)t_pages, pos, FREE_BATCH); 806 807 for (i = 0; i < nr_pages; i++) { 808 pos = t_pages[i]->page->index; 809 ret = radix_tree_delete_item(root, pos, t_pages[i]); 810 WARN_ON(ret != t_pages[i]); 811 null_free_page(ret); 812 } 813 814 pos++; 815 } while (nr_pages == FREE_BATCH); 816 817 if (is_cache) 818 dev->curr_cache = 0; 819} 820 821static struct nullb_page *__null_lookup_page(struct nullb *nullb, 822 sector_t sector, bool for_write, bool is_cache) 823{ 824 unsigned int sector_bit; 825 u64 idx; 826 struct nullb_page *t_page; 827 struct radix_tree_root *root; 828 829 idx = sector >> PAGE_SECTORS_SHIFT; 830 sector_bit = (sector & SECTOR_MASK); 831 832 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 833 t_page = radix_tree_lookup(root, idx); 834 WARN_ON(t_page && t_page->page->index != idx); 835 836 if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap))) 837 return t_page; 838 839 return NULL; 840} 841 842static struct nullb_page *null_lookup_page(struct nullb *nullb, 843 sector_t sector, bool for_write, bool ignore_cache) 844{ 845 struct nullb_page *page = NULL; 846 847 if (!ignore_cache) 848 page = __null_lookup_page(nullb, sector, for_write, true); 849 if (page) 850 return page; 851 return __null_lookup_page(nullb, sector, for_write, false); 852} 853 854static struct nullb_page *null_insert_page(struct nullb *nullb, 855 sector_t sector, bool ignore_cache) 856{ 857 u64 idx; 858 struct nullb_page *t_page; 859 860 t_page = null_lookup_page(nullb, sector, true, ignore_cache); 861 if (t_page) 862 return t_page; 863 864 spin_unlock_irq(&nullb->lock); 865 866 t_page = null_alloc_page(GFP_NOIO); 867 if (!t_page) 868 goto out_lock; 869 870 if (radix_tree_preload(GFP_NOIO)) 871 goto out_freepage; 872 873 spin_lock_irq(&nullb->lock); 874 idx = sector >> PAGE_SECTORS_SHIFT; 875 t_page->page->index = idx; 876 t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); 877 radix_tree_preload_end(); 878 879 return t_page; 880out_freepage: 881 null_free_page(t_page); 882out_lock: 883 spin_lock_irq(&nullb->lock); 884 return null_lookup_page(nullb, sector, true, ignore_cache); 885} 886 887static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) 888{ 889 int i; 890 unsigned int offset; 891 u64 idx; 892 struct nullb_page *t_page, *ret; 893 void *dst, *src; 894 895 idx = c_page->page->index; 896 897 t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); 898 899 __clear_bit(NULLB_PAGE_LOCK, &c_page->bitmap); 900 if (test_bit(NULLB_PAGE_FREE, &c_page->bitmap)) { 901 null_free_page(c_page); 902 if (t_page && t_page->bitmap == 0) { 903 ret = radix_tree_delete_item(&nullb->dev->data, 904 idx, t_page); 905 null_free_page(t_page); 906 } 907 return 0; 908 } 909 910 if (!t_page) 911 return -ENOMEM; 912 913 src = kmap_atomic(c_page->page); 914 dst = kmap_atomic(t_page->page); 915 916 for (i = 0; i < PAGE_SECTORS; 917 i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { 918 if (test_bit(i, &c_page->bitmap)) { 919 offset = (i << SECTOR_SHIFT); 920 memcpy(dst + offset, src + offset, 921 nullb->dev->blocksize); 922 __set_bit(i, &t_page->bitmap); 923 } 924 } 925 926 kunmap_atomic(dst); 927 kunmap_atomic(src); 928 929 ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); 930 null_free_page(ret); 931 nullb->dev->curr_cache -= PAGE_SIZE; 932 933 return 0; 934} 935 936static int null_make_cache_space(struct nullb *nullb, unsigned long n) 937{ 938 int 
i, err, nr_pages; 939 struct nullb_page *c_pages[FREE_BATCH]; 940 unsigned long flushed = 0, one_round; 941 942again: 943 if ((nullb->dev->cache_size * 1024 * 1024) > 944 nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) 945 return 0; 946 947 nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, 948 (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); 949 /* 950 * nullb_flush_cache_page could unlock before using the c_pages. To 951 * avoid race, we don't allow page free 952 */ 953 for (i = 0; i < nr_pages; i++) { 954 nullb->cache_flush_pos = c_pages[i]->page->index; 955 /* 956 * We found the page which is being flushed to disk by other 957 * threads 958 */ 959 if (test_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap)) 960 c_pages[i] = NULL; 961 else 962 __set_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap); 963 } 964 965 one_round = 0; 966 for (i = 0; i < nr_pages; i++) { 967 if (c_pages[i] == NULL) 968 continue; 969 err = null_flush_cache_page(nullb, c_pages[i]); 970 if (err) 971 return err; 972 one_round++; 973 } 974 flushed += one_round << PAGE_SHIFT; 975 976 if (n > flushed) { 977 if (nr_pages == 0) 978 nullb->cache_flush_pos = 0; 979 if (one_round == 0) { 980 /* give other threads a chance */ 981 spin_unlock_irq(&nullb->lock); 982 spin_lock_irq(&nullb->lock); 983 } 984 goto again; 985 } 986 return 0; 987} 988 989static int copy_to_nullb(struct nullb *nullb, struct page *source, 990 unsigned int off, sector_t sector, size_t n, bool is_fua) 991{ 992 size_t temp, count = 0; 993 unsigned int offset; 994 struct nullb_page *t_page; 995 void *dst, *src; 996 997 while (count < n) { 998 temp = min_t(size_t, nullb->dev->blocksize, n - count); 999 1000 if (null_cache_active(nullb) && !is_fua) 1001 null_make_cache_space(nullb, PAGE_SIZE); 1002 1003 offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; 1004 t_page = null_insert_page(nullb, sector, 1005 !null_cache_active(nullb) || is_fua); 1006 if (!t_page) 1007 return -ENOSPC; 1008 1009 src = kmap_atomic(source); 1010 dst = kmap_atomic(t_page->page); 1011 memcpy(dst + offset, src + off + count, temp); 1012 kunmap_atomic(dst); 1013 kunmap_atomic(src); 1014 1015 __set_bit(sector & SECTOR_MASK, &t_page->bitmap); 1016 1017 if (is_fua) 1018 null_free_sector(nullb, sector, true); 1019 1020 count += temp; 1021 sector += temp >> SECTOR_SHIFT; 1022 } 1023 return 0; 1024} 1025 1026static int copy_from_nullb(struct nullb *nullb, struct page *dest, 1027 unsigned int off, sector_t sector, size_t n) 1028{ 1029 size_t temp, count = 0; 1030 unsigned int offset; 1031 struct nullb_page *t_page; 1032 void *dst, *src; 1033 1034 while (count < n) { 1035 temp = min_t(size_t, nullb->dev->blocksize, n - count); 1036 1037 offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; 1038 t_page = null_lookup_page(nullb, sector, false, 1039 !null_cache_active(nullb)); 1040 1041 dst = kmap_atomic(dest); 1042 if (!t_page) { 1043 memset(dst + off + count, 0, temp); 1044 goto next; 1045 } 1046 src = kmap_atomic(t_page->page); 1047 memcpy(dst + off + count, src + offset, temp); 1048 kunmap_atomic(src); 1049next: 1050 kunmap_atomic(dst); 1051 1052 count += temp; 1053 sector += temp >> SECTOR_SHIFT; 1054 } 1055 return 0; 1056} 1057 1058static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) 1059{ 1060 size_t temp; 1061 1062 spin_lock_irq(&nullb->lock); 1063 while (n > 0) { 1064 temp = min_t(size_t, n, nullb->dev->blocksize); 1065 null_free_sector(nullb, sector, false); 1066 if (null_cache_active(nullb)) 1067 null_free_sector(nullb, sector, true); 1068 sector += temp >> 
SECTOR_SHIFT; 1069 n -= temp; 1070 } 1071 spin_unlock_irq(&nullb->lock); 1072} 1073 1074static int null_handle_flush(struct nullb *nullb) 1075{ 1076 int err; 1077 1078 if (!null_cache_active(nullb)) 1079 return 0; 1080 1081 spin_lock_irq(&nullb->lock); 1082 while (true) { 1083 err = null_make_cache_space(nullb, 1084 nullb->dev->cache_size * 1024 * 1024); 1085 if (err || nullb->dev->curr_cache == 0) 1086 break; 1087 } 1088 1089 WARN_ON(!radix_tree_empty(&nullb->dev->cache)); 1090 spin_unlock_irq(&nullb->lock); 1091 return err; 1092} 1093 1094static int null_transfer(struct nullb *nullb, struct page *page, 1095 unsigned int len, unsigned int off, bool is_write, sector_t sector, 1096 bool is_fua) 1097{ 1098 int err = 0; 1099 1100 if (!is_write) { 1101 err = copy_from_nullb(nullb, page, off, sector, len); 1102 flush_dcache_page(page); 1103 } else { 1104 flush_dcache_page(page); 1105 err = copy_to_nullb(nullb, page, off, sector, len, is_fua); 1106 } 1107 1108 return err; 1109} 1110 1111static int null_handle_rq(struct nullb_cmd *cmd) 1112{ 1113 struct request *rq = cmd->rq; 1114 struct nullb *nullb = cmd->nq->dev->nullb; 1115 int err; 1116 unsigned int len; 1117 sector_t sector; 1118 struct req_iterator iter; 1119 struct bio_vec bvec; 1120 1121 sector = blk_rq_pos(rq); 1122 1123 if (req_op(rq) == REQ_OP_DISCARD) { 1124 null_handle_discard(nullb, sector, blk_rq_bytes(rq)); 1125 return 0; 1126 } 1127 1128 spin_lock_irq(&nullb->lock); 1129 rq_for_each_segment(bvec, rq, iter) { 1130 len = bvec.bv_len; 1131 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1132 op_is_write(req_op(rq)), sector, 1133 req_op(rq) & REQ_FUA); 1134 if (err) { 1135 spin_unlock_irq(&nullb->lock); 1136 return err; 1137 } 1138 sector += len >> SECTOR_SHIFT; 1139 } 1140 spin_unlock_irq(&nullb->lock); 1141 1142 return 0; 1143} 1144 1145static int null_handle_bio(struct nullb_cmd *cmd) 1146{ 1147 struct bio *bio = cmd->bio; 1148 struct nullb *nullb = cmd->nq->dev->nullb; 1149 int err; 1150 unsigned int len; 1151 sector_t sector; 1152 struct bio_vec bvec; 1153 struct bvec_iter iter; 1154 1155 sector = bio->bi_iter.bi_sector; 1156 1157 if (bio_op(bio) == REQ_OP_DISCARD) { 1158 null_handle_discard(nullb, sector, 1159 bio_sectors(bio) << SECTOR_SHIFT); 1160 return 0; 1161 } 1162 1163 spin_lock_irq(&nullb->lock); 1164 bio_for_each_segment(bvec, bio, iter) { 1165 len = bvec.bv_len; 1166 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1167 op_is_write(bio_op(bio)), sector, 1168 bio_op(bio) & REQ_FUA); 1169 if (err) { 1170 spin_unlock_irq(&nullb->lock); 1171 return err; 1172 } 1173 sector += len >> SECTOR_SHIFT; 1174 } 1175 spin_unlock_irq(&nullb->lock); 1176 return 0; 1177} 1178 1179static void null_stop_queue(struct nullb *nullb) 1180{ 1181 struct request_queue *q = nullb->q; 1182 1183 if (nullb->dev->queue_mode == NULL_Q_MQ) 1184 blk_mq_stop_hw_queues(q); 1185 else { 1186 spin_lock_irq(q->queue_lock); 1187 blk_stop_queue(q); 1188 spin_unlock_irq(q->queue_lock); 1189 } 1190} 1191 1192static void null_restart_queue_async(struct nullb *nullb) 1193{ 1194 struct request_queue *q = nullb->q; 1195 unsigned long flags; 1196 1197 if (nullb->dev->queue_mode == NULL_Q_MQ) 1198 blk_mq_start_stopped_hw_queues(q, true); 1199 else { 1200 spin_lock_irqsave(q->queue_lock, flags); 1201 blk_start_queue_async(q); 1202 spin_unlock_irqrestore(q->queue_lock, flags); 1203 } 1204} 1205 1206static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) 1207{ 1208 struct nullb_device *dev = cmd->nq->dev; 1209 struct nullb *nullb = 
dev->nullb; 1210 int err = 0; 1211 1212 if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { 1213 struct request *rq = cmd->rq; 1214 1215 if (!hrtimer_active(&nullb->bw_timer)) 1216 hrtimer_restart(&nullb->bw_timer); 1217 1218 if (atomic_long_sub_return(blk_rq_bytes(rq), 1219 &nullb->cur_bytes) < 0) { 1220 null_stop_queue(nullb); 1221 /* race with timer */ 1222 if (atomic_long_read(&nullb->cur_bytes) > 0) 1223 null_restart_queue_async(nullb); 1224 if (dev->queue_mode == NULL_Q_RQ) { 1225 struct request_queue *q = nullb->q; 1226 1227 spin_lock_irq(q->queue_lock); 1228 rq->rq_flags |= RQF_DONTPREP; 1229 blk_requeue_request(q, rq); 1230 spin_unlock_irq(q->queue_lock); 1231 return BLK_STS_OK; 1232 } else 1233 /* requeue request */ 1234 return BLK_STS_RESOURCE; 1235 } 1236 } 1237 1238 if (nullb->dev->badblocks.shift != -1) { 1239 int bad_sectors; 1240 sector_t sector, size, first_bad; 1241 bool is_flush = true; 1242 1243 if (dev->queue_mode == NULL_Q_BIO && 1244 bio_op(cmd->bio) != REQ_OP_FLUSH) { 1245 is_flush = false; 1246 sector = cmd->bio->bi_iter.bi_sector; 1247 size = bio_sectors(cmd->bio); 1248 } 1249 if (dev->queue_mode != NULL_Q_BIO && 1250 req_op(cmd->rq) != REQ_OP_FLUSH) { 1251 is_flush = false; 1252 sector = blk_rq_pos(cmd->rq); 1253 size = blk_rq_sectors(cmd->rq); 1254 } 1255 if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector, 1256 size, &first_bad, &bad_sectors)) { 1257 cmd->error = BLK_STS_IOERR; 1258 goto out; 1259 } 1260 } 1261 1262 if (dev->memory_backed) { 1263 if (dev->queue_mode == NULL_Q_BIO) { 1264 if (bio_op(cmd->bio) == REQ_OP_FLUSH) 1265 err = null_handle_flush(nullb); 1266 else 1267 err = null_handle_bio(cmd); 1268 } else { 1269 if (req_op(cmd->rq) == REQ_OP_FLUSH) 1270 err = null_handle_flush(nullb); 1271 else 1272 err = null_handle_rq(cmd); 1273 } 1274 } 1275 cmd->error = errno_to_blk_status(err); 1276out: 1277 /* Complete IO by inline, softirq or timer */ 1278 switch (dev->irqmode) { 1279 case NULL_IRQ_SOFTIRQ: 1280 switch (dev->queue_mode) { 1281 case NULL_Q_MQ: 1282 blk_mq_complete_request(cmd->rq); 1283 break; 1284 case NULL_Q_RQ: 1285 blk_complete_request(cmd->rq); 1286 break; 1287 case NULL_Q_BIO: 1288 /* 1289 * XXX: no proper submitting cpu information available. 
1290 */ 1291 end_cmd(cmd); 1292 break; 1293 } 1294 break; 1295 case NULL_IRQ_NONE: 1296 end_cmd(cmd); 1297 break; 1298 case NULL_IRQ_TIMER: 1299 null_cmd_end_timer(cmd); 1300 break; 1301 } 1302 return BLK_STS_OK; 1303} 1304 1305static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) 1306{ 1307 struct nullb *nullb = container_of(timer, struct nullb, bw_timer); 1308 ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); 1309 unsigned int mbps = nullb->dev->mbps; 1310 1311 if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) 1312 return HRTIMER_NORESTART; 1313 1314 atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); 1315 null_restart_queue_async(nullb); 1316 1317 hrtimer_forward_now(&nullb->bw_timer, timer_interval); 1318 1319 return HRTIMER_RESTART; 1320} 1321 1322static void nullb_setup_bwtimer(struct nullb *nullb) 1323{ 1324 ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); 1325 1326 hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1327 nullb->bw_timer.function = nullb_bwtimer_fn; 1328 atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); 1329 hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); 1330} 1331 1332static struct nullb_queue *nullb_to_queue(struct nullb *nullb) 1333{ 1334 int index = 0; 1335 1336 if (nullb->nr_queues != 1) 1337 index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); 1338 1339 return &nullb->queues[index]; 1340} 1341 1342static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) 1343{ 1344 struct nullb *nullb = q->queuedata; 1345 struct nullb_queue *nq = nullb_to_queue(nullb); 1346 struct nullb_cmd *cmd; 1347 1348 cmd = alloc_cmd(nq, 1); 1349 cmd->bio = bio; 1350 1351 null_handle_cmd(cmd); 1352 return BLK_QC_T_NONE; 1353} 1354 1355static int null_rq_prep_fn(struct request_queue *q, struct request *req) 1356{ 1357 struct nullb *nullb = q->queuedata; 1358 struct nullb_queue *nq = nullb_to_queue(nullb); 1359 struct nullb_cmd *cmd; 1360 1361 cmd = alloc_cmd(nq, 0); 1362 if (cmd) { 1363 cmd->rq = req; 1364 req->special = cmd; 1365 return BLKPREP_OK; 1366 } 1367 blk_stop_queue(q); 1368 1369 return BLKPREP_DEFER; 1370} 1371 1372static void null_request_fn(struct request_queue *q) 1373{ 1374 struct request *rq; 1375 1376 while ((rq = blk_fetch_request(q)) != NULL) { 1377 struct nullb_cmd *cmd = rq->special; 1378 1379 spin_unlock_irq(q->queue_lock); 1380 null_handle_cmd(cmd); 1381 spin_lock_irq(q->queue_lock); 1382 } 1383} 1384 1385static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, 1386 const struct blk_mq_queue_data *bd) 1387{ 1388 struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); 1389 struct nullb_queue *nq = hctx->driver_data; 1390 1391 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); 1392 1393 if (nq->dev->irqmode == NULL_IRQ_TIMER) { 1394 hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1395 cmd->timer.function = null_cmd_timer_expired; 1396 } 1397 cmd->rq = bd->rq; 1398 cmd->nq = nq; 1399 1400 blk_mq_start_request(bd->rq); 1401 1402 return null_handle_cmd(cmd); 1403} 1404 1405static const struct blk_mq_ops null_mq_ops = { 1406 .queue_rq = null_queue_rq, 1407 .complete = null_softirq_done_fn, 1408}; 1409 1410static void cleanup_queue(struct nullb_queue *nq) 1411{ 1412 kfree(nq->tag_map); 1413 kfree(nq->cmds); 1414} 1415 1416static void cleanup_queues(struct nullb *nullb) 1417{ 1418 int i; 1419 1420 for (i = 0; i < nullb->nr_queues; i++) 1421 cleanup_queue(&nullb->queues[i]); 1422 1423 kfree(nullb->queues); 1424} 1425 
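/*
 * Illustrative note (not part of the original source): nullb_to_queue()
 * above spreads submitting CPUs over the driver's queues using
 * index = cpu / ((nr_cpu_ids + nr_queues - 1) / nr_queues). For example,
 * with nr_cpu_ids = 8 and nr_queues = 3 the divisor is (8 + 3 - 1) / 3 = 3,
 * so CPUs 0-2 map to queue 0, CPUs 3-5 to queue 1 and CPUs 6-7 to queue 2.
 * This mapping is only used on the bio and legacy request paths; in blk-mq
 * mode null_queue_rq() takes the queue from hctx->driver_data, which
 * null_init_queues() fills in.
 */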
1426#ifdef CONFIG_NVM 1427 1428static void null_lnvm_end_io(struct request *rq, blk_status_t status) 1429{ 1430 struct nvm_rq *rqd = rq->end_io_data; 1431 1432 /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */ 1433 rqd->error = status ? -EIO : 0; 1434 nvm_end_io(rqd); 1435 1436 blk_put_request(rq); 1437} 1438 1439static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) 1440{ 1441 struct request_queue *q = dev->q; 1442 struct request *rq; 1443 struct bio *bio = rqd->bio; 1444 1445 rq = blk_mq_alloc_request(q, 1446 op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); 1447 if (IS_ERR(rq)) 1448 return -ENOMEM; 1449 1450 blk_init_request_from_bio(rq, bio); 1451 1452 rq->end_io_data = rqd; 1453 1454 blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io); 1455 1456 return 0; 1457} 1458 1459static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) 1460{ 1461 struct nullb *nullb = dev->q->queuedata; 1462 sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL; 1463 sector_t blksize; 1464 struct nvm_id_group *grp; 1465 1466 id->ver_id = 0x1; 1467 id->vmnt = 0; 1468 id->cap = 0x2; 1469 id->dom = 0x1; 1470 1471 id->ppaf.blk_offset = 0; 1472 id->ppaf.blk_len = 16; 1473 id->ppaf.pg_offset = 16; 1474 id->ppaf.pg_len = 16; 1475 id->ppaf.sect_offset = 32; 1476 id->ppaf.sect_len = 8; 1477 id->ppaf.pln_offset = 40; 1478 id->ppaf.pln_len = 8; 1479 id->ppaf.lun_offset = 48; 1480 id->ppaf.lun_len = 8; 1481 id->ppaf.ch_offset = 56; 1482 id->ppaf.ch_len = 8; 1483 1484 sector_div(size, nullb->dev->blocksize); /* convert size to pages */ 1485 size >>= 8; /* concert size to pgs pr blk */ 1486 grp = &id->grp; 1487 grp->mtype = 0; 1488 grp->fmtype = 0; 1489 grp->num_ch = 1; 1490 grp->num_pg = 256; 1491 blksize = size; 1492 size >>= 16; 1493 grp->num_lun = size + 1; 1494 sector_div(blksize, grp->num_lun); 1495 grp->num_blk = blksize; 1496 grp->num_pln = 1; 1497 1498 grp->fpg_sz = nullb->dev->blocksize; 1499 grp->csecs = nullb->dev->blocksize; 1500 grp->trdt = 25000; 1501 grp->trdm = 25000; 1502 grp->tprt = 500000; 1503 grp->tprm = 500000; 1504 grp->tbet = 1500000; 1505 grp->tbem = 1500000; 1506 grp->mpos = 0x010101; /* single plane rwe */ 1507 grp->cpar = nullb->dev->hw_queue_depth; 1508 1509 return 0; 1510} 1511 1512static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name) 1513{ 1514 mempool_t *virtmem_pool; 1515 1516 virtmem_pool = mempool_create_slab_pool(64, ppa_cache); 1517 if (!virtmem_pool) { 1518 pr_err("null_blk: Unable to create virtual memory pool\n"); 1519 return NULL; 1520 } 1521 1522 return virtmem_pool; 1523} 1524 1525static void null_lnvm_destroy_dma_pool(void *pool) 1526{ 1527 mempool_destroy(pool); 1528} 1529 1530static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, 1531 gfp_t mem_flags, dma_addr_t *dma_handler) 1532{ 1533 return mempool_alloc(pool, mem_flags); 1534} 1535 1536static void null_lnvm_dev_dma_free(void *pool, void *entry, 1537 dma_addr_t dma_handler) 1538{ 1539 mempool_free(entry, pool); 1540} 1541 1542static struct nvm_dev_ops null_lnvm_dev_ops = { 1543 .identity = null_lnvm_id, 1544 .submit_io = null_lnvm_submit_io, 1545 1546 .create_dma_pool = null_lnvm_create_dma_pool, 1547 .destroy_dma_pool = null_lnvm_destroy_dma_pool, 1548 .dev_dma_alloc = null_lnvm_dev_dma_alloc, 1549 .dev_dma_free = null_lnvm_dev_dma_free, 1550 1551 /* Simulate nvme protocol restriction */ 1552 .max_phys_sect = 64, 1553}; 1554 1555static int null_nvm_register(struct nullb *nullb) 1556{ 1557 struct nvm_dev *dev; 1558 int rv; 1559 1560 
dev = nvm_alloc_dev(0); 1561 if (!dev) 1562 return -ENOMEM; 1563 1564 dev->q = nullb->q; 1565 memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN); 1566 dev->ops = &null_lnvm_dev_ops; 1567 1568 rv = nvm_register(dev); 1569 if (rv) { 1570 kfree(dev); 1571 return rv; 1572 } 1573 nullb->ndev = dev; 1574 return 0; 1575} 1576 1577static void null_nvm_unregister(struct nullb *nullb) 1578{ 1579 nvm_unregister(nullb->ndev); 1580} 1581#else 1582static int null_nvm_register(struct nullb *nullb) 1583{ 1584 pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n"); 1585 return -EINVAL; 1586} 1587static void null_nvm_unregister(struct nullb *nullb) {} 1588#endif /* CONFIG_NVM */ 1589 1590static void null_del_dev(struct nullb *nullb) 1591{ 1592 struct nullb_device *dev = nullb->dev; 1593 1594 ida_simple_remove(&nullb_indexes, nullb->index); 1595 1596 list_del_init(&nullb->list); 1597 1598 if (dev->use_lightnvm) 1599 null_nvm_unregister(nullb); 1600 else 1601 del_gendisk(nullb->disk); 1602 1603 if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { 1604 hrtimer_cancel(&nullb->bw_timer); 1605 atomic_long_set(&nullb->cur_bytes, LONG_MAX); 1606 null_restart_queue_async(nullb); 1607 } 1608 1609 blk_cleanup_queue(nullb->q); 1610 if (dev->queue_mode == NULL_Q_MQ && 1611 nullb->tag_set == &nullb->__tag_set) 1612 blk_mq_free_tag_set(nullb->tag_set); 1613 if (!dev->use_lightnvm) 1614 put_disk(nullb->disk); 1615 cleanup_queues(nullb); 1616 if (null_cache_active(nullb)) 1617 null_free_device_storage(nullb->dev, true); 1618 kfree(nullb); 1619 dev->nullb = NULL; 1620} 1621 1622static void null_config_discard(struct nullb *nullb) 1623{ 1624 if (nullb->dev->discard == false) 1625 return; 1626 nullb->q->limits.discard_granularity = nullb->dev->blocksize; 1627 nullb->q->limits.discard_alignment = nullb->dev->blocksize; 1628 blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); 1629 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nullb->q); 1630} 1631 1632static int null_open(struct block_device *bdev, fmode_t mode) 1633{ 1634 return 0; 1635} 1636 1637static void null_release(struct gendisk *disk, fmode_t mode) 1638{ 1639} 1640 1641static const struct block_device_operations null_fops = { 1642 .owner = THIS_MODULE, 1643 .open = null_open, 1644 .release = null_release, 1645}; 1646 1647static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) 1648{ 1649 BUG_ON(!nullb); 1650 BUG_ON(!nq); 1651 1652 init_waitqueue_head(&nq->wait); 1653 nq->queue_depth = nullb->queue_depth; 1654 nq->dev = nullb->dev; 1655} 1656 1657static void null_init_queues(struct nullb *nullb) 1658{ 1659 struct request_queue *q = nullb->q; 1660 struct blk_mq_hw_ctx *hctx; 1661 struct nullb_queue *nq; 1662 int i; 1663 1664 queue_for_each_hw_ctx(q, hctx, i) { 1665 if (!hctx->nr_ctx || !hctx->tags) 1666 continue; 1667 nq = &nullb->queues[i]; 1668 hctx->driver_data = nq; 1669 null_init_queue(nullb, nq); 1670 nullb->nr_queues++; 1671 } 1672} 1673 1674static int setup_commands(struct nullb_queue *nq) 1675{ 1676 struct nullb_cmd *cmd; 1677 int i, tag_size; 1678 1679 nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL); 1680 if (!nq->cmds) 1681 return -ENOMEM; 1682 1683 tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; 1684 nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL); 1685 if (!nq->tag_map) { 1686 kfree(nq->cmds); 1687 return -ENOMEM; 1688 } 1689 1690 for (i = 0; i < nq->queue_depth; i++) { 1691 cmd = &nq->cmds[i]; 1692 INIT_LIST_HEAD(&cmd->list); 1693 cmd->ll_list.next = NULL; 1694 cmd->tag 
= -1U; 1695 } 1696 1697 return 0; 1698} 1699 1700static int setup_queues(struct nullb *nullb) 1701{ 1702 nullb->queues = kzalloc(nullb->dev->submit_queues * 1703 sizeof(struct nullb_queue), GFP_KERNEL); 1704 if (!nullb->queues) 1705 return -ENOMEM; 1706 1707 nullb->nr_queues = 0; 1708 nullb->queue_depth = nullb->dev->hw_queue_depth; 1709 1710 return 0; 1711} 1712 1713static int init_driver_queues(struct nullb *nullb) 1714{ 1715 struct nullb_queue *nq; 1716 int i, ret = 0; 1717 1718 for (i = 0; i < nullb->dev->submit_queues; i++) { 1719 nq = &nullb->queues[i]; 1720 1721 null_init_queue(nullb, nq); 1722 1723 ret = setup_commands(nq); 1724 if (ret) 1725 return ret; 1726 nullb->nr_queues++; 1727 } 1728 return 0; 1729} 1730 1731static int null_gendisk_register(struct nullb *nullb) 1732{ 1733 struct gendisk *disk; 1734 sector_t size; 1735 1736 disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); 1737 if (!disk) 1738 return -ENOMEM; 1739 size = (sector_t)nullb->dev->size * 1024 * 1024ULL; 1740 set_capacity(disk, size >> 9); 1741 1742 disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; 1743 disk->major = null_major; 1744 disk->first_minor = nullb->index; 1745 disk->fops = &null_fops; 1746 disk->private_data = nullb; 1747 disk->queue = nullb->q; 1748 strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); 1749 1750 add_disk(disk); 1751 return 0; 1752} 1753 1754static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) 1755{ 1756 set->ops = &null_mq_ops; 1757 set->nr_hw_queues = nullb ? nullb->dev->submit_queues : 1758 g_submit_queues; 1759 set->queue_depth = nullb ? nullb->dev->hw_queue_depth : 1760 g_hw_queue_depth; 1761 set->numa_node = nullb ? nullb->dev->home_node : g_home_node; 1762 set->cmd_size = sizeof(struct nullb_cmd); 1763 set->flags = BLK_MQ_F_SHOULD_MERGE; 1764 if (g_no_sched) 1765 set->flags |= BLK_MQ_F_NO_SCHED; 1766 set->driver_data = NULL; 1767 1768 if ((nullb && nullb->dev->blocking) || g_blocking) 1769 set->flags |= BLK_MQ_F_BLOCKING; 1770 1771 return blk_mq_alloc_tag_set(set); 1772} 1773 1774static void null_validate_conf(struct nullb_device *dev) 1775{ 1776 dev->blocksize = round_down(dev->blocksize, 512); 1777 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); 1778 if (dev->use_lightnvm && dev->blocksize != 4096) 1779 dev->blocksize = 4096; 1780 1781 if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ) 1782 dev->queue_mode = NULL_Q_MQ; 1783 1784 if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { 1785 if (dev->submit_queues != nr_online_nodes) 1786 dev->submit_queues = nr_online_nodes; 1787 } else if (dev->submit_queues > nr_cpu_ids) 1788 dev->submit_queues = nr_cpu_ids; 1789 else if (dev->submit_queues == 0) 1790 dev->submit_queues = 1; 1791 1792 dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); 1793 dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); 1794 1795 /* Do memory allocation, so set blocking */ 1796 if (dev->memory_backed) 1797 dev->blocking = true; 1798 else /* cache is meaningless */ 1799 dev->cache_size = 0; 1800 dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, 1801 dev->cache_size); 1802 dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); 1803 /* can not stop a queue */ 1804 if (dev->queue_mode == NULL_Q_BIO) 1805 dev->mbps = 0; 1806} 1807 1808static int null_add_dev(struct nullb_device *dev) 1809{ 1810 struct nullb *nullb; 1811 int rv; 1812 1813 null_validate_conf(dev); 1814 1815 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, 
dev->home_node); 1816 if (!nullb) { 1817 rv = -ENOMEM; 1818 goto out; 1819 } 1820 nullb->dev = dev; 1821 dev->nullb = nullb; 1822 1823 spin_lock_init(&nullb->lock); 1824 1825 rv = setup_queues(nullb); 1826 if (rv) 1827 goto out_free_nullb; 1828 1829 if (dev->queue_mode == NULL_Q_MQ) { 1830 if (shared_tags) { 1831 nullb->tag_set = &tag_set; 1832 rv = 0; 1833 } else { 1834 nullb->tag_set = &nullb->__tag_set; 1835 rv = null_init_tag_set(nullb, nullb->tag_set); 1836 } 1837 1838 if (rv) 1839 goto out_cleanup_queues; 1840 1841 nullb->q = blk_mq_init_queue(nullb->tag_set); 1842 if (IS_ERR(nullb->q)) { 1843 rv = -ENOMEM; 1844 goto out_cleanup_tags; 1845 } 1846 null_init_queues(nullb); 1847 } else if (dev->queue_mode == NULL_Q_BIO) { 1848 nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node); 1849 if (!nullb->q) { 1850 rv = -ENOMEM; 1851 goto out_cleanup_queues; 1852 } 1853 blk_queue_make_request(nullb->q, null_queue_bio); 1854 rv = init_driver_queues(nullb); 1855 if (rv) 1856 goto out_cleanup_blk_queue; 1857 } else { 1858 nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, 1859 dev->home_node); 1860 if (!nullb->q) { 1861 rv = -ENOMEM; 1862 goto out_cleanup_queues; 1863 } 1864 blk_queue_prep_rq(nullb->q, null_rq_prep_fn); 1865 blk_queue_softirq_done(nullb->q, null_softirq_done_fn); 1866 rv = init_driver_queues(nullb); 1867 if (rv) 1868 goto out_cleanup_blk_queue; 1869 } 1870 1871 if (dev->mbps) { 1872 set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); 1873 nullb_setup_bwtimer(nullb); 1874 } 1875 1876 if (dev->cache_size > 0) { 1877 set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); 1878 blk_queue_write_cache(nullb->q, true, true); 1879 blk_queue_flush_queueable(nullb->q, true); 1880 } 1881 1882 nullb->q->queuedata = nullb; 1883 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); 1884 queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); 1885 1886 mutex_lock(&lock); 1887 nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); 1888 dev->index = nullb->index; 1889 mutex_unlock(&lock); 1890 1891 blk_queue_logical_block_size(nullb->q, dev->blocksize); 1892 blk_queue_physical_block_size(nullb->q, dev->blocksize); 1893 1894 null_config_discard(nullb); 1895 1896 sprintf(nullb->disk_name, "nullb%d", nullb->index); 1897 1898 if (dev->use_lightnvm) 1899 rv = null_nvm_register(nullb); 1900 else 1901 rv = null_gendisk_register(nullb); 1902 1903 if (rv) 1904 goto out_cleanup_blk_queue; 1905 1906 mutex_lock(&lock); 1907 list_add_tail(&nullb->list, &nullb_list); 1908 mutex_unlock(&lock); 1909 1910 return 0; 1911out_cleanup_blk_queue: 1912 blk_cleanup_queue(nullb->q); 1913out_cleanup_tags: 1914 if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) 1915 blk_mq_free_tag_set(nullb->tag_set); 1916out_cleanup_queues: 1917 cleanup_queues(nullb); 1918out_free_nullb: 1919 kfree(nullb); 1920out: 1921 return rv; 1922} 1923 1924static int __init null_init(void) 1925{ 1926 int ret = 0; 1927 unsigned int i; 1928 struct nullb *nullb; 1929 struct nullb_device *dev; 1930 1931 /* check for nullb_page.bitmap */ 1932 if (sizeof(unsigned long) * 8 - 2 < (PAGE_SIZE >> SECTOR_SHIFT)) 1933 return -EINVAL; 1934 1935 if (g_bs > PAGE_SIZE) { 1936 pr_warn("null_blk: invalid block size\n"); 1937 pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); 1938 g_bs = PAGE_SIZE; 1939 } 1940 1941 if (g_use_lightnvm && g_bs != 4096) { 1942 pr_warn("null_blk: LightNVM only supports 4k block size\n"); 1943 pr_warn("null_blk: defaults block size to 4k\n"); 1944 g_bs = 4096; 1945 } 1946 1947 if 
(g_use_lightnvm && g_queue_mode != NULL_Q_MQ) { 1948 pr_warn("null_blk: LightNVM only supported for blk-mq\n"); 1949 pr_warn("null_blk: defaults queue mode to blk-mq\n"); 1950 g_queue_mode = NULL_Q_MQ; 1951 } 1952 1953 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { 1954 if (g_submit_queues != nr_online_nodes) { 1955 pr_warn("null_blk: submit_queues param is set to %u.\n", 1956 nr_online_nodes); 1957 g_submit_queues = nr_online_nodes; 1958 } 1959 } else if (g_submit_queues > nr_cpu_ids) 1960 g_submit_queues = nr_cpu_ids; 1961 else if (g_submit_queues <= 0) 1962 g_submit_queues = 1; 1963 1964 if (g_queue_mode == NULL_Q_MQ && shared_tags) { 1965 ret = null_init_tag_set(NULL, &tag_set); 1966 if (ret) 1967 return ret; 1968 } 1969 1970 config_group_init(&nullb_subsys.su_group); 1971 mutex_init(&nullb_subsys.su_mutex); 1972 1973 ret = configfs_register_subsystem(&nullb_subsys); 1974 if (ret) 1975 goto err_tagset; 1976 1977 mutex_init(&lock); 1978 1979 null_major = register_blkdev(0, "nullb"); 1980 if (null_major < 0) { 1981 ret = null_major; 1982 goto err_conf; 1983 } 1984 1985 if (g_use_lightnvm) { 1986 ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64), 1987 0, 0, NULL); 1988 if (!ppa_cache) { 1989 pr_err("null_blk: unable to create ppa cache\n"); 1990 ret = -ENOMEM; 1991 goto err_ppa; 1992 } 1993 } 1994 1995 for (i = 0; i < nr_devices; i++) { 1996 dev = null_alloc_dev(); 1997 if (!dev) { 1998 ret = -ENOMEM; 1999 goto err_dev; 2000 } 2001 ret = null_add_dev(dev); 2002 if (ret) { 2003 null_free_dev(dev); 2004 goto err_dev; 2005 } 2006 } 2007 2008 pr_info("null: module loaded\n"); 2009 return 0; 2010 2011err_dev: 2012 while (!list_empty(&nullb_list)) { 2013 nullb = list_entry(nullb_list.next, struct nullb, list); 2014 dev = nullb->dev; 2015 null_del_dev(nullb); 2016 null_free_dev(dev); 2017 } 2018 kmem_cache_destroy(ppa_cache); 2019err_ppa: 2020 unregister_blkdev(null_major, "nullb"); 2021err_conf: 2022 configfs_unregister_subsystem(&nullb_subsys); 2023err_tagset: 2024 if (g_queue_mode == NULL_Q_MQ && shared_tags) 2025 blk_mq_free_tag_set(&tag_set); 2026 return ret; 2027} 2028 2029static void __exit null_exit(void) 2030{ 2031 struct nullb *nullb; 2032 2033 configfs_unregister_subsystem(&nullb_subsys); 2034 2035 unregister_blkdev(null_major, "nullb"); 2036 2037 mutex_lock(&lock); 2038 while (!list_empty(&nullb_list)) { 2039 struct nullb_device *dev; 2040 2041 nullb = list_entry(nullb_list.next, struct nullb, list); 2042 dev = nullb->dev; 2043 null_del_dev(nullb); 2044 null_free_dev(dev); 2045 } 2046 mutex_unlock(&lock); 2047 2048 if (g_queue_mode == NULL_Q_MQ && shared_tags) 2049 blk_mq_free_tag_set(&tag_set); 2050 2051 kmem_cache_destroy(ppa_cache); 2052} 2053 2054module_init(null_init); 2055module_exit(null_exit); 2056 2057MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); 2058MODULE_LICENSE("GPL");
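
For completeness, the badblocks attribute handled by nullb_device_badblocks_store() above accepts sector ranges of the form "+START-END" to mark sectors bad and "-START-END" to clear them again. A minimal userspace sketch, with the same assumed configfs mount point and device name as before:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* assumed path; depends on where configfs is mounted and the chosen name */
	int fd = open("/sys/kernel/config/nullb/test0/badblocks", O_WRONLY);

	if (fd < 0)
		return 1;
	/* mark sectors 0..7 bad; writing "-0-7" instead would clear the range */
	write(fd, "+0-7", 4);
	return close(fd);
}

Requests that touch a marked range then complete with BLK_STS_IOERR, as the badblocks_check() branch in null_handle_cmd() shows.
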