Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v4.20-rc6 1846 lines 44 kB view raw
1/* 2 * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and 3 * Shaohua Li <shli@fb.com> 4 */ 5#include <linux/module.h> 6 7#include <linux/moduleparam.h> 8#include <linux/sched.h> 9#include <linux/fs.h> 10#include <linux/init.h> 11#include "null_blk.h" 12 13#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) 14#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) 15#define SECTOR_MASK (PAGE_SECTORS - 1) 16 17#define FREE_BATCH 16 18 19#define TICKS_PER_SEC 50ULL 20#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) 21 22#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 23static DECLARE_FAULT_ATTR(null_timeout_attr); 24static DECLARE_FAULT_ATTR(null_requeue_attr); 25#endif 26 27static inline u64 mb_per_tick(int mbps) 28{ 29 return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); 30} 31 32/* 33 * Status flags for nullb_device. 34 * 35 * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. 36 * UP: Device is currently on and visible in userspace. 37 * THROTTLED: Device is being throttled. 38 * CACHE: Device is using a write-back cache. 39 */ 40enum nullb_device_flags { 41 NULLB_DEV_FL_CONFIGURED = 0, 42 NULLB_DEV_FL_UP = 1, 43 NULLB_DEV_FL_THROTTLED = 2, 44 NULLB_DEV_FL_CACHE = 3, 45}; 46 47#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) 48/* 49 * nullb_page is a page in memory for nullb devices. 50 * 51 * @page: The page holding the data. 52 * @bitmap: The bitmap represents which sector in the page has data. 53 * Each bit represents one block size. For example, sector 8 54 * will use the 7th bit 55 * The highest 2 bits of bitmap are for special purpose. LOCK means the cache 56 * page is being flushing to storage. FREE means the cache page is freed and 57 * should be skipped from flushing to storage. Please see 58 * null_make_cache_space 59 */ 60struct nullb_page { 61 struct page *page; 62 DECLARE_BITMAP(bitmap, MAP_SZ); 63}; 64#define NULLB_PAGE_LOCK (MAP_SZ - 1) 65#define NULLB_PAGE_FREE (MAP_SZ - 2) 66 67static LIST_HEAD(nullb_list); 68static struct mutex lock; 69static int null_major; 70static DEFINE_IDA(nullb_indexes); 71static struct blk_mq_tag_set tag_set; 72 73enum { 74 NULL_IRQ_NONE = 0, 75 NULL_IRQ_SOFTIRQ = 1, 76 NULL_IRQ_TIMER = 2, 77}; 78 79enum { 80 NULL_Q_BIO = 0, 81 NULL_Q_RQ = 1, 82 NULL_Q_MQ = 2, 83}; 84 85static int g_no_sched; 86module_param_named(no_sched, g_no_sched, int, 0444); 87MODULE_PARM_DESC(no_sched, "No io scheduler"); 88 89static int g_submit_queues = 1; 90module_param_named(submit_queues, g_submit_queues, int, 0444); 91MODULE_PARM_DESC(submit_queues, "Number of submission queues"); 92 93static int g_home_node = NUMA_NO_NODE; 94module_param_named(home_node, g_home_node, int, 0444); 95MODULE_PARM_DESC(home_node, "Home node for the device"); 96 97#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 98static char g_timeout_str[80]; 99module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); 100 101static char g_requeue_str[80]; 102module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); 103#endif 104 105static int g_queue_mode = NULL_Q_MQ; 106 107static int null_param_store_val(const char *str, int *val, int min, int max) 108{ 109 int ret, new_val; 110 111 ret = kstrtoint(str, 10, &new_val); 112 if (ret) 113 return -EINVAL; 114 115 if (new_val < min || new_val > max) 116 return -EINVAL; 117 118 *val = new_val; 119 return 0; 120} 121 122static int null_set_queue_mode(const char *str, const struct kernel_param *kp) 123{ 124 return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); 125} 126 127static const struct kernel_param_ops null_queue_mode_param_ops = { 128 .set = null_set_queue_mode, 129 .get = param_get_int, 130}; 131 132device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); 133MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); 134 135static int g_gb = 250; 136module_param_named(gb, g_gb, int, 0444); 137MODULE_PARM_DESC(gb, "Size in GB"); 138 139static int g_bs = 512; 140module_param_named(bs, g_bs, int, 0444); 141MODULE_PARM_DESC(bs, "Block size (in bytes)"); 142 143static int nr_devices = 1; 144module_param(nr_devices, int, 0444); 145MODULE_PARM_DESC(nr_devices, "Number of devices to register"); 146 147static bool g_blocking; 148module_param_named(blocking, g_blocking, bool, 0444); 149MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); 150 151static bool shared_tags; 152module_param(shared_tags, bool, 0444); 153MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); 154 155static int g_irqmode = NULL_IRQ_SOFTIRQ; 156 157static int null_set_irqmode(const char *str, const struct kernel_param *kp) 158{ 159 return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, 160 NULL_IRQ_TIMER); 161} 162 163static const struct kernel_param_ops null_irqmode_param_ops = { 164 .set = null_set_irqmode, 165 .get = param_get_int, 166}; 167 168device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); 169MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); 170 171static unsigned long g_completion_nsec = 10000; 172module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); 173MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); 174 175static int g_hw_queue_depth = 64; 176module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); 177MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); 178 179static bool g_use_per_node_hctx; 180module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); 181MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); 182 183static bool g_zoned; 184module_param_named(zoned, g_zoned, bool, S_IRUGO); 185MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); 186 187static unsigned long g_zone_size = 256; 188module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); 189MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); 190 191static struct nullb_device *null_alloc_dev(void); 192static void null_free_dev(struct nullb_device *dev); 193static void null_del_dev(struct nullb *nullb); 194static int null_add_dev(struct nullb_device *dev); 195static void null_free_device_storage(struct nullb_device *dev, bool is_cache); 196 197static inline struct nullb_device *to_nullb_device(struct config_item *item) 198{ 199 return item ? container_of(item, struct nullb_device, item) : NULL; 200} 201 202static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) 203{ 204 return snprintf(page, PAGE_SIZE, "%u\n", val); 205} 206 207static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, 208 char *page) 209{ 210 return snprintf(page, PAGE_SIZE, "%lu\n", val); 211} 212 213static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) 214{ 215 return snprintf(page, PAGE_SIZE, "%u\n", val); 216} 217 218static ssize_t nullb_device_uint_attr_store(unsigned int *val, 219 const char *page, size_t count) 220{ 221 unsigned int tmp; 222 int result; 223 224 result = kstrtouint(page, 0, &tmp); 225 if (result) 226 return result; 227 228 *val = tmp; 229 return count; 230} 231 232static ssize_t nullb_device_ulong_attr_store(unsigned long *val, 233 const char *page, size_t count) 234{ 235 int result; 236 unsigned long tmp; 237 238 result = kstrtoul(page, 0, &tmp); 239 if (result) 240 return result; 241 242 *val = tmp; 243 return count; 244} 245 246static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, 247 size_t count) 248{ 249 bool tmp; 250 int result; 251 252 result = kstrtobool(page, &tmp); 253 if (result) 254 return result; 255 256 *val = tmp; 257 return count; 258} 259 260/* The following macro should only be used with TYPE = {uint, ulong, bool}. */ 261#define NULLB_DEVICE_ATTR(NAME, TYPE) \ 262static ssize_t \ 263nullb_device_##NAME##_show(struct config_item *item, char *page) \ 264{ \ 265 return nullb_device_##TYPE##_attr_show( \ 266 to_nullb_device(item)->NAME, page); \ 267} \ 268static ssize_t \ 269nullb_device_##NAME##_store(struct config_item *item, const char *page, \ 270 size_t count) \ 271{ \ 272 if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags)) \ 273 return -EBUSY; \ 274 return nullb_device_##TYPE##_attr_store( \ 275 &to_nullb_device(item)->NAME, page, count); \ 276} \ 277CONFIGFS_ATTR(nullb_device_, NAME); 278 279NULLB_DEVICE_ATTR(size, ulong); 280NULLB_DEVICE_ATTR(completion_nsec, ulong); 281NULLB_DEVICE_ATTR(submit_queues, uint); 282NULLB_DEVICE_ATTR(home_node, uint); 283NULLB_DEVICE_ATTR(queue_mode, uint); 284NULLB_DEVICE_ATTR(blocksize, uint); 285NULLB_DEVICE_ATTR(irqmode, uint); 286NULLB_DEVICE_ATTR(hw_queue_depth, uint); 287NULLB_DEVICE_ATTR(index, uint); 288NULLB_DEVICE_ATTR(blocking, bool); 289NULLB_DEVICE_ATTR(use_per_node_hctx, bool); 290NULLB_DEVICE_ATTR(memory_backed, bool); 291NULLB_DEVICE_ATTR(discard, bool); 292NULLB_DEVICE_ATTR(mbps, uint); 293NULLB_DEVICE_ATTR(cache_size, ulong); 294NULLB_DEVICE_ATTR(zoned, bool); 295NULLB_DEVICE_ATTR(zone_size, ulong); 296 297static ssize_t nullb_device_power_show(struct config_item *item, char *page) 298{ 299 return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); 300} 301 302static ssize_t nullb_device_power_store(struct config_item *item, 303 const char *page, size_t count) 304{ 305 struct nullb_device *dev = to_nullb_device(item); 306 bool newp = false; 307 ssize_t ret; 308 309 ret = nullb_device_bool_attr_store(&newp, page, count); 310 if (ret < 0) 311 return ret; 312 313 if (!dev->power && newp) { 314 if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) 315 return count; 316 if (null_add_dev(dev)) { 317 clear_bit(NULLB_DEV_FL_UP, &dev->flags); 318 return -ENOMEM; 319 } 320 321 set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); 322 dev->power = newp; 323 } else if (dev->power && !newp) { 324 mutex_lock(&lock); 325 dev->power = newp; 326 null_del_dev(dev->nullb); 327 mutex_unlock(&lock); 328 clear_bit(NULLB_DEV_FL_UP, &dev->flags); 329 clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); 330 } 331 332 return count; 333} 334 335CONFIGFS_ATTR(nullb_device_, power); 336 337static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) 338{ 339 struct nullb_device *t_dev = to_nullb_device(item); 340 341 return badblocks_show(&t_dev->badblocks, page, 0); 342} 343 344static ssize_t nullb_device_badblocks_store(struct config_item *item, 345 const char *page, size_t count) 346{ 347 struct nullb_device *t_dev = to_nullb_device(item); 348 char *orig, *buf, *tmp; 349 u64 start, end; 350 int ret; 351 352 orig = kstrndup(page, count, GFP_KERNEL); 353 if (!orig) 354 return -ENOMEM; 355 356 buf = strstrip(orig); 357 358 ret = -EINVAL; 359 if (buf[0] != '+' && buf[0] != '-') 360 goto out; 361 tmp = strchr(&buf[1], '-'); 362 if (!tmp) 363 goto out; 364 *tmp = '\0'; 365 ret = kstrtoull(buf + 1, 0, &start); 366 if (ret) 367 goto out; 368 ret = kstrtoull(tmp + 1, 0, &end); 369 if (ret) 370 goto out; 371 ret = -EINVAL; 372 if (start > end) 373 goto out; 374 /* enable badblocks */ 375 cmpxchg(&t_dev->badblocks.shift, -1, 0); 376 if (buf[0] == '+') 377 ret = badblocks_set(&t_dev->badblocks, start, 378 end - start + 1, 1); 379 else 380 ret = badblocks_clear(&t_dev->badblocks, start, 381 end - start + 1); 382 if (ret == 0) 383 ret = count; 384out: 385 kfree(orig); 386 return ret; 387} 388CONFIGFS_ATTR(nullb_device_, badblocks); 389 390static struct configfs_attribute *nullb_device_attrs[] = { 391 &nullb_device_attr_size, 392 &nullb_device_attr_completion_nsec, 393 &nullb_device_attr_submit_queues, 394 &nullb_device_attr_home_node, 395 &nullb_device_attr_queue_mode, 396 &nullb_device_attr_blocksize, 397 &nullb_device_attr_irqmode, 398 &nullb_device_attr_hw_queue_depth, 399 &nullb_device_attr_index, 400 &nullb_device_attr_blocking, 401 &nullb_device_attr_use_per_node_hctx, 402 &nullb_device_attr_power, 403 &nullb_device_attr_memory_backed, 404 &nullb_device_attr_discard, 405 &nullb_device_attr_mbps, 406 &nullb_device_attr_cache_size, 407 &nullb_device_attr_badblocks, 408 &nullb_device_attr_zoned, 409 &nullb_device_attr_zone_size, 410 NULL, 411}; 412 413static void nullb_device_release(struct config_item *item) 414{ 415 struct nullb_device *dev = to_nullb_device(item); 416 417 null_free_device_storage(dev, false); 418 null_free_dev(dev); 419} 420 421static struct configfs_item_operations nullb_device_ops = { 422 .release = nullb_device_release, 423}; 424 425static const struct config_item_type nullb_device_type = { 426 .ct_item_ops = &nullb_device_ops, 427 .ct_attrs = nullb_device_attrs, 428 .ct_owner = THIS_MODULE, 429}; 430 431static struct 432config_item *nullb_group_make_item(struct config_group *group, const char *name) 433{ 434 struct nullb_device *dev; 435 436 dev = null_alloc_dev(); 437 if (!dev) 438 return ERR_PTR(-ENOMEM); 439 440 config_item_init_type_name(&dev->item, name, &nullb_device_type); 441 442 return &dev->item; 443} 444 445static void 446nullb_group_drop_item(struct config_group *group, struct config_item *item) 447{ 448 struct nullb_device *dev = to_nullb_device(item); 449 450 if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { 451 mutex_lock(&lock); 452 dev->power = false; 453 null_del_dev(dev->nullb); 454 mutex_unlock(&lock); 455 } 456 457 config_item_put(item); 458} 459 460static ssize_t memb_group_features_show(struct config_item *item, char *page) 461{ 462 return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n"); 463} 464 465CONFIGFS_ATTR_RO(memb_group_, features); 466 467static struct configfs_attribute *nullb_group_attrs[] = { 468 &memb_group_attr_features, 469 NULL, 470}; 471 472static struct configfs_group_operations nullb_group_ops = { 473 .make_item = nullb_group_make_item, 474 .drop_item = nullb_group_drop_item, 475}; 476 477static const struct config_item_type nullb_group_type = { 478 .ct_group_ops = &nullb_group_ops, 479 .ct_attrs = nullb_group_attrs, 480 .ct_owner = THIS_MODULE, 481}; 482 483static struct configfs_subsystem nullb_subsys = { 484 .su_group = { 485 .cg_item = { 486 .ci_namebuf = "nullb", 487 .ci_type = &nullb_group_type, 488 }, 489 }, 490}; 491 492static inline int null_cache_active(struct nullb *nullb) 493{ 494 return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); 495} 496 497static struct nullb_device *null_alloc_dev(void) 498{ 499 struct nullb_device *dev; 500 501 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 502 if (!dev) 503 return NULL; 504 INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); 505 INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); 506 if (badblocks_init(&dev->badblocks, 0)) { 507 kfree(dev); 508 return NULL; 509 } 510 511 dev->size = g_gb * 1024; 512 dev->completion_nsec = g_completion_nsec; 513 dev->submit_queues = g_submit_queues; 514 dev->home_node = g_home_node; 515 dev->queue_mode = g_queue_mode; 516 dev->blocksize = g_bs; 517 dev->irqmode = g_irqmode; 518 dev->hw_queue_depth = g_hw_queue_depth; 519 dev->blocking = g_blocking; 520 dev->use_per_node_hctx = g_use_per_node_hctx; 521 dev->zoned = g_zoned; 522 dev->zone_size = g_zone_size; 523 return dev; 524} 525 526static void null_free_dev(struct nullb_device *dev) 527{ 528 if (!dev) 529 return; 530 531 null_zone_exit(dev); 532 badblocks_exit(&dev->badblocks); 533 kfree(dev); 534} 535 536static void put_tag(struct nullb_queue *nq, unsigned int tag) 537{ 538 clear_bit_unlock(tag, nq->tag_map); 539 540 if (waitqueue_active(&nq->wait)) 541 wake_up(&nq->wait); 542} 543 544static unsigned int get_tag(struct nullb_queue *nq) 545{ 546 unsigned int tag; 547 548 do { 549 tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); 550 if (tag >= nq->queue_depth) 551 return -1U; 552 } while (test_and_set_bit_lock(tag, nq->tag_map)); 553 554 return tag; 555} 556 557static void free_cmd(struct nullb_cmd *cmd) 558{ 559 put_tag(cmd->nq, cmd->tag); 560} 561 562static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); 563 564static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) 565{ 566 struct nullb_cmd *cmd; 567 unsigned int tag; 568 569 tag = get_tag(nq); 570 if (tag != -1U) { 571 cmd = &nq->cmds[tag]; 572 cmd->tag = tag; 573 cmd->nq = nq; 574 if (nq->dev->irqmode == NULL_IRQ_TIMER) { 575 hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, 576 HRTIMER_MODE_REL); 577 cmd->timer.function = null_cmd_timer_expired; 578 } 579 return cmd; 580 } 581 582 return NULL; 583} 584 585static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) 586{ 587 struct nullb_cmd *cmd; 588 DEFINE_WAIT(wait); 589 590 cmd = __alloc_cmd(nq); 591 if (cmd || !can_wait) 592 return cmd; 593 594 do { 595 prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); 596 cmd = __alloc_cmd(nq); 597 if (cmd) 598 break; 599 600 io_schedule(); 601 } while (1); 602 603 finish_wait(&nq->wait, &wait); 604 return cmd; 605} 606 607static void end_cmd(struct nullb_cmd *cmd) 608{ 609 int queue_mode = cmd->nq->dev->queue_mode; 610 611 switch (queue_mode) { 612 case NULL_Q_MQ: 613 blk_mq_end_request(cmd->rq, cmd->error); 614 return; 615 case NULL_Q_BIO: 616 cmd->bio->bi_status = cmd->error; 617 bio_endio(cmd->bio); 618 break; 619 } 620 621 free_cmd(cmd); 622} 623 624static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) 625{ 626 end_cmd(container_of(timer, struct nullb_cmd, timer)); 627 628 return HRTIMER_NORESTART; 629} 630 631static void null_cmd_end_timer(struct nullb_cmd *cmd) 632{ 633 ktime_t kt = cmd->nq->dev->completion_nsec; 634 635 hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); 636} 637 638static void null_softirq_done_fn(struct request *rq) 639{ 640 struct nullb *nullb = rq->q->queuedata; 641 642 if (nullb->dev->queue_mode == NULL_Q_MQ) 643 end_cmd(blk_mq_rq_to_pdu(rq)); 644 else 645 end_cmd(rq->special); 646} 647 648static struct nullb_page *null_alloc_page(gfp_t gfp_flags) 649{ 650 struct nullb_page *t_page; 651 652 t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); 653 if (!t_page) 654 goto out; 655 656 t_page->page = alloc_pages(gfp_flags, 0); 657 if (!t_page->page) 658 goto out_freepage; 659 660 memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); 661 return t_page; 662out_freepage: 663 kfree(t_page); 664out: 665 return NULL; 666} 667 668static void null_free_page(struct nullb_page *t_page) 669{ 670 __set_bit(NULLB_PAGE_FREE, t_page->bitmap); 671 if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) 672 return; 673 __free_page(t_page->page); 674 kfree(t_page); 675} 676 677static bool null_page_empty(struct nullb_page *page) 678{ 679 int size = MAP_SZ - 2; 680 681 return find_first_bit(page->bitmap, size) == size; 682} 683 684static void null_free_sector(struct nullb *nullb, sector_t sector, 685 bool is_cache) 686{ 687 unsigned int sector_bit; 688 u64 idx; 689 struct nullb_page *t_page, *ret; 690 struct radix_tree_root *root; 691 692 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 693 idx = sector >> PAGE_SECTORS_SHIFT; 694 sector_bit = (sector & SECTOR_MASK); 695 696 t_page = radix_tree_lookup(root, idx); 697 if (t_page) { 698 __clear_bit(sector_bit, t_page->bitmap); 699 700 if (null_page_empty(t_page)) { 701 ret = radix_tree_delete_item(root, idx, t_page); 702 WARN_ON(ret != t_page); 703 null_free_page(ret); 704 if (is_cache) 705 nullb->dev->curr_cache -= PAGE_SIZE; 706 } 707 } 708} 709 710static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, 711 struct nullb_page *t_page, bool is_cache) 712{ 713 struct radix_tree_root *root; 714 715 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 716 717 if (radix_tree_insert(root, idx, t_page)) { 718 null_free_page(t_page); 719 t_page = radix_tree_lookup(root, idx); 720 WARN_ON(!t_page || t_page->page->index != idx); 721 } else if (is_cache) 722 nullb->dev->curr_cache += PAGE_SIZE; 723 724 return t_page; 725} 726 727static void null_free_device_storage(struct nullb_device *dev, bool is_cache) 728{ 729 unsigned long pos = 0; 730 int nr_pages; 731 struct nullb_page *ret, *t_pages[FREE_BATCH]; 732 struct radix_tree_root *root; 733 734 root = is_cache ? &dev->cache : &dev->data; 735 736 do { 737 int i; 738 739 nr_pages = radix_tree_gang_lookup(root, 740 (void **)t_pages, pos, FREE_BATCH); 741 742 for (i = 0; i < nr_pages; i++) { 743 pos = t_pages[i]->page->index; 744 ret = radix_tree_delete_item(root, pos, t_pages[i]); 745 WARN_ON(ret != t_pages[i]); 746 null_free_page(ret); 747 } 748 749 pos++; 750 } while (nr_pages == FREE_BATCH); 751 752 if (is_cache) 753 dev->curr_cache = 0; 754} 755 756static struct nullb_page *__null_lookup_page(struct nullb *nullb, 757 sector_t sector, bool for_write, bool is_cache) 758{ 759 unsigned int sector_bit; 760 u64 idx; 761 struct nullb_page *t_page; 762 struct radix_tree_root *root; 763 764 idx = sector >> PAGE_SECTORS_SHIFT; 765 sector_bit = (sector & SECTOR_MASK); 766 767 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 768 t_page = radix_tree_lookup(root, idx); 769 WARN_ON(t_page && t_page->page->index != idx); 770 771 if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) 772 return t_page; 773 774 return NULL; 775} 776 777static struct nullb_page *null_lookup_page(struct nullb *nullb, 778 sector_t sector, bool for_write, bool ignore_cache) 779{ 780 struct nullb_page *page = NULL; 781 782 if (!ignore_cache) 783 page = __null_lookup_page(nullb, sector, for_write, true); 784 if (page) 785 return page; 786 return __null_lookup_page(nullb, sector, for_write, false); 787} 788 789static struct nullb_page *null_insert_page(struct nullb *nullb, 790 sector_t sector, bool ignore_cache) 791 __releases(&nullb->lock) 792 __acquires(&nullb->lock) 793{ 794 u64 idx; 795 struct nullb_page *t_page; 796 797 t_page = null_lookup_page(nullb, sector, true, ignore_cache); 798 if (t_page) 799 return t_page; 800 801 spin_unlock_irq(&nullb->lock); 802 803 t_page = null_alloc_page(GFP_NOIO); 804 if (!t_page) 805 goto out_lock; 806 807 if (radix_tree_preload(GFP_NOIO)) 808 goto out_freepage; 809 810 spin_lock_irq(&nullb->lock); 811 idx = sector >> PAGE_SECTORS_SHIFT; 812 t_page->page->index = idx; 813 t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); 814 radix_tree_preload_end(); 815 816 return t_page; 817out_freepage: 818 null_free_page(t_page); 819out_lock: 820 spin_lock_irq(&nullb->lock); 821 return null_lookup_page(nullb, sector, true, ignore_cache); 822} 823 824static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) 825{ 826 int i; 827 unsigned int offset; 828 u64 idx; 829 struct nullb_page *t_page, *ret; 830 void *dst, *src; 831 832 idx = c_page->page->index; 833 834 t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); 835 836 __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); 837 if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { 838 null_free_page(c_page); 839 if (t_page && null_page_empty(t_page)) { 840 ret = radix_tree_delete_item(&nullb->dev->data, 841 idx, t_page); 842 null_free_page(t_page); 843 } 844 return 0; 845 } 846 847 if (!t_page) 848 return -ENOMEM; 849 850 src = kmap_atomic(c_page->page); 851 dst = kmap_atomic(t_page->page); 852 853 for (i = 0; i < PAGE_SECTORS; 854 i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { 855 if (test_bit(i, c_page->bitmap)) { 856 offset = (i << SECTOR_SHIFT); 857 memcpy(dst + offset, src + offset, 858 nullb->dev->blocksize); 859 __set_bit(i, t_page->bitmap); 860 } 861 } 862 863 kunmap_atomic(dst); 864 kunmap_atomic(src); 865 866 ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); 867 null_free_page(ret); 868 nullb->dev->curr_cache -= PAGE_SIZE; 869 870 return 0; 871} 872 873static int null_make_cache_space(struct nullb *nullb, unsigned long n) 874{ 875 int i, err, nr_pages; 876 struct nullb_page *c_pages[FREE_BATCH]; 877 unsigned long flushed = 0, one_round; 878 879again: 880 if ((nullb->dev->cache_size * 1024 * 1024) > 881 nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) 882 return 0; 883 884 nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, 885 (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); 886 /* 887 * nullb_flush_cache_page could unlock before using the c_pages. To 888 * avoid race, we don't allow page free 889 */ 890 for (i = 0; i < nr_pages; i++) { 891 nullb->cache_flush_pos = c_pages[i]->page->index; 892 /* 893 * We found the page which is being flushed to disk by other 894 * threads 895 */ 896 if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) 897 c_pages[i] = NULL; 898 else 899 __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); 900 } 901 902 one_round = 0; 903 for (i = 0; i < nr_pages; i++) { 904 if (c_pages[i] == NULL) 905 continue; 906 err = null_flush_cache_page(nullb, c_pages[i]); 907 if (err) 908 return err; 909 one_round++; 910 } 911 flushed += one_round << PAGE_SHIFT; 912 913 if (n > flushed) { 914 if (nr_pages == 0) 915 nullb->cache_flush_pos = 0; 916 if (one_round == 0) { 917 /* give other threads a chance */ 918 spin_unlock_irq(&nullb->lock); 919 spin_lock_irq(&nullb->lock); 920 } 921 goto again; 922 } 923 return 0; 924} 925 926static int copy_to_nullb(struct nullb *nullb, struct page *source, 927 unsigned int off, sector_t sector, size_t n, bool is_fua) 928{ 929 size_t temp, count = 0; 930 unsigned int offset; 931 struct nullb_page *t_page; 932 void *dst, *src; 933 934 while (count < n) { 935 temp = min_t(size_t, nullb->dev->blocksize, n - count); 936 937 if (null_cache_active(nullb) && !is_fua) 938 null_make_cache_space(nullb, PAGE_SIZE); 939 940 offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; 941 t_page = null_insert_page(nullb, sector, 942 !null_cache_active(nullb) || is_fua); 943 if (!t_page) 944 return -ENOSPC; 945 946 src = kmap_atomic(source); 947 dst = kmap_atomic(t_page->page); 948 memcpy(dst + offset, src + off + count, temp); 949 kunmap_atomic(dst); 950 kunmap_atomic(src); 951 952 __set_bit(sector & SECTOR_MASK, t_page->bitmap); 953 954 if (is_fua) 955 null_free_sector(nullb, sector, true); 956 957 count += temp; 958 sector += temp >> SECTOR_SHIFT; 959 } 960 return 0; 961} 962 963static int copy_from_nullb(struct nullb *nullb, struct page *dest, 964 unsigned int off, sector_t sector, size_t n) 965{ 966 size_t temp, count = 0; 967 unsigned int offset; 968 struct nullb_page *t_page; 969 void *dst, *src; 970 971 while (count < n) { 972 temp = min_t(size_t, nullb->dev->blocksize, n - count); 973 974 offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; 975 t_page = null_lookup_page(nullb, sector, false, 976 !null_cache_active(nullb)); 977 978 dst = kmap_atomic(dest); 979 if (!t_page) { 980 memset(dst + off + count, 0, temp); 981 goto next; 982 } 983 src = kmap_atomic(t_page->page); 984 memcpy(dst + off + count, src + offset, temp); 985 kunmap_atomic(src); 986next: 987 kunmap_atomic(dst); 988 989 count += temp; 990 sector += temp >> SECTOR_SHIFT; 991 } 992 return 0; 993} 994 995static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) 996{ 997 size_t temp; 998 999 spin_lock_irq(&nullb->lock); 1000 while (n > 0) { 1001 temp = min_t(size_t, n, nullb->dev->blocksize); 1002 null_free_sector(nullb, sector, false); 1003 if (null_cache_active(nullb)) 1004 null_free_sector(nullb, sector, true); 1005 sector += temp >> SECTOR_SHIFT; 1006 n -= temp; 1007 } 1008 spin_unlock_irq(&nullb->lock); 1009} 1010 1011static int null_handle_flush(struct nullb *nullb) 1012{ 1013 int err; 1014 1015 if (!null_cache_active(nullb)) 1016 return 0; 1017 1018 spin_lock_irq(&nullb->lock); 1019 while (true) { 1020 err = null_make_cache_space(nullb, 1021 nullb->dev->cache_size * 1024 * 1024); 1022 if (err || nullb->dev->curr_cache == 0) 1023 break; 1024 } 1025 1026 WARN_ON(!radix_tree_empty(&nullb->dev->cache)); 1027 spin_unlock_irq(&nullb->lock); 1028 return err; 1029} 1030 1031static int null_transfer(struct nullb *nullb, struct page *page, 1032 unsigned int len, unsigned int off, bool is_write, sector_t sector, 1033 bool is_fua) 1034{ 1035 int err = 0; 1036 1037 if (!is_write) { 1038 err = copy_from_nullb(nullb, page, off, sector, len); 1039 flush_dcache_page(page); 1040 } else { 1041 flush_dcache_page(page); 1042 err = copy_to_nullb(nullb, page, off, sector, len, is_fua); 1043 } 1044 1045 return err; 1046} 1047 1048static int null_handle_rq(struct nullb_cmd *cmd) 1049{ 1050 struct request *rq = cmd->rq; 1051 struct nullb *nullb = cmd->nq->dev->nullb; 1052 int err; 1053 unsigned int len; 1054 sector_t sector; 1055 struct req_iterator iter; 1056 struct bio_vec bvec; 1057 1058 sector = blk_rq_pos(rq); 1059 1060 if (req_op(rq) == REQ_OP_DISCARD) { 1061 null_handle_discard(nullb, sector, blk_rq_bytes(rq)); 1062 return 0; 1063 } 1064 1065 spin_lock_irq(&nullb->lock); 1066 rq_for_each_segment(bvec, rq, iter) { 1067 len = bvec.bv_len; 1068 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1069 op_is_write(req_op(rq)), sector, 1070 req_op(rq) & REQ_FUA); 1071 if (err) { 1072 spin_unlock_irq(&nullb->lock); 1073 return err; 1074 } 1075 sector += len >> SECTOR_SHIFT; 1076 } 1077 spin_unlock_irq(&nullb->lock); 1078 1079 return 0; 1080} 1081 1082static int null_handle_bio(struct nullb_cmd *cmd) 1083{ 1084 struct bio *bio = cmd->bio; 1085 struct nullb *nullb = cmd->nq->dev->nullb; 1086 int err; 1087 unsigned int len; 1088 sector_t sector; 1089 struct bio_vec bvec; 1090 struct bvec_iter iter; 1091 1092 sector = bio->bi_iter.bi_sector; 1093 1094 if (bio_op(bio) == REQ_OP_DISCARD) { 1095 null_handle_discard(nullb, sector, 1096 bio_sectors(bio) << SECTOR_SHIFT); 1097 return 0; 1098 } 1099 1100 spin_lock_irq(&nullb->lock); 1101 bio_for_each_segment(bvec, bio, iter) { 1102 len = bvec.bv_len; 1103 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1104 op_is_write(bio_op(bio)), sector, 1105 bio_op(bio) & REQ_FUA); 1106 if (err) { 1107 spin_unlock_irq(&nullb->lock); 1108 return err; 1109 } 1110 sector += len >> SECTOR_SHIFT; 1111 } 1112 spin_unlock_irq(&nullb->lock); 1113 return 0; 1114} 1115 1116static void null_stop_queue(struct nullb *nullb) 1117{ 1118 struct request_queue *q = nullb->q; 1119 1120 if (nullb->dev->queue_mode == NULL_Q_MQ) 1121 blk_mq_stop_hw_queues(q); 1122} 1123 1124static void null_restart_queue_async(struct nullb *nullb) 1125{ 1126 struct request_queue *q = nullb->q; 1127 1128 if (nullb->dev->queue_mode == NULL_Q_MQ) 1129 blk_mq_start_stopped_hw_queues(q, true); 1130} 1131 1132static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) 1133{ 1134 struct nullb_device *dev = cmd->nq->dev; 1135 struct nullb *nullb = dev->nullb; 1136 int err = 0; 1137 1138 if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { 1139 struct request *rq = cmd->rq; 1140 1141 if (!hrtimer_active(&nullb->bw_timer)) 1142 hrtimer_restart(&nullb->bw_timer); 1143 1144 if (atomic_long_sub_return(blk_rq_bytes(rq), 1145 &nullb->cur_bytes) < 0) { 1146 null_stop_queue(nullb); 1147 /* race with timer */ 1148 if (atomic_long_read(&nullb->cur_bytes) > 0) 1149 null_restart_queue_async(nullb); 1150 /* requeue request */ 1151 return BLK_STS_DEV_RESOURCE; 1152 } 1153 } 1154 1155 if (nullb->dev->badblocks.shift != -1) { 1156 int bad_sectors; 1157 sector_t sector, size, first_bad; 1158 bool is_flush = true; 1159 1160 if (dev->queue_mode == NULL_Q_BIO && 1161 bio_op(cmd->bio) != REQ_OP_FLUSH) { 1162 is_flush = false; 1163 sector = cmd->bio->bi_iter.bi_sector; 1164 size = bio_sectors(cmd->bio); 1165 } 1166 if (dev->queue_mode != NULL_Q_BIO && 1167 req_op(cmd->rq) != REQ_OP_FLUSH) { 1168 is_flush = false; 1169 sector = blk_rq_pos(cmd->rq); 1170 size = blk_rq_sectors(cmd->rq); 1171 } 1172 if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector, 1173 size, &first_bad, &bad_sectors)) { 1174 cmd->error = BLK_STS_IOERR; 1175 goto out; 1176 } 1177 } 1178 1179 if (dev->memory_backed) { 1180 if (dev->queue_mode == NULL_Q_BIO) { 1181 if (bio_op(cmd->bio) == REQ_OP_FLUSH) 1182 err = null_handle_flush(nullb); 1183 else 1184 err = null_handle_bio(cmd); 1185 } else { 1186 if (req_op(cmd->rq) == REQ_OP_FLUSH) 1187 err = null_handle_flush(nullb); 1188 else 1189 err = null_handle_rq(cmd); 1190 } 1191 } 1192 cmd->error = errno_to_blk_status(err); 1193 1194 if (!cmd->error && dev->zoned) { 1195 sector_t sector; 1196 unsigned int nr_sectors; 1197 int op; 1198 1199 if (dev->queue_mode == NULL_Q_BIO) { 1200 op = bio_op(cmd->bio); 1201 sector = cmd->bio->bi_iter.bi_sector; 1202 nr_sectors = cmd->bio->bi_iter.bi_size >> 9; 1203 } else { 1204 op = req_op(cmd->rq); 1205 sector = blk_rq_pos(cmd->rq); 1206 nr_sectors = blk_rq_sectors(cmd->rq); 1207 } 1208 1209 if (op == REQ_OP_WRITE) 1210 null_zone_write(cmd, sector, nr_sectors); 1211 else if (op == REQ_OP_ZONE_RESET) 1212 null_zone_reset(cmd, sector); 1213 } 1214out: 1215 /* Complete IO by inline, softirq or timer */ 1216 switch (dev->irqmode) { 1217 case NULL_IRQ_SOFTIRQ: 1218 switch (dev->queue_mode) { 1219 case NULL_Q_MQ: 1220 blk_mq_complete_request(cmd->rq); 1221 break; 1222 case NULL_Q_BIO: 1223 /* 1224 * XXX: no proper submitting cpu information available. 1225 */ 1226 end_cmd(cmd); 1227 break; 1228 } 1229 break; 1230 case NULL_IRQ_NONE: 1231 end_cmd(cmd); 1232 break; 1233 case NULL_IRQ_TIMER: 1234 null_cmd_end_timer(cmd); 1235 break; 1236 } 1237 return BLK_STS_OK; 1238} 1239 1240static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) 1241{ 1242 struct nullb *nullb = container_of(timer, struct nullb, bw_timer); 1243 ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); 1244 unsigned int mbps = nullb->dev->mbps; 1245 1246 if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) 1247 return HRTIMER_NORESTART; 1248 1249 atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); 1250 null_restart_queue_async(nullb); 1251 1252 hrtimer_forward_now(&nullb->bw_timer, timer_interval); 1253 1254 return HRTIMER_RESTART; 1255} 1256 1257static void nullb_setup_bwtimer(struct nullb *nullb) 1258{ 1259 ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); 1260 1261 hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1262 nullb->bw_timer.function = nullb_bwtimer_fn; 1263 atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); 1264 hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); 1265} 1266 1267static struct nullb_queue *nullb_to_queue(struct nullb *nullb) 1268{ 1269 int index = 0; 1270 1271 if (nullb->nr_queues != 1) 1272 index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); 1273 1274 return &nullb->queues[index]; 1275} 1276 1277static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) 1278{ 1279 struct nullb *nullb = q->queuedata; 1280 struct nullb_queue *nq = nullb_to_queue(nullb); 1281 struct nullb_cmd *cmd; 1282 1283 cmd = alloc_cmd(nq, 1); 1284 cmd->bio = bio; 1285 1286 null_handle_cmd(cmd); 1287 return BLK_QC_T_NONE; 1288} 1289 1290static bool should_timeout_request(struct request *rq) 1291{ 1292#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 1293 if (g_timeout_str[0]) 1294 return should_fail(&null_timeout_attr, 1); 1295#endif 1296 return false; 1297} 1298 1299static bool should_requeue_request(struct request *rq) 1300{ 1301#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 1302 if (g_requeue_str[0]) 1303 return should_fail(&null_requeue_attr, 1); 1304#endif 1305 return false; 1306} 1307 1308static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) 1309{ 1310 pr_info("null: rq %p timed out\n", rq); 1311 blk_mq_complete_request(rq); 1312 return BLK_EH_DONE; 1313} 1314 1315static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, 1316 const struct blk_mq_queue_data *bd) 1317{ 1318 struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); 1319 struct nullb_queue *nq = hctx->driver_data; 1320 1321 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); 1322 1323 if (nq->dev->irqmode == NULL_IRQ_TIMER) { 1324 hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1325 cmd->timer.function = null_cmd_timer_expired; 1326 } 1327 cmd->rq = bd->rq; 1328 cmd->nq = nq; 1329 1330 blk_mq_start_request(bd->rq); 1331 1332 if (should_requeue_request(bd->rq)) { 1333 /* 1334 * Alternate between hitting the core BUSY path, and the 1335 * driver driven requeue path 1336 */ 1337 nq->requeue_selection++; 1338 if (nq->requeue_selection & 1) 1339 return BLK_STS_RESOURCE; 1340 else { 1341 blk_mq_requeue_request(bd->rq, true); 1342 return BLK_STS_OK; 1343 } 1344 } 1345 if (should_timeout_request(bd->rq)) 1346 return BLK_STS_OK; 1347 1348 return null_handle_cmd(cmd); 1349} 1350 1351static const struct blk_mq_ops null_mq_ops = { 1352 .queue_rq = null_queue_rq, 1353 .complete = null_softirq_done_fn, 1354 .timeout = null_timeout_rq, 1355}; 1356 1357static void cleanup_queue(struct nullb_queue *nq) 1358{ 1359 kfree(nq->tag_map); 1360 kfree(nq->cmds); 1361} 1362 1363static void cleanup_queues(struct nullb *nullb) 1364{ 1365 int i; 1366 1367 for (i = 0; i < nullb->nr_queues; i++) 1368 cleanup_queue(&nullb->queues[i]); 1369 1370 kfree(nullb->queues); 1371} 1372 1373static void null_del_dev(struct nullb *nullb) 1374{ 1375 struct nullb_device *dev = nullb->dev; 1376 1377 ida_simple_remove(&nullb_indexes, nullb->index); 1378 1379 list_del_init(&nullb->list); 1380 1381 del_gendisk(nullb->disk); 1382 1383 if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { 1384 hrtimer_cancel(&nullb->bw_timer); 1385 atomic_long_set(&nullb->cur_bytes, LONG_MAX); 1386 null_restart_queue_async(nullb); 1387 } 1388 1389 blk_cleanup_queue(nullb->q); 1390 if (dev->queue_mode == NULL_Q_MQ && 1391 nullb->tag_set == &nullb->__tag_set) 1392 blk_mq_free_tag_set(nullb->tag_set); 1393 put_disk(nullb->disk); 1394 cleanup_queues(nullb); 1395 if (null_cache_active(nullb)) 1396 null_free_device_storage(nullb->dev, true); 1397 kfree(nullb); 1398 dev->nullb = NULL; 1399} 1400 1401static void null_config_discard(struct nullb *nullb) 1402{ 1403 if (nullb->dev->discard == false) 1404 return; 1405 nullb->q->limits.discard_granularity = nullb->dev->blocksize; 1406 nullb->q->limits.discard_alignment = nullb->dev->blocksize; 1407 blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); 1408 blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); 1409} 1410 1411static int null_open(struct block_device *bdev, fmode_t mode) 1412{ 1413 return 0; 1414} 1415 1416static void null_release(struct gendisk *disk, fmode_t mode) 1417{ 1418} 1419 1420static const struct block_device_operations null_fops = { 1421 .owner = THIS_MODULE, 1422 .open = null_open, 1423 .release = null_release, 1424 .report_zones = null_zone_report, 1425}; 1426 1427static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) 1428{ 1429 BUG_ON(!nullb); 1430 BUG_ON(!nq); 1431 1432 init_waitqueue_head(&nq->wait); 1433 nq->queue_depth = nullb->queue_depth; 1434 nq->dev = nullb->dev; 1435} 1436 1437static void null_init_queues(struct nullb *nullb) 1438{ 1439 struct request_queue *q = nullb->q; 1440 struct blk_mq_hw_ctx *hctx; 1441 struct nullb_queue *nq; 1442 int i; 1443 1444 queue_for_each_hw_ctx(q, hctx, i) { 1445 if (!hctx->nr_ctx || !hctx->tags) 1446 continue; 1447 nq = &nullb->queues[i]; 1448 hctx->driver_data = nq; 1449 null_init_queue(nullb, nq); 1450 nullb->nr_queues++; 1451 } 1452} 1453 1454static int setup_commands(struct nullb_queue *nq) 1455{ 1456 struct nullb_cmd *cmd; 1457 int i, tag_size; 1458 1459 nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); 1460 if (!nq->cmds) 1461 return -ENOMEM; 1462 1463 tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; 1464 nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); 1465 if (!nq->tag_map) { 1466 kfree(nq->cmds); 1467 return -ENOMEM; 1468 } 1469 1470 for (i = 0; i < nq->queue_depth; i++) { 1471 cmd = &nq->cmds[i]; 1472 INIT_LIST_HEAD(&cmd->list); 1473 cmd->ll_list.next = NULL; 1474 cmd->tag = -1U; 1475 } 1476 1477 return 0; 1478} 1479 1480static int setup_queues(struct nullb *nullb) 1481{ 1482 nullb->queues = kcalloc(nullb->dev->submit_queues, 1483 sizeof(struct nullb_queue), 1484 GFP_KERNEL); 1485 if (!nullb->queues) 1486 return -ENOMEM; 1487 1488 nullb->nr_queues = 0; 1489 nullb->queue_depth = nullb->dev->hw_queue_depth; 1490 1491 return 0; 1492} 1493 1494static int init_driver_queues(struct nullb *nullb) 1495{ 1496 struct nullb_queue *nq; 1497 int i, ret = 0; 1498 1499 for (i = 0; i < nullb->dev->submit_queues; i++) { 1500 nq = &nullb->queues[i]; 1501 1502 null_init_queue(nullb, nq); 1503 1504 ret = setup_commands(nq); 1505 if (ret) 1506 return ret; 1507 nullb->nr_queues++; 1508 } 1509 return 0; 1510} 1511 1512static int null_gendisk_register(struct nullb *nullb) 1513{ 1514 struct gendisk *disk; 1515 sector_t size; 1516 1517 disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); 1518 if (!disk) 1519 return -ENOMEM; 1520 size = (sector_t)nullb->dev->size * 1024 * 1024ULL; 1521 set_capacity(disk, size >> 9); 1522 1523 disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; 1524 disk->major = null_major; 1525 disk->first_minor = nullb->index; 1526 disk->fops = &null_fops; 1527 disk->private_data = nullb; 1528 disk->queue = nullb->q; 1529 strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); 1530 1531 if (nullb->dev->zoned) { 1532 int ret = blk_revalidate_disk_zones(disk); 1533 1534 if (ret != 0) 1535 return ret; 1536 } 1537 1538 add_disk(disk); 1539 return 0; 1540} 1541 1542static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) 1543{ 1544 set->ops = &null_mq_ops; 1545 set->nr_hw_queues = nullb ? nullb->dev->submit_queues : 1546 g_submit_queues; 1547 set->queue_depth = nullb ? nullb->dev->hw_queue_depth : 1548 g_hw_queue_depth; 1549 set->numa_node = nullb ? nullb->dev->home_node : g_home_node; 1550 set->cmd_size = sizeof(struct nullb_cmd); 1551 set->flags = BLK_MQ_F_SHOULD_MERGE; 1552 if (g_no_sched) 1553 set->flags |= BLK_MQ_F_NO_SCHED; 1554 set->driver_data = NULL; 1555 1556 if ((nullb && nullb->dev->blocking) || g_blocking) 1557 set->flags |= BLK_MQ_F_BLOCKING; 1558 1559 return blk_mq_alloc_tag_set(set); 1560} 1561 1562static void null_validate_conf(struct nullb_device *dev) 1563{ 1564 dev->blocksize = round_down(dev->blocksize, 512); 1565 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); 1566 1567 if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { 1568 if (dev->submit_queues != nr_online_nodes) 1569 dev->submit_queues = nr_online_nodes; 1570 } else if (dev->submit_queues > nr_cpu_ids) 1571 dev->submit_queues = nr_cpu_ids; 1572 else if (dev->submit_queues == 0) 1573 dev->submit_queues = 1; 1574 1575 dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); 1576 dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); 1577 1578 /* Do memory allocation, so set blocking */ 1579 if (dev->memory_backed) 1580 dev->blocking = true; 1581 else /* cache is meaningless */ 1582 dev->cache_size = 0; 1583 dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, 1584 dev->cache_size); 1585 dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); 1586 /* can not stop a queue */ 1587 if (dev->queue_mode == NULL_Q_BIO) 1588 dev->mbps = 0; 1589} 1590 1591#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 1592static bool __null_setup_fault(struct fault_attr *attr, char *str) 1593{ 1594 if (!str[0]) 1595 return true; 1596 1597 if (!setup_fault_attr(attr, str)) 1598 return false; 1599 1600 attr->verbose = 0; 1601 return true; 1602} 1603#endif 1604 1605static bool null_setup_fault(void) 1606{ 1607#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 1608 if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) 1609 return false; 1610 if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) 1611 return false; 1612#endif 1613 return true; 1614} 1615 1616static int null_add_dev(struct nullb_device *dev) 1617{ 1618 struct nullb *nullb; 1619 int rv; 1620 1621 null_validate_conf(dev); 1622 1623 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); 1624 if (!nullb) { 1625 rv = -ENOMEM; 1626 goto out; 1627 } 1628 nullb->dev = dev; 1629 dev->nullb = nullb; 1630 1631 spin_lock_init(&nullb->lock); 1632 1633 rv = setup_queues(nullb); 1634 if (rv) 1635 goto out_free_nullb; 1636 1637 if (dev->queue_mode == NULL_Q_MQ) { 1638 if (shared_tags) { 1639 nullb->tag_set = &tag_set; 1640 rv = 0; 1641 } else { 1642 nullb->tag_set = &nullb->__tag_set; 1643 rv = null_init_tag_set(nullb, nullb->tag_set); 1644 } 1645 1646 if (rv) 1647 goto out_cleanup_queues; 1648 1649 if (!null_setup_fault()) 1650 goto out_cleanup_queues; 1651 1652 nullb->tag_set->timeout = 5 * HZ; 1653 nullb->q = blk_mq_init_queue(nullb->tag_set); 1654 if (IS_ERR(nullb->q)) { 1655 rv = -ENOMEM; 1656 goto out_cleanup_tags; 1657 } 1658 null_init_queues(nullb); 1659 } else if (dev->queue_mode == NULL_Q_BIO) { 1660 nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node, 1661 NULL); 1662 if (!nullb->q) { 1663 rv = -ENOMEM; 1664 goto out_cleanup_queues; 1665 } 1666 blk_queue_make_request(nullb->q, null_queue_bio); 1667 rv = init_driver_queues(nullb); 1668 if (rv) 1669 goto out_cleanup_blk_queue; 1670 } 1671 1672 if (dev->mbps) { 1673 set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); 1674 nullb_setup_bwtimer(nullb); 1675 } 1676 1677 if (dev->cache_size > 0) { 1678 set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); 1679 blk_queue_write_cache(nullb->q, true, true); 1680 blk_queue_flush_queueable(nullb->q, true); 1681 } 1682 1683 if (dev->zoned) { 1684 rv = null_zone_init(dev); 1685 if (rv) 1686 goto out_cleanup_blk_queue; 1687 1688 blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects); 1689 nullb->q->limits.zoned = BLK_ZONED_HM; 1690 } 1691 1692 nullb->q->queuedata = nullb; 1693 blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); 1694 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); 1695 1696 mutex_lock(&lock); 1697 nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); 1698 dev->index = nullb->index; 1699 mutex_unlock(&lock); 1700 1701 blk_queue_logical_block_size(nullb->q, dev->blocksize); 1702 blk_queue_physical_block_size(nullb->q, dev->blocksize); 1703 1704 null_config_discard(nullb); 1705 1706 sprintf(nullb->disk_name, "nullb%d", nullb->index); 1707 1708 rv = null_gendisk_register(nullb); 1709 if (rv) 1710 goto out_cleanup_zone; 1711 1712 mutex_lock(&lock); 1713 list_add_tail(&nullb->list, &nullb_list); 1714 mutex_unlock(&lock); 1715 1716 return 0; 1717out_cleanup_zone: 1718 if (dev->zoned) 1719 null_zone_exit(dev); 1720out_cleanup_blk_queue: 1721 blk_cleanup_queue(nullb->q); 1722out_cleanup_tags: 1723 if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) 1724 blk_mq_free_tag_set(nullb->tag_set); 1725out_cleanup_queues: 1726 cleanup_queues(nullb); 1727out_free_nullb: 1728 kfree(nullb); 1729out: 1730 return rv; 1731} 1732 1733static int __init null_init(void) 1734{ 1735 int ret = 0; 1736 unsigned int i; 1737 struct nullb *nullb; 1738 struct nullb_device *dev; 1739 1740 if (g_bs > PAGE_SIZE) { 1741 pr_warn("null_blk: invalid block size\n"); 1742 pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); 1743 g_bs = PAGE_SIZE; 1744 } 1745 1746 if (!is_power_of_2(g_zone_size)) { 1747 pr_err("null_blk: zone_size must be power-of-two\n"); 1748 return -EINVAL; 1749 } 1750 1751 if (g_queue_mode == NULL_Q_RQ) { 1752 pr_err("null_blk: legacy IO path no longer available\n"); 1753 return -EINVAL; 1754 } 1755 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { 1756 if (g_submit_queues != nr_online_nodes) { 1757 pr_warn("null_blk: submit_queues param is set to %u.\n", 1758 nr_online_nodes); 1759 g_submit_queues = nr_online_nodes; 1760 } 1761 } else if (g_submit_queues > nr_cpu_ids) 1762 g_submit_queues = nr_cpu_ids; 1763 else if (g_submit_queues <= 0) 1764 g_submit_queues = 1; 1765 1766 if (g_queue_mode == NULL_Q_MQ && shared_tags) { 1767 ret = null_init_tag_set(NULL, &tag_set); 1768 if (ret) 1769 return ret; 1770 } 1771 1772 config_group_init(&nullb_subsys.su_group); 1773 mutex_init(&nullb_subsys.su_mutex); 1774 1775 ret = configfs_register_subsystem(&nullb_subsys); 1776 if (ret) 1777 goto err_tagset; 1778 1779 mutex_init(&lock); 1780 1781 null_major = register_blkdev(0, "nullb"); 1782 if (null_major < 0) { 1783 ret = null_major; 1784 goto err_conf; 1785 } 1786 1787 for (i = 0; i < nr_devices; i++) { 1788 dev = null_alloc_dev(); 1789 if (!dev) { 1790 ret = -ENOMEM; 1791 goto err_dev; 1792 } 1793 ret = null_add_dev(dev); 1794 if (ret) { 1795 null_free_dev(dev); 1796 goto err_dev; 1797 } 1798 } 1799 1800 pr_info("null: module loaded\n"); 1801 return 0; 1802 1803err_dev: 1804 while (!list_empty(&nullb_list)) { 1805 nullb = list_entry(nullb_list.next, struct nullb, list); 1806 dev = nullb->dev; 1807 null_del_dev(nullb); 1808 null_free_dev(dev); 1809 } 1810 unregister_blkdev(null_major, "nullb"); 1811err_conf: 1812 configfs_unregister_subsystem(&nullb_subsys); 1813err_tagset: 1814 if (g_queue_mode == NULL_Q_MQ && shared_tags) 1815 blk_mq_free_tag_set(&tag_set); 1816 return ret; 1817} 1818 1819static void __exit null_exit(void) 1820{ 1821 struct nullb *nullb; 1822 1823 configfs_unregister_subsystem(&nullb_subsys); 1824 1825 unregister_blkdev(null_major, "nullb"); 1826 1827 mutex_lock(&lock); 1828 while (!list_empty(&nullb_list)) { 1829 struct nullb_device *dev; 1830 1831 nullb = list_entry(nullb_list.next, struct nullb, list); 1832 dev = nullb->dev; 1833 null_del_dev(nullb); 1834 null_free_dev(dev); 1835 } 1836 mutex_unlock(&lock); 1837 1838 if (g_queue_mode == NULL_Q_MQ && shared_tags) 1839 blk_mq_free_tag_set(&tag_set); 1840} 1841 1842module_init(null_init); 1843module_exit(null_exit); 1844 1845MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); 1846MODULE_LICENSE("GPL");