Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

block: simplify bdev/disk lookup in blkdev_get

To simplify block device lookup and a few other upcoming areas, make sure
that we always have a struct block_device available for each disk and
each partition, and only find existing block devices in bdget. The only
downside of this is that each device and partition uses a little more
memory. The upside will be that a lot of code can be simplified.

With that, all we need to do to look up the block device is look up the inode
and do a few sanity checks on the gendisk, instead of the separate lookup
for the gendisk. For blk-cgroup which wants to access a gendisk without
opening it, a new blkdev_{get,put}_no_open low-level interface is added
to replace the previous get_gendisk use.

Note that the change to look up the block device directly instead of the
two-step lookup using struct gendisk causes a subtle change in behavior:
accessing a non-existing partition on an existing block device can now
cause a call to request_module. That call is harmless, and in practice
no recent system will access these nodes as they aren't created by udev
and static /dev/ setups are unusual.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Christoph Hellwig and committed by
Jens Axboe
22ae8ce8 4e7b5671

+195 -320
+21 -21
block/blk-cgroup.c
··· 556 556 } 557 557 558 558 /** 559 - * blkg_conf_prep - parse and prepare for per-blkg config update 559 + * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update 560 560 * @inputp: input string pointer 561 561 * 562 562 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update 563 - * from @input and get and return the matching gendisk. *@inputp is 563 + * from @input and get and return the matching bdev. *@inputp is 564 564 * updated to point past the device node prefix. Returns an ERR_PTR() 565 565 * value on error. 566 566 * 567 567 * Use this function iff blkg_conf_prep() can't be used for some reason. 568 568 */ 569 - struct gendisk *blkcg_conf_get_disk(char **inputp) 569 + struct block_device *blkcg_conf_open_bdev(char **inputp) 570 570 { 571 571 char *input = *inputp; 572 572 unsigned int major, minor; 573 - struct gendisk *disk; 574 - int key_len, part; 573 + struct block_device *bdev; 574 + int key_len; 575 575 576 576 if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) 577 577 return ERR_PTR(-EINVAL); ··· 581 581 return ERR_PTR(-EINVAL); 582 582 input = skip_spaces(input); 583 583 584 - disk = get_gendisk(MKDEV(major, minor), &part); 585 - if (!disk) 584 + bdev = blkdev_get_no_open(MKDEV(major, minor)); 585 + if (!bdev) 586 586 return ERR_PTR(-ENODEV); 587 - if (part) { 588 - put_disk_and_module(disk); 587 + if (bdev_is_partition(bdev)) { 588 + blkdev_put_no_open(bdev); 589 589 return ERR_PTR(-ENODEV); 590 590 } 591 591 592 592 *inputp = input; 593 - return disk; 593 + return bdev; 594 594 } 595 595 596 596 /** ··· 607 607 */ 608 608 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, 609 609 char *input, struct blkg_conf_ctx *ctx) 610 - __acquires(rcu) __acquires(&disk->queue->queue_lock) 610 + __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) 611 611 { 612 - struct gendisk *disk; 612 + struct block_device *bdev; 613 613 struct request_queue *q; 614 614 struct blkcg_gq *blkg; 615 
615 int ret; 616 616 617 - disk = blkcg_conf_get_disk(&input); 618 - if (IS_ERR(disk)) 619 - return PTR_ERR(disk); 617 + bdev = blkcg_conf_open_bdev(&input); 618 + if (IS_ERR(bdev)) 619 + return PTR_ERR(bdev); 620 620 621 - q = disk->queue; 621 + q = bdev->bd_disk->queue; 622 622 623 623 rcu_read_lock(); 624 624 spin_lock_irq(&q->queue_lock); ··· 689 689 goto success; 690 690 } 691 691 success: 692 - ctx->disk = disk; 692 + ctx->bdev = bdev; 693 693 ctx->blkg = blkg; 694 694 ctx->body = input; 695 695 return 0; ··· 700 700 spin_unlock_irq(&q->queue_lock); 701 701 rcu_read_unlock(); 702 702 fail: 703 - put_disk_and_module(disk); 703 + blkdev_put_no_open(bdev); 704 704 /* 705 705 * If queue was bypassing, we should retry. Do so after a 706 706 * short msleep(). It isn't strictly necessary but queue ··· 723 723 * with blkg_conf_prep(). 724 724 */ 725 725 void blkg_conf_finish(struct blkg_conf_ctx *ctx) 726 - __releases(&ctx->disk->queue->queue_lock) __releases(rcu) 726 + __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) 727 727 { 728 - spin_unlock_irq(&ctx->disk->queue->queue_lock); 728 + spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); 729 729 rcu_read_unlock(); 730 - put_disk_and_module(ctx->disk); 730 + blkdev_put_no_open(ctx->bdev); 731 731 } 732 732 EXPORT_SYMBOL_GPL(blkg_conf_finish); 733 733
+18 -18
block/blk-iocost.c
··· 3120 3120 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, 3121 3121 size_t nbytes, loff_t off) 3122 3122 { 3123 - struct gendisk *disk; 3123 + struct block_device *bdev; 3124 3124 struct ioc *ioc; 3125 3125 u32 qos[NR_QOS_PARAMS]; 3126 3126 bool enable, user; 3127 3127 char *p; 3128 3128 int ret; 3129 3129 3130 - disk = blkcg_conf_get_disk(&input); 3131 - if (IS_ERR(disk)) 3132 - return PTR_ERR(disk); 3130 + bdev = blkcg_conf_open_bdev(&input); 3131 + if (IS_ERR(bdev)) 3132 + return PTR_ERR(bdev); 3133 3133 3134 - ioc = q_to_ioc(disk->queue); 3134 + ioc = q_to_ioc(bdev->bd_disk->queue); 3135 3135 if (!ioc) { 3136 - ret = blk_iocost_init(disk->queue); 3136 + ret = blk_iocost_init(bdev->bd_disk->queue); 3137 3137 if (ret) 3138 3138 goto err; 3139 - ioc = q_to_ioc(disk->queue); 3139 + ioc = q_to_ioc(bdev->bd_disk->queue); 3140 3140 } 3141 3141 3142 3142 spin_lock_irq(&ioc->lock); ··· 3231 3231 ioc_refresh_params(ioc, true); 3232 3232 spin_unlock_irq(&ioc->lock); 3233 3233 3234 - put_disk_and_module(disk); 3234 + blkdev_put_no_open(bdev); 3235 3235 return nbytes; 3236 3236 einval: 3237 3237 ret = -EINVAL; 3238 3238 err: 3239 - put_disk_and_module(disk); 3239 + blkdev_put_no_open(bdev); 3240 3240 return ret; 3241 3241 } 3242 3242 ··· 3287 3287 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, 3288 3288 size_t nbytes, loff_t off) 3289 3289 { 3290 - struct gendisk *disk; 3290 + struct block_device *bdev; 3291 3291 struct ioc *ioc; 3292 3292 u64 u[NR_I_LCOEFS]; 3293 3293 bool user; 3294 3294 char *p; 3295 3295 int ret; 3296 3296 3297 - disk = blkcg_conf_get_disk(&input); 3298 - if (IS_ERR(disk)) 3299 - return PTR_ERR(disk); 3297 + bdev = blkcg_conf_open_bdev(&input); 3298 + if (IS_ERR(bdev)) 3299 + return PTR_ERR(bdev); 3300 3300 3301 - ioc = q_to_ioc(disk->queue); 3301 + ioc = q_to_ioc(bdev->bd_disk->queue); 3302 3302 if (!ioc) { 3303 - ret = blk_iocost_init(disk->queue); 3303 + ret = 
blk_iocost_init(bdev->bd_disk->queue); 3304 3304 if (ret) 3305 3305 goto err; 3306 - ioc = q_to_ioc(disk->queue); 3306 + ioc = q_to_ioc(bdev->bd_disk->queue); 3307 3307 } 3308 3308 3309 3309 spin_lock_irq(&ioc->lock); ··· 3356 3356 ioc_refresh_params(ioc, true); 3357 3357 spin_unlock_irq(&ioc->lock); 3358 3358 3359 - put_disk_and_module(disk); 3359 + blkdev_put_no_open(bdev); 3360 3360 return nbytes; 3361 3361 3362 3362 einval: 3363 3363 ret = -EINVAL; 3364 3364 err: 3365 - put_disk_and_module(disk); 3365 + blkdev_put_no_open(bdev); 3366 3366 return ret; 3367 3367 } 3368 3368
+1 -1
block/blk.h
··· 352 352 353 353 int blk_alloc_devt(struct hd_struct *part, dev_t *devt); 354 354 void blk_free_devt(dev_t devt); 355 - void blk_invalidate_devt(dev_t devt); 356 355 char *disk_name(struct gendisk *hd, int partno, char *buf); 357 356 #define ADDPART_FLAG_NONE 0 358 357 #define ADDPART_FLAG_RAID 1 ··· 383 384 { 384 385 free_percpu(part->dkstats); 385 386 kfree(part->info); 387 + bdput(part->bdev); 386 388 percpu_ref_exit(&part->ref); 387 389 } 388 390
+25 -187
block/genhd.c
··· 27 27 28 28 static struct kobject *block_depr; 29 29 30 - static DEFINE_XARRAY(bdev_map); 31 - static DEFINE_MUTEX(bdev_map_lock); 30 + DECLARE_RWSEM(bdev_lookup_sem); 32 31 33 32 /* for extended dynamic devt allocation, currently only one major is used */ 34 33 #define NR_EXT_DEVT (1 << MINORBITS) 35 - 36 - /* For extended devt allocation. ext_devt_lock prevents look up 37 - * results from going away underneath its user. 38 - */ 39 - static DEFINE_SPINLOCK(ext_devt_lock); 40 - static DEFINE_IDR(ext_devt_idr); 34 + static DEFINE_IDA(ext_devt_ida); 41 35 42 36 static void disk_check_events(struct disk_events *ev, 43 37 unsigned int *clearing_ptr); ··· 574 580 return 0; 575 581 } 576 582 577 - /* allocate ext devt */ 578 - idr_preload(GFP_KERNEL); 579 - 580 - spin_lock_bh(&ext_devt_lock); 581 - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); 582 - spin_unlock_bh(&ext_devt_lock); 583 - 584 - idr_preload_end(); 583 + idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); 585 584 if (idx < 0) 586 585 return idx == -ENOSPC ? -EBUSY : idx; 587 586 ··· 593 606 */ 594 607 void blk_free_devt(dev_t devt) 595 608 { 596 - if (devt == MKDEV(0, 0)) 597 - return; 598 - 599 - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { 600 - spin_lock_bh(&ext_devt_lock); 601 - idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); 602 - spin_unlock_bh(&ext_devt_lock); 603 - } 604 - } 605 - 606 - /* 607 - * We invalidate devt by assigning NULL pointer for devt in idr. 
608 - */ 609 - void blk_invalidate_devt(dev_t devt) 610 - { 611 - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { 612 - spin_lock_bh(&ext_devt_lock); 613 - idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt))); 614 - spin_unlock_bh(&ext_devt_lock); 615 - } 609 + if (MAJOR(devt) == BLOCK_EXT_MAJOR) 610 + ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt))); 616 611 } 617 612 618 613 static char *bdevt_str(dev_t devt, char *buf) ··· 607 638 snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); 608 639 609 640 return buf; 610 - } 611 - 612 - static void blk_register_region(struct gendisk *disk) 613 - { 614 - int i; 615 - 616 - mutex_lock(&bdev_map_lock); 617 - for (i = 0; i < disk->minors; i++) { 618 - if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) 619 - WARN_ON_ONCE(1); 620 - } 621 - mutex_unlock(&bdev_map_lock); 622 - } 623 - 624 - static void blk_unregister_region(struct gendisk *disk) 625 - { 626 - int i; 627 - 628 - mutex_lock(&bdev_map_lock); 629 - for (i = 0; i < disk->minors; i++) 630 - xa_erase(&bdev_map, disk_devt(disk) + i); 631 - mutex_unlock(&bdev_map_lock); 632 641 } 633 642 634 643 static void disk_scan_partitions(struct gendisk *disk) ··· 752 805 ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); 753 806 WARN_ON(ret); 754 807 bdi_set_owner(bdi, dev); 755 - blk_register_region(disk); 808 + bdev_add(disk->part0.bdev, devt); 756 809 } 757 810 register_disk(parent, disk, groups); 758 811 if (register_queue) ··· 794 847 __invalidate_device(bdev, true); 795 848 796 849 /* 797 - * Unhash the bdev inode for this device so that it gets evicted as soon 798 - * as last inode reference is dropped. 850 + * Unhash the bdev inode for this device so that it can't be looked 851 + * up any more even if openers still hold references to it. 
799 852 */ 800 853 remove_inode_hash(bdev->bd_inode); 801 854 bdput(bdev); ··· 837 890 * Block lookups of the disk until all bdevs are unhashed and the 838 891 * disk is marked as dead (GENHD_FL_UP cleared). 839 892 */ 840 - down_write(&disk->lookup_sem); 893 + down_write(&bdev_lookup_sem); 894 + 841 895 /* invalidate stuff */ 842 896 disk_part_iter_init(&piter, disk, 843 897 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); ··· 851 903 invalidate_partition(disk, 0); 852 904 set_capacity(disk, 0); 853 905 disk->flags &= ~GENHD_FL_UP; 854 - up_write(&disk->lookup_sem); 906 + up_write(&bdev_lookup_sem); 855 907 856 908 if (!(disk->flags & GENHD_FL_HIDDEN)) { 857 909 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); ··· 864 916 } 865 917 866 918 blk_unregister_queue(disk); 867 - 868 - if (!(disk->flags & GENHD_FL_HIDDEN)) 869 - blk_unregister_region(disk); 870 - /* 871 - * Remove gendisk pointer from idr so that it cannot be looked up 872 - * while RCU period before freeing gendisk is running to prevent 873 - * use-after-free issues. Note that the device number stays 874 - * "in-use" until we really free the gendisk. 
875 - */ 876 - blk_invalidate_devt(disk_devt(disk)); 877 919 878 920 kobject_put(disk->part0.holder_dir); 879 921 kobject_put(disk->slave_dir); ··· 902 964 return badblocks_store(disk->bb, page, len, 0); 903 965 } 904 966 905 - static void request_gendisk_module(dev_t devt) 967 + void blk_request_module(dev_t devt) 906 968 { 907 969 unsigned int major = MAJOR(devt); 908 970 struct blk_major_name **n; ··· 920 982 if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) 921 983 /* Make old-style 2.4 aliases work */ 922 984 request_module("block-major-%d", MAJOR(devt)); 923 - } 924 - 925 - static bool get_disk_and_module(struct gendisk *disk) 926 - { 927 - struct module *owner; 928 - 929 - if (!disk->fops) 930 - return false; 931 - owner = disk->fops->owner; 932 - if (owner && !try_module_get(owner)) 933 - return false; 934 - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { 935 - module_put(owner); 936 - return false; 937 - } 938 - return true; 939 - 940 - } 941 - 942 - /** 943 - * get_gendisk - get partitioning information for a given device 944 - * @devt: device to get partitioning information for 945 - * @partno: returned partition index 946 - * 947 - * This function gets the structure containing partitioning 948 - * information for the given device @devt. 
949 - * 950 - * Context: can sleep 951 - */ 952 - struct gendisk *get_gendisk(dev_t devt, int *partno) 953 - { 954 - struct gendisk *disk = NULL; 955 - 956 - might_sleep(); 957 - 958 - if (MAJOR(devt) != BLOCK_EXT_MAJOR) { 959 - mutex_lock(&bdev_map_lock); 960 - disk = xa_load(&bdev_map, devt); 961 - if (!disk) { 962 - mutex_unlock(&bdev_map_lock); 963 - request_gendisk_module(devt); 964 - mutex_lock(&bdev_map_lock); 965 - disk = xa_load(&bdev_map, devt); 966 - } 967 - if (disk && !get_disk_and_module(disk)) 968 - disk = NULL; 969 - if (disk) 970 - *partno = devt - disk_devt(disk); 971 - mutex_unlock(&bdev_map_lock); 972 - } else { 973 - struct hd_struct *part; 974 - 975 - spin_lock_bh(&ext_devt_lock); 976 - part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); 977 - if (part && get_disk_and_module(part_to_disk(part))) { 978 - *partno = part->partno; 979 - disk = part_to_disk(part); 980 - } 981 - spin_unlock_bh(&ext_devt_lock); 982 - } 983 - 984 - if (!disk) 985 - return NULL; 986 - 987 - /* 988 - * Synchronize with del_gendisk() to not return disk that is being 989 - * destroyed. 990 - */ 991 - down_read(&disk->lookup_sem); 992 - if (unlikely((disk->flags & GENHD_FL_HIDDEN) || 993 - !(disk->flags & GENHD_FL_UP))) { 994 - up_read(&disk->lookup_sem); 995 - put_disk_and_module(disk); 996 - disk = NULL; 997 - } else { 998 - up_read(&disk->lookup_sem); 999 - } 1000 - return disk; 1001 985 } 1002 986 1003 987 /** ··· 1419 1559 * 1420 1560 * This function releases all allocated resources of the gendisk. 1421 1561 * 1422 - * The struct gendisk refcount is incremented with get_gendisk() or 1423 - * get_disk_and_module(), and its refcount is decremented with 1424 - * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this 1425 - * function is called. 1426 - * 1427 1562 * Drivers which used __device_add_disk() have a gendisk with a request_queue 1428 1563 * assigned. 
Since the request_queue sits on top of the gendisk for these 1429 1564 * drivers we also call blk_put_queue() for them, and we expect the ··· 1603 1748 if (!disk) 1604 1749 return NULL; 1605 1750 1606 - disk->part0.dkstats = alloc_percpu(struct disk_stats); 1607 - if (!disk->part0.dkstats) 1751 + disk->part0.bdev = bdev_alloc(disk, 0); 1752 + if (!disk->part0.bdev) 1608 1753 goto out_free_disk; 1609 1754 1610 - init_rwsem(&disk->lookup_sem); 1755 + disk->part0.dkstats = alloc_percpu(struct disk_stats); 1756 + if (!disk->part0.dkstats) 1757 + goto out_bdput; 1758 + 1611 1759 disk->node_id = node_id; 1612 - if (disk_expand_part_tbl(disk, 0)) { 1613 - free_percpu(disk->part0.dkstats); 1614 - goto out_free_disk; 1615 - } 1760 + if (disk_expand_part_tbl(disk, 0)) 1761 + goto out_free_bdstats; 1616 1762 1617 1763 ptbl = rcu_dereference_protected(disk->part_tbl, 1); 1618 1764 rcu_assign_pointer(ptbl->part[0], &disk->part0); ··· 1629 1773 */ 1630 1774 hd_sects_seq_init(&disk->part0); 1631 1775 if (hd_ref_init(&disk->part0)) 1632 - goto out_free_part0; 1776 + goto out_free_bdstats; 1633 1777 1634 1778 disk->minors = minors; 1635 1779 rand_initialize_disk(disk); ··· 1638 1782 device_initialize(disk_to_dev(disk)); 1639 1783 return disk; 1640 1784 1641 - out_free_part0: 1642 - hd_free_part(&disk->part0); 1785 + out_free_bdstats: 1786 + free_percpu(disk->part0.dkstats); 1787 + out_bdput: 1788 + bdput(disk->part0.bdev); 1643 1789 out_free_disk: 1644 1790 kfree(disk); 1645 1791 return NULL; ··· 1664 1806 put_device(disk_to_dev(disk)); 1665 1807 } 1666 1808 EXPORT_SYMBOL(put_disk); 1667 - 1668 - /** 1669 - * put_disk_and_module - decrements the module and gendisk refcount 1670 - * @disk: the struct gendisk to decrement the refcount for 1671 - * 1672 - * This is a counterpart of get_disk_and_module() and thus also of 1673 - * get_gendisk(). 1674 - * 1675 - * Context: Any context, but the last reference must not be dropped from 1676 - * atomic context. 
1677 - */ 1678 - void put_disk_and_module(struct gendisk *disk) 1679 - { 1680 - if (disk) { 1681 - struct module *owner = disk->fops->owner; 1682 - 1683 - put_disk(disk); 1684 - module_put(owner); 1685 - } 1686 - } 1687 1809 1688 1810 static void set_disk_ro_uevent(struct gendisk *gd, int ro) 1689 1811 {
+17 -12
block/partitions/core.c
··· 340 340 device_del(part_to_dev(part)); 341 341 342 342 /* 343 - * Remove gendisk pointer from idr so that it cannot be looked up 344 - * while RCU period before freeing gendisk is running to prevent 345 - * use-after-free issues. Note that the device number stays 346 - * "in-use" until we really free the gendisk. 343 + * Remove the block device from the inode hash, so that it cannot be 344 + * looked up any more even when openers still hold references. 347 345 */ 348 - blk_invalidate_devt(part_devt(part)); 346 + remove_inode_hash(part->bdev->bd_inode); 347 + 349 348 percpu_ref_kill(&part->ref); 350 349 } 351 350 ··· 367 368 dev_t devt = MKDEV(0, 0); 368 369 struct device *ddev = disk_to_dev(disk); 369 370 struct device *pdev; 371 + struct block_device *bdev; 370 372 struct disk_part_tbl *ptbl; 371 373 const char *dname; 372 374 int err; ··· 402 402 if (!p) 403 403 return ERR_PTR(-EBUSY); 404 404 405 + err = -ENOMEM; 405 406 p->dkstats = alloc_percpu(struct disk_stats); 406 - if (!p->dkstats) { 407 - err = -ENOMEM; 407 + if (!p->dkstats) 408 408 goto out_free; 409 - } 409 + 410 + bdev = bdev_alloc(disk, partno); 411 + if (!bdev) 412 + goto out_free_stats; 413 + p->bdev = bdev; 410 414 411 415 hd_sects_seq_init(p); 412 416 pdev = part_to_dev(p); ··· 424 420 struct partition_meta_info *pinfo; 425 421 426 422 pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); 427 - if (!pinfo) { 428 - err = -ENOMEM; 429 - goto out_free_stats; 430 - } 423 + if (!pinfo) 424 + goto out_bdput; 431 425 memcpy(pinfo, info, sizeof(*info)); 432 426 p->info = pinfo; 433 427 } ··· 472 470 } 473 471 474 472 /* everything is up and running, commence */ 473 + bdev_add(bdev, devt); 475 474 rcu_assign_pointer(ptbl->part[partno], p); 476 475 477 476 /* suppress uevent if the disk suppresses it */ ··· 482 479 483 480 out_free_info: 484 481 kfree(p->info); 482 + out_bdput: 483 + bdput(bdev); 485 484 out_free_stats: 486 485 free_percpu(p->dkstats); 487 486 out_free:
+101 -76
fs/block_dev.c
··· 863 863 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 864 864 } 865 865 866 - static struct block_device *bdget(dev_t dev) 866 + struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) 867 867 { 868 868 struct block_device *bdev; 869 869 struct inode *inode; 870 870 871 - inode = iget_locked(blockdev_superblock, dev); 871 + inode = new_inode(blockdev_superblock); 872 872 if (!inode) 873 873 return NULL; 874 + inode->i_mode = S_IFBLK; 875 + inode->i_rdev = 0; 876 + inode->i_data.a_ops = &def_blk_aops; 877 + mapping_set_gfp_mask(&inode->i_data, GFP_USER); 874 878 875 - bdev = &BDEV_I(inode)->bdev; 876 - 877 - if (inode->i_state & I_NEW) { 878 - spin_lock_init(&bdev->bd_size_lock); 879 - bdev->bd_contains = NULL; 880 - bdev->bd_super = NULL; 881 - bdev->bd_inode = inode; 882 - bdev->bd_part_count = 0; 883 - bdev->bd_dev = dev; 884 - inode->i_mode = S_IFBLK; 885 - inode->i_rdev = dev; 886 - inode->i_data.a_ops = &def_blk_aops; 887 - mapping_set_gfp_mask(&inode->i_data, GFP_USER); 888 - unlock_new_inode(inode); 889 - } 879 + bdev = I_BDEV(inode); 880 + spin_lock_init(&bdev->bd_size_lock); 881 + bdev->bd_disk = disk; 882 + bdev->bd_partno = partno; 883 + bdev->bd_contains = NULL; 884 + bdev->bd_super = NULL; 885 + bdev->bd_inode = inode; 886 + bdev->bd_part_count = 0; 890 887 return bdev; 888 + } 889 + 890 + void bdev_add(struct block_device *bdev, dev_t dev) 891 + { 892 + bdev->bd_dev = dev; 893 + bdev->bd_inode->i_rdev = dev; 894 + bdev->bd_inode->i_ino = dev; 895 + insert_inode_hash(bdev->bd_inode); 896 + } 897 + 898 + static struct block_device *bdget(dev_t dev) 899 + { 900 + struct inode *inode; 901 + 902 + inode = ilookup(blockdev_superblock, dev); 903 + if (!inode) 904 + return NULL; 905 + return &BDEV_I(inode)->bdev; 891 906 } 892 907 893 908 /** ··· 1018 1003 return 0; 1019 1004 } 1020 1005 EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ 1021 - 1022 - static struct gendisk *bdev_get_gendisk(struct block_device 
*bdev, int *partno) 1023 - { 1024 - struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); 1025 - 1026 - if (!disk) 1027 - return NULL; 1028 - /* 1029 - * Now that we hold gendisk reference we make sure bdev we looked up is 1030 - * not stale. If it is, it means device got removed and created before 1031 - * we looked up gendisk and we fail open in such case. Associating 1032 - * unhashed bdev with newly created gendisk could lead to two bdevs 1033 - * (and thus two independent caches) being associated with one device 1034 - * which is bad. 1035 - */ 1036 - if (inode_unhashed(bdev->bd_inode)) { 1037 - put_disk_and_module(disk); 1038 - return NULL; 1039 - } 1040 - return disk; 1041 - } 1042 1006 1043 1007 static void bd_clear_claiming(struct block_device *whole, void *holder) 1044 1008 { ··· 1341 1347 * mutex_lock(part->bd_mutex) 1342 1348 * mutex_lock_nested(whole->bd_mutex, 1) 1343 1349 */ 1344 - static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, 1345 - int partno, fmode_t mode) 1350 + static int __blkdev_get(struct block_device *bdev, fmode_t mode) 1346 1351 { 1352 + struct gendisk *disk = bdev->bd_disk; 1347 1353 int ret; 1348 1354 1349 1355 if (!bdev->bd_openers) { 1350 - bdev->bd_disk = disk; 1351 1356 bdev->bd_contains = bdev; 1352 - bdev->bd_partno = partno; 1353 1357 1354 - if (!partno) { 1358 + if (!bdev->bd_partno) { 1355 1359 ret = -ENXIO; 1356 - bdev->bd_part = disk_get_part(disk, partno); 1360 + bdev->bd_part = disk_get_part(disk, 0); 1357 1361 if (!bdev->bd_part) 1358 1362 goto out_clear; 1359 1363 ··· 1380 1388 struct block_device *whole = bdget_disk(disk, 0); 1381 1389 1382 1390 mutex_lock_nested(&whole->bd_mutex, 1); 1383 - ret = __blkdev_get(whole, disk, 0, mode); 1391 + ret = __blkdev_get(whole, mode); 1384 1392 if (ret) { 1385 1393 mutex_unlock(&whole->bd_mutex); 1386 1394 bdput(whole); ··· 1390 1398 mutex_unlock(&whole->bd_mutex); 1391 1399 1392 1400 bdev->bd_contains = whole; 1393 - bdev->bd_part = disk_get_part(disk, 
partno); 1401 + bdev->bd_part = disk_get_part(disk, bdev->bd_partno); 1394 1402 if (!(disk->flags & GENHD_FL_UP) || 1395 1403 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1396 1404 __blkdev_put(whole, mode, 1); ··· 1422 1430 1423 1431 out_clear: 1424 1432 disk_put_part(bdev->bd_part); 1425 - bdev->bd_disk = NULL; 1426 1433 bdev->bd_part = NULL; 1427 1434 bdev->bd_contains = NULL; 1428 1435 return ret; 1436 + } 1437 + 1438 + struct block_device *blkdev_get_no_open(dev_t dev) 1439 + { 1440 + struct block_device *bdev; 1441 + struct gendisk *disk; 1442 + 1443 + down_read(&bdev_lookup_sem); 1444 + bdev = bdget(dev); 1445 + if (!bdev) { 1446 + up_read(&bdev_lookup_sem); 1447 + blk_request_module(dev); 1448 + down_read(&bdev_lookup_sem); 1449 + 1450 + bdev = bdget(dev); 1451 + if (!bdev) 1452 + goto unlock; 1453 + } 1454 + 1455 + disk = bdev->bd_disk; 1456 + if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) 1457 + goto bdput; 1458 + if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) 1459 + goto put_disk; 1460 + if (!try_module_get(bdev->bd_disk->fops->owner)) 1461 + goto put_disk; 1462 + up_read(&bdev_lookup_sem); 1463 + return bdev; 1464 + put_disk: 1465 + put_disk(disk); 1466 + bdput: 1467 + bdput(bdev); 1468 + unlock: 1469 + up_read(&bdev_lookup_sem); 1470 + return NULL; 1471 + } 1472 + 1473 + void blkdev_put_no_open(struct block_device *bdev) 1474 + { 1475 + module_put(bdev->bd_disk->fops->owner); 1476 + put_disk(bdev->bd_disk); 1477 + bdput(bdev); 1429 1478 } 1430 1479 1431 1480 /** ··· 1496 1463 bool unblock_events = true; 1497 1464 struct block_device *bdev; 1498 1465 struct gendisk *disk; 1499 - int partno; 1500 1466 int ret; 1501 1467 1502 1468 ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, ··· 1505 1473 if (ret) 1506 1474 return ERR_PTR(ret); 1507 1475 1508 - bdev = bdget(dev); 1509 - if (!bdev) 1510 - return ERR_PTR(-ENOMEM); 1511 - 1512 1476 /* 1513 1477 * If we lost a race with 'disk' being deleted, try again. See md.c. 
1514 1478 */ 1515 1479 retry: 1516 - ret = -ENXIO; 1517 - disk = bdev_get_gendisk(bdev, &partno); 1518 - if (!disk) 1519 - goto bdput; 1480 + bdev = blkdev_get_no_open(dev); 1481 + if (!bdev) 1482 + return ERR_PTR(-ENXIO); 1483 + disk = bdev->bd_disk; 1520 1484 1521 1485 if (mode & FMODE_EXCL) { 1522 1486 WARN_ON_ONCE(!holder); ··· 1520 1492 ret = -ENOMEM; 1521 1493 claiming = bdget_disk(disk, 0); 1522 1494 if (!claiming) 1523 - goto put_disk; 1495 + goto put_blkdev; 1524 1496 ret = bd_prepare_to_claim(bdev, claiming, holder); 1525 1497 if (ret) 1526 1498 goto put_claiming; ··· 1529 1501 disk_block_events(disk); 1530 1502 1531 1503 mutex_lock(&bdev->bd_mutex); 1532 - ret =__blkdev_get(bdev, disk, partno, mode); 1533 - if (!(mode & FMODE_EXCL)) { 1534 - ; /* nothing to do here */ 1535 - } else if (ret) { 1536 - bd_abort_claiming(bdev, claiming, holder); 1537 - } else { 1504 + ret =__blkdev_get(bdev, mode); 1505 + if (ret) 1506 + goto abort_claiming; 1507 + if (mode & FMODE_EXCL) { 1538 1508 bd_finish_claiming(bdev, claiming, holder); 1539 1509 1540 1510 /* ··· 1552 1526 1553 1527 if (unblock_events) 1554 1528 disk_unblock_events(disk); 1529 + if (mode & FMODE_EXCL) 1530 + bdput(claiming); 1531 + return bdev; 1555 1532 1533 + abort_claiming: 1534 + if (mode & FMODE_EXCL) 1535 + bd_abort_claiming(bdev, claiming, holder); 1536 + mutex_unlock(&bdev->bd_mutex); 1537 + disk_unblock_events(disk); 1556 1538 put_claiming: 1557 1539 if (mode & FMODE_EXCL) 1558 1540 bdput(claiming); 1559 - put_disk: 1560 - if (ret) 1561 - put_disk_and_module(disk); 1541 + put_blkdev: 1542 + blkdev_put_no_open(bdev); 1562 1543 if (ret == -ERESTARTSYS) 1563 1544 goto retry; 1564 - bdput: 1565 - if (ret) { 1566 - bdput(bdev); 1567 - return ERR_PTR(ret); 1568 - } 1569 - return bdev; 1545 + return ERR_PTR(ret); 1570 1546 } 1571 1547 EXPORT_SYMBOL(blkdev_get_by_dev); 1572 1548 ··· 1669 1641 1670 1642 disk_put_part(bdev->bd_part); 1671 1643 bdev->bd_part = NULL; 1672 - bdev->bd_disk = NULL; 1673 1644 
if (bdev_is_partition(bdev)) 1674 1645 victim = bdev->bd_contains; 1675 1646 bdev->bd_contains = NULL; ··· 1726 1699 * from userland - e.g. eject(1). 1727 1700 */ 1728 1701 disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); 1729 - 1730 1702 mutex_unlock(&bdev->bd_mutex); 1731 1703 1732 1704 __blkdev_put(bdev, mode, 0); 1733 - bdput(bdev); 1734 - put_disk_and_module(disk); 1705 + blkdev_put_no_open(bdev); 1735 1706 } 1736 1707 EXPORT_SYMBOL(blkdev_put); 1737 1708
+2 -2
include/linux/blk-cgroup.h
··· 197 197 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); 198 198 199 199 struct blkg_conf_ctx { 200 - struct gendisk *disk; 200 + struct block_device *bdev; 201 201 struct blkcg_gq *blkg; 202 202 char *body; 203 203 }; 204 204 205 - struct gendisk *blkcg_conf_get_disk(char **inputp); 205 + struct block_device *blkcg_conf_open_bdev(char **inputp); 206 206 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, 207 207 char *input, struct blkg_conf_ctx *ctx); 208 208 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+6
include/linux/blkdev.h
··· 1994 1994 void *holder); 1995 1995 void blkdev_put(struct block_device *bdev, fmode_t mode); 1996 1996 1997 + /* just for blk-cgroup, don't use elsewhere */ 1998 + struct block_device *blkdev_get_no_open(dev_t dev); 1999 + void blkdev_put_no_open(struct block_device *bdev); 2000 + 2001 + struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); 2002 + void bdev_add(struct block_device *bdev, dev_t dev); 1997 2003 struct block_device *I_BDEV(struct inode *inode); 1998 2004 struct block_device *bdget_part(struct hd_struct *part); 1999 2005 struct block_device *bdgrab(struct block_device *bdev);
+4 -3
include/linux/genhd.h
··· 65 65 struct disk_stats __percpu *dkstats; 66 66 struct percpu_ref ref; 67 67 68 + struct block_device *bdev; 68 69 struct device __dev; 69 70 struct kobject *holder_dir; 70 71 int policy, partno; ··· 194 193 int flags; 195 194 unsigned long state; 196 195 #define GD_NEED_PART_SCAN 0 197 - struct rw_semaphore lookup_sem; 198 196 struct kobject *slave_dir; 199 197 200 198 struct timer_rand_state *random; ··· 300 300 } 301 301 302 302 extern void del_gendisk(struct gendisk *gp); 303 - extern struct gendisk *get_gendisk(dev_t dev, int *partno); 304 303 extern struct block_device *bdget_disk(struct gendisk *disk, int partno); 305 304 306 305 extern void set_disk_ro(struct gendisk *disk, int flag); ··· 337 338 338 339 extern struct gendisk *__alloc_disk_node(int minors, int node_id); 339 340 extern void put_disk(struct gendisk *disk); 340 - extern void put_disk_and_module(struct gendisk *disk); 341 341 342 342 #define alloc_disk_node(minors, node_id) \ 343 343 ({ \ ··· 386 388 } 387 389 #endif /* CONFIG_SYSFS */ 388 390 391 + extern struct rw_semaphore bdev_lookup_sem; 392 + 389 393 dev_t blk_lookup_devt(const char *name, int partno); 394 + void blk_request_module(dev_t devt); 390 395 #ifdef CONFIG_BLOCK 391 396 void printk_all_partitions(void); 392 397 #else /* CONFIG_BLOCK */