fs/block_dev.c (Linux v4.9)
/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <linux/dax.h>
#include <linux/badblocks.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static const struct address_space_operations def_blk_aops;

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
	va_end(args);
}

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret) {
			char name[BDEVNAME_SIZE];
			pr_warn_ratelimited("VFS: Dirty inode writeback failed "
					    "for block device %s (err=%d).\n",
					    bdevname(bdev, name), ret);
		}
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not.. */
void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(kill_bdev);

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	lru_add_drain_all();	/* make sure all lru add caches are flushed */
	invalidate_mapping_pages(mapping, 0, -1);
	/* 99% of the time, we don't need to flush the cleancache on the bdev.
	 * But, for the strange corners, let's be cautious
	 */
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is a power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
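
/*
 * Illustrative usage sketch, not part of the original file: a filesystem's
 * fill_super callback typically picks its block size with sb_min_blocksize()
 * before reading any metadata. The function name and the 1024-byte
 * preference below are assumptions made up for this example.
 *
 *	static int example_fill_super(struct super_block *sb, void *data, int silent)
 *	{
 *		// Ask for 1024 bytes, but never less than the device's logical block size.
 *		if (!sb_min_blocksize(sb, 1024))
 *			return -EINVAL;
 *		// sb->s_blocksize and sb->s_blocksize_bits are now valid for sb_bread().
 *		return 0;
 *	}
 */
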
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static struct inode *bdev_file_inode(struct file *file)
{
	return file->f_mapping->host;
}

static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = bdev_file_inode(file);

	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
				    blkdev_get_block, NULL, NULL,
				    DIO_SKIP_DIO_COUNT);
}

int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously.  It counts up in freeze_bdev() and
 * counts down in thaw_bdev().  When it becomes 0, thaw_bdev() actually
 * unfreezes the filesystem.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1) {
		/*
		 * We don't even need to grab a reference - the first call
		 * to freeze_bdev grabs an active reference and only the last
		 * thaw_bdev drops it.
		 */
		sb = get_super(bdev);
		if (sb)
			drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return sb;
	}

	sb = get_active_super(bdev);
	if (!sb)
		goto out;
	if (sb->s_op->freeze_super)
		error = sb->s_op->freeze_super(sb);
	else
		error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return ERR_PTR(error);
	}
	deactivate_super(sb);
 out:
	sync_blockdev(bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return sb;	/* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	if (!sb)
		goto out;

	if (sb->s_op->thaw_super)
		error = sb->s_op->thaw_super(sb);
	else
		error = thaw_super(sb);
	if (error)
		bdev->bd_fsfreeze_count++;
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(thaw_bdev);
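
/*
 * Illustrative usage sketch, not part of the original file: freeze_bdev() and
 * thaw_bdev() are meant to be used in pairs around work that needs the
 * filesystem quiesced, e.g. taking a device snapshot. The function name and
 * the snapshot step are placeholders for this example.
 *
 *	static int example_snapshot(struct block_device *bdev)
 *	{
 *		struct super_block *sb = freeze_bdev(bdev);
 *
 *		if (IS_ERR(sb))
 *			return PTR_ERR(sb);	// freezing the filesystem failed
 *
 *		// ... take the snapshot while writes are held off ...
 *
 *		// sb may be NULL if nothing was mounted; thaw_bdev() handles that.
 *		return thaw_bdev(bdev, sb);
 *	}
 */
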
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_readpages(struct file *file, struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
	put_page(page);

	return ret;
}

/*
 * private llseek:
 * for a block special file file_inode(file)->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t retval;

	inode_lock(bd_inode);
	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
	inode_unlock(bd_inode);
	return retval;
}

int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *bd_inode = bdev_file_inode(filp);
	struct block_device *bdev = I_BDEV(bd_inode);
	int error;

	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
	if (error)
		return error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	if (error == -EOPNOTSUPP)
		error = 0;

	return error;
}
EXPORT_SYMBOL(blkdev_fsync);

/**
 * bdev_read_page() - Start reading a page from a block device
 * @bdev: The device to read the page from
 * @sector: The offset on the device to read the page to (need not be aligned)
 * @page: The page to read
 *
 * On entry, the page should be locked.  It will be unlocked when the page
 * has been read.  If the block driver implements rw_page synchronously,
 * that will be true on exit from this function, but it need not be.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to read this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_read_page(struct block_device *bdev, sector_t sector,
			struct page *page)
{
	const struct block_device_operations *ops = bdev->bd_disk->fops;
	int result = -EOPNOTSUPP;

	if (!ops->rw_page || bdev_get_integrity(bdev))
		return result;

	result = blk_queue_enter(bdev->bd_queue, false);
	if (result)
		return result;
	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
	blk_queue_exit(bdev->bd_queue);
	return result;
}
EXPORT_SYMBOL_GPL(bdev_read_page);

/**
 * bdev_write_page() - Start writing a page to a block device
 * @bdev: The device to write the page to
 * @sector: The offset on the device to write the page to (need not be aligned)
 * @page: The page to write
 * @wbc: The writeback_control for the write
 *
 * On entry, the page should be locked and not currently under writeback.
 * On exit, if the write started successfully, the page will be unlocked and
 * under writeback.  If the write failed already (eg the driver failed to
 * queue the page to the device), the page will still be locked.  If the
 * caller is a ->writepage implementation, it will need to unlock the page.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to write this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_write_page(struct block_device *bdev, sector_t sector,
			struct page *page, struct writeback_control *wbc)
{
	int result;
	const struct block_device_operations *ops = bdev->bd_disk->fops;

	if (!ops->rw_page || bdev_get_integrity(bdev))
		return -EOPNOTSUPP;
	result = blk_queue_enter(bdev->bd_queue, false);
	if (result)
		return result;

	set_page_writeback(page);
	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
	if (result)
		end_page_writeback(page);
	else
		unlock_page(page);
	blk_queue_exit(bdev->bd_queue);
	return result;
}
EXPORT_SYMBOL_GPL(bdev_write_page);

/**
 * bdev_direct_access() - Get the address for directly-accessible memory
 * @bdev: The device containing the memory
 * @dax: control and output parameters for ->direct_access
 *
 * If a block device is made up of directly addressable memory, this function
 * will tell the caller the PFN and the address of the memory.  The address
 * may be directly dereferenced within the kernel without the need to call
 * ioremap(), kmap() or similar.  The PFN is suitable for inserting into
 * page tables.
 *
 * Return: negative errno if an error occurs, otherwise the number of bytes
 * accessible at this address.
 */
long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	sector_t sector = dax->sector;
	long avail, size = dax->size;
	const struct block_device_operations *ops = bdev->bd_disk->fops;

	/*
	 * The device driver is allowed to sleep, in order to make the
	 * memory directly accessible.
	 */
	might_sleep();

	if (size < 0)
		return size;
	if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
		return -EOPNOTSUPP;
	if ((sector + DIV_ROUND_UP(size, 512)) >
					part_nr_sects_read(bdev->bd_part))
		return -ERANGE;
	sector += get_start_sect(bdev);
	if (sector % (PAGE_SIZE / 512))
		return -EINVAL;
	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
	if (!avail)
		return -ERANGE;
	if (avail > 0 && avail & ~PAGE_MASK)
		return -ENXIO;
	return min(avail, size);
}
EXPORT_SYMBOL_GPL(bdev_direct_access);

/**
 * bdev_dax_supported() - Check if the device supports dax for filesystem
 * @sb: The superblock of the device
 * @blocksize: The block size of the device
 *
 * This is a library function for filesystems to check if the block device
 * can be mounted with dax option.
 *
 * Return: negative errno if unsupported, 0 if supported.
 */
int bdev_dax_supported(struct super_block *sb, int blocksize)
{
	struct blk_dax_ctl dax = {
		.sector = 0,
		.size = PAGE_SIZE,
	};
	int err;

	if (blocksize != PAGE_SIZE) {
		vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
		return -EINVAL;
	}

	err = bdev_direct_access(sb->s_bdev, &dax);
	if (err < 0) {
		switch (err) {
		case -EOPNOTSUPP:
			vfs_msg(sb, KERN_ERR,
				"error: device does not support dax");
			break;
		case -EINVAL:
			vfs_msg(sb, KERN_ERR,
				"error: unaligned partition for dax");
			break;
		default:
			vfs_msg(sb, KERN_ERR,
				"error: dax access failed (%d)", err);
		}
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(bdev_dax_supported);

/**
 * bdev_dax_capable() - Return if the raw device is capable for dax
 * @bdev: The device for raw block device access
 */
bool bdev_dax_capable(struct block_device *bdev)
{
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
	};

	if (!IS_ENABLED(CONFIG_FS_DAX))
		return false;

	dax.sector = 0;
	if (bdev_direct_access(bdev, &dax) < 0)
		return false;

	dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
	if (bdev_direct_access(bdev, &dax) < 0)
		return false;

	return true;
}

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	struct bdev_inode *bdi = BDEV_I(inode);

	kmem_cache_free(bdev_cachep, bdi);
}

static void bdev_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, bdev_i_callback);
}

static void init_once(void *foo)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	memset(bdev, 0, sizeof(*bdev));
	mutex_init(&bdev->bd_mutex);
	INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
	INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
	inode_init_once(&ei->vfs_inode);
	/* Initialize mutex for freeze. */
	mutex_init(&bdev->bd_fsfreeze_mutex);
}

static void bdev_evict_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
	spin_lock(&bdev_lock);
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static struct dentry *bd_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	struct dentry *dent;
	dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
	if (!IS_ERR(dent))
		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
	return dent;
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.mount		= bd_mount,
	.kill_sb	= kill_anon_super,
};

struct super_block *blockdev_superblock __read_mostly;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;
	static struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody will need really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(blockdev_superblock, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_super = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);

/**
 * bdgrab -- Grab a reference to an already referenced block device
 * @bdev:	Block device to grab a reference to.
 */
struct block_device *bdgrab(struct block_device *bdev)
{
	ihold(bdev->bd_inode);
	return bdev;
}
EXPORT_SYMBOL(bdgrab);

long nr_blockdev_pages(void)
{
	struct block_device *bdev;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each_entry(bdev, &all_bdevs, bd_list) {
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);
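
/*
 * Illustrative usage sketch, not part of the original file: bdget(), bdgrab()
 * and bdput() only manage references on the bdev inode; none of them opens
 * the device. The function name and device-number handling below are
 * assumptions for this example.
 *
 *	static void example_bdev_refs(dev_t dev)
 *	{
 *		struct block_device *bdev = bdget(dev);	// NULL on allocation failure
 *
 *		if (!bdev)
 *			return;
 *		bdgrab(bdev);	// take a second reference for another user
 *		bdput(bdev);	// ...which that user later drops
 *		bdput(bdev);	// drop the original bdget() reference
 *	}
 */
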
static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		bdgrab(bdev);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional reference to bd_inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			bdgrab(bdev);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

/* Call when you free inode */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (!sb_is_blkdev_sb(inode->i_sb))
		bdev = inode->i_bdev;
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
	spin_unlock(&bdev_lock);

	if (bdev)
		bdput(bdev);
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
{
	if (bdev->bd_holder == holder)
		return true;	/* already a holder */
	else if (bdev->bd_holder != NULL)
		return false;	/* held by someone else */
	else if (bdev->bd_contains == bdev)
		return true;	/* is a whole device which isn't held */

	else if (whole->bd_holder == bd_may_claim)
		return true;	/* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	/* is a partition of a held device */
	else
		return true;	/* is a partition of an un-held device */
}

/**
 * bd_prepare_to_claim - prepare to claim a block device
 * @bdev: block device of interest
 * @whole: the whole device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Prepare to claim @bdev.  This function fails if @bdev is already
 * claimed by another holder and waits if another claiming is in
 * progress.  This function doesn't actually claim.  On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
 * it multiple times.
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
static int bd_prepare_to_claim(struct block_device *bdev,
			       struct block_device *whole, void *holder)
{
retry:
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder))
		return -EBUSY;

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		spin_lock(&bdev_lock);
		goto retry;
	}

	/* yay, all mine */
	return 0;
}

/**
 * bd_start_claiming - start claiming a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * @bdev is about to be opened exclusively.  Check @bdev can be opened
 * exclusively and mark that an exclusive open is in progress.  Each
 * successful call to this function must be matched with a call to
 * either bd_finish_claiming() or bd_abort_claiming() (which do not
 * fail).
 *
 * This function is used to gain exclusive access to the block device
 * without actually causing other exclusive open attempts to fail.  It
 * should be used when the open sequence itself requires exclusive
 * access but may subsequently fail.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to the block device containing @bdev on success, ERR_PTR()
 * value on failure.
 */
static struct block_device *bd_start_claiming(struct block_device *bdev,
					      void *holder)
{
	struct gendisk *disk;
	struct block_device *whole;
	int partno, err;

	might_sleep();

	/*
	 * @bdev might not have been initialized properly yet, look up
	 * and grab the outer block device the hard way.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		return ERR_PTR(-ENXIO);

	/*
	 * Normally, @bdev should equal what's returned from bdget_disk()
	 * if partno is 0; however, some drivers (floppy) use multiple
	 * bdev's for the same physical device and @bdev may be one of the
	 * aliases.  Keep @bdev if partno is 0.  This means claimer
	 * tracking is broken for those devices but it has always been that
	 * way.
	 */
	if (partno)
		whole = bdget_disk(disk, 0);
	else
		whole = bdgrab(bdev);

	module_put(disk->fops->owner);
	put_disk(disk);
	if (!whole)
		return ERR_PTR(-ENOMEM);

	/* prepare to claim, if successful, mark claiming in progress */
	spin_lock(&bdev_lock);

	err = bd_prepare_to_claim(bdev, whole, holder);
	if (err == 0) {
		whole->bd_claiming = holder;
		spin_unlock(&bdev_lock);
		return whole;
	} else {
		spin_unlock(&bdev_lock);
		bdput(whole);
		return ERR_PTR(err);
	}
}

#ifdef CONFIG_SYSFS
struct bd_holder_disk {
	struct list_head	list;
	struct gendisk		*disk;
	int			refcnt;
};

static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
						  struct gendisk *disk)
{
	struct bd_holder_disk *holder;

	list_for_each_entry(holder, &bdev->bd_holder_disks, list)
		if (holder->disk == disk)
			return holder;
	return NULL;
}

static int add_symlink(struct kobject *from, struct kobject *to)
{
	return sysfs_create_link(from, to, kobject_name(to));
}

static void del_symlink(struct kobject *from, struct kobject *to)
{
	sysfs_remove_link(from, kobject_name(to));
}

/**
 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
 * @bdev: the claimed slave bdev
 * @disk: the holding disk
 *
 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
 * This function creates the following sysfs symlinks.
 *
 * - from "slaves" directory of the holder @disk to the claimed @bdev
 * - from "holders" directory of the @bdev to the holder @disk
 *
 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
 * passed to bd_link_disk_holder(), then:
 *
 *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
 *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 *
 * The caller must have claimed @bdev before calling this function and
 * ensure that both @bdev and @disk are valid during the creation and
 * lifetime of these symlinks.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
	struct bd_holder_disk *holder;
	int ret = 0;

	mutex_lock(&bdev->bd_mutex);

	WARN_ON_ONCE(!bdev->bd_holder);

	/* FIXME: remove the following once add_disk() handles errors */
	if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
		goto out_unlock;

	holder = bd_find_holder_disk(bdev, disk);
	if (holder) {
		holder->refcnt++;
		goto out_unlock;
	}

	holder = kzalloc(sizeof(*holder), GFP_KERNEL);
	if (!holder) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	INIT_LIST_HEAD(&holder->list);
	holder->disk = disk;
	holder->refcnt = 1;

	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	if (ret)
		goto out_free;

	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
	if (ret)
		goto out_del;
	/*
	 * bdev could be deleted beneath us which would implicitly destroy
	 * the holder directory.  Hold on to it.
	 */
	kobject_get(bdev->bd_part->holder_dir);

	list_add(&holder->list, &bdev->bd_holder_disks);
	goto out_unlock;

out_del:
	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
out_free:
	kfree(holder);
out_unlock:
	mutex_unlock(&bdev->bd_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(bd_link_disk_holder);

/**
 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
 * @bdev: the claimed slave bdev
 * @disk: the holding disk
 *
 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
 * CONTEXT:
 * Might sleep.
 */
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
	struct bd_holder_disk *holder;

	mutex_lock(&bdev->bd_mutex);

	holder = bd_find_holder_disk(bdev, disk);

	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
		del_symlink(bdev->bd_part->holder_dir,
			    &disk_to_dev(disk)->kobj);
		kobject_put(bdev->bd_part->holder_dir);
		list_del_init(&holder->list);
		kfree(holder);
	}

	mutex_unlock(&bdev->bd_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif

/**
 * flush_disk - invalidates all buffer-cache entries on a disk
 *
 * @bdev:      struct block device to be flushed
 * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Invalidates all buffer-cache entries on a disk. It should be called
 * when a disk has been changed -- either by a media change or online
 * resize.
 */
static void flush_disk(struct block_device *bdev, bool kill_dirty)
{
	if (__invalidate_device(bdev, kill_dirty)) {
		printk(KERN_WARNING "VFS: busy inodes on changed media or "
		       "resized disk %s\n",
		       bdev->bd_disk ? bdev->bd_disk->disk_name : "");
	}

	if (!bdev->bd_disk)
		return;
	if (disk_part_scan_enabled(bdev->bd_disk))
		bdev->bd_invalidated = 1;
}

/**
 * check_disk_size_change - checks for disk size change and adjusts bdev size.
 * @disk: struct gendisk to check
 * @bdev: struct bdev to adjust.
 *
 * This routine checks to see if the bdev size does not match the disk size
 * and adjusts it if it differs.
 */
void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
{
	loff_t disk_size, bdev_size;

	disk_size = (loff_t)get_capacity(disk) << 9;
	bdev_size = i_size_read(bdev->bd_inode);
	if (disk_size != bdev_size) {
		printk(KERN_INFO
		       "%s: detected capacity change from %lld to %lld\n",
		       disk->disk_name, bdev_size, disk_size);
		i_size_write(bdev->bd_inode, disk_size);
		flush_disk(bdev, false);
	}
}
EXPORT_SYMBOL(check_disk_size_change);

/**
 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
 * @disk: struct gendisk to be revalidated
 *
 * This routine is a wrapper for lower-level driver's revalidate_disk
 * call-backs.  It is used to do common pre and post operations needed
 * for all revalidate_disk operations.
 */
int revalidate_disk(struct gendisk *disk)
{
	struct block_device *bdev;
	int ret = 0;

	if (disk->fops->revalidate_disk)
		ret = disk->fops->revalidate_disk(disk);
	blk_integrity_revalidate(disk);
	bdev = bdget_disk(disk, 0);
	if (!bdev)
		return ret;

	mutex_lock(&bdev->bd_mutex);
	check_disk_size_change(disk, bdev);
	bdev->bd_invalidated = 0;
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	return ret;
}
EXPORT_SYMBOL(revalidate_disk);

/*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	const struct block_device_operations *bdops = disk->fops;
	unsigned int events;

	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
				   DISK_EVENT_EJECT_REQUEST);
	if (!(events & DISK_EVENT_MEDIA_CHANGE))
		return 0;

	flush_disk(bdev, true);
	if (bdops->revalidate_disk)
		bdops->revalidate_disk(bdev->bd_disk);
	return 1;
}

EXPORT_SYMBOL(check_disk_change);

void bd_set_size(struct block_device *bdev, loff_t size)
{
	unsigned bsize = bdev_logical_block_size(bdev);

	inode_lock(bdev->bd_inode);
	i_size_write(bdev->bd_inode, size);
	inode_unlock(bdev->bd_inode);
	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_block_size = bsize;
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);

static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);

/*
 * bd_mutex locking:
 *
 *  mutex_lock(part->bd_mutex)
 *    mutex_lock_nested(whole->bd_mutex, 1)
 */

static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
	struct gendisk *disk;
	struct module *owner;
	int ret;
	int partno;
	int perm = 0;

	if (mode & FMODE_READ)
		perm |= MAY_READ;
	if (mode & FMODE_WRITE)
		perm |= MAY_WRITE;
	/*
	 * hooks: /n/, see "layering violations".
	 */
	if (!for_part) {
		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
		if (ret != 0) {
			bdput(bdev);
			return ret;
		}
	}

 restart:

	ret = -ENXIO;
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		goto out;
	owner = disk->fops->owner;

	disk_block_events(disk);
	mutex_lock_nested(&bdev->bd_mutex, for_part);
	if (!bdev->bd_openers) {
		bdev->bd_disk = disk;
		bdev->bd_queue = disk->queue;
		bdev->bd_contains = bdev;

		if (!partno) {
			ret = -ENXIO;
			bdev->bd_part = disk_get_part(disk, partno);
			if (!bdev->bd_part)
				goto out_clear;

			ret = 0;
			if (disk->fops->open) {
				ret = disk->fops->open(bdev, mode);
				if (ret == -ERESTARTSYS) {
					/* Lost a race with 'disk' being
					 * deleted, try again.
					 * See md.c
					 */
					disk_put_part(bdev->bd_part);
					bdev->bd_part = NULL;
					bdev->bd_disk = NULL;
					bdev->bd_queue = NULL;
					mutex_unlock(&bdev->bd_mutex);
					disk_unblock_events(disk);
					put_disk(disk);
					module_put(owner);
					goto restart;
				}
			}

			if (!ret)
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);

			/*
			 * If the device is invalidated, rescan partition
			 * if open succeeded or failed with -ENOMEDIUM.
			 * The latter is necessary to prevent ghost
			 * partitions on a removed medium.
			 */
			if (bdev->bd_invalidated) {
				if (!ret)
					rescan_partitions(disk, bdev);
				else if (ret == -ENOMEDIUM)
					invalidate_partitions(disk, bdev);
			}

			if (ret)
				goto out_clear;
		} else {
			struct block_device *whole;
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
				goto out_clear;
			BUG_ON(for_part);
			ret = __blkdev_get(whole, mode, 1);
			if (ret)
				goto out_clear;
			bdev->bd_contains = whole;
			bdev->bd_part = disk_get_part(disk, partno);
			if (!(disk->flags & GENHD_FL_UP) ||
			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
				ret = -ENXIO;
				goto out_clear;
			}
			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
		}
	} else {
		if (bdev->bd_contains == bdev) {
			ret = 0;
			if (bdev->bd_disk->fops->open)
				ret = bdev->bd_disk->fops->open(bdev, mode);
			/* the same as first opener case, read comment there */
			if (bdev->bd_invalidated) {
				if (!ret)
					rescan_partitions(bdev->bd_disk, bdev);
				else if (ret == -ENOMEDIUM)
					invalidate_partitions(bdev->bd_disk, bdev);
			}
			if (ret)
				goto out_unlock_bdev;
		}
		/* only one opener holds refs to the module and disk */
		put_disk(disk);
		module_put(owner);
	}
	bdev->bd_openers++;
	if (for_part)
		bdev->bd_part_count++;
	mutex_unlock(&bdev->bd_mutex);
	disk_unblock_events(disk);
	return 0;

 out_clear:
	disk_put_part(bdev->bd_part);
	bdev->bd_disk = NULL;
	bdev->bd_part = NULL;
	bdev->bd_queue = NULL;
	if (bdev != bdev->bd_contains)
		__blkdev_put(bdev->bd_contains, mode, 1);
	bdev->bd_contains = NULL;
 out_unlock_bdev:
	mutex_unlock(&bdev->bd_mutex);
	disk_unblock_events(disk);
	put_disk(disk);
	module_put(owner);
 out:
	bdput(bdev);

	return ret;
}

/**
 * blkdev_get - open a block device
 * @bdev: block_device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
 * open with exclusive access.  Specifying %FMODE_EXCL with %NULL
 * @holder is invalid.  Exclusive opens may nest for the same @holder.
 *
 * On success, the reference count of @bdev is unchanged.  On failure,
 * @bdev is put.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
	struct block_device *whole = NULL;
	int res;

	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);

	if ((mode & FMODE_EXCL) && holder) {
		whole = bd_start_claiming(bdev, holder);
		if (IS_ERR(whole)) {
			bdput(bdev);
			return PTR_ERR(whole);
		}
	}

	res = __blkdev_get(bdev, mode, 0);

	if (whole) {
		struct gendisk *disk = whole->bd_disk;

		/* finish claiming */
		mutex_lock(&bdev->bd_mutex);
		spin_lock(&bdev_lock);

		if (!res) {
			BUG_ON(!bd_may_claim(bdev, whole, holder));
			/*
			 * Note that for a whole device bd_holders
			 * will be incremented twice, and bd_holder
			 * will be set to bd_may_claim before being
			 * set to holder
			 */
			whole->bd_holders++;
			whole->bd_holder = bd_may_claim;
			bdev->bd_holders++;
			bdev->bd_holder = holder;
		}

		/* tell others that we're done */
		BUG_ON(whole->bd_claiming != holder);
		whole->bd_claiming = NULL;
		wake_up_bit(&whole->bd_claiming, 0);

		spin_unlock(&bdev_lock);

		/*
		 * Block event polling for write claims if requested.  Any
		 * write holder makes the write_holder state stick until
		 * all are released.  This is good enough and tracking
		 * individual writeable reference is too fragile given the
		 * way @mode is used in blkdev_get/put().
		 */
		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
			bdev->bd_write_holder = true;
			disk_block_events(disk);
		}

		mutex_unlock(&bdev->bd_mutex);
		bdput(whole);
	}

	return res;
}
EXPORT_SYMBOL(blkdev_get);

/**
 * blkdev_get_by_path - open a block device by name
 * @path: path to the block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the blockdevice described by the device file at @path.  @mode
 * and @holder are identical to blkdev_get().
 *
 * On success, the returned block_device has reference count of one.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
					void *holder)
{
	struct block_device *bdev;
	int err;

	bdev = lookup_bdev(path);
	if (IS_ERR(bdev))
		return bdev;

	err = blkdev_get(bdev, mode, holder);
	if (err)
		return ERR_PTR(err);

	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
		blkdev_put(bdev, mode);
		return ERR_PTR(-EACCES);
	}

	return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_path);
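
/*
 * Illustrative usage sketch, not part of the original file: in-kernel users
 * such as md and dm typically open a component device exclusively by path
 * and pass the same mode and holder cookie to blkdev_put() when done. The
 * function name below is an assumption for this example.
 *
 *	static int example_use_device(const char *path, void *holder)
 *	{
 *		const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
 *		struct block_device *bdev;
 *
 *		bdev = blkdev_get_by_path(path, mode, holder);
 *		if (IS_ERR(bdev))
 *			return PTR_ERR(bdev);
 *
 *		// ... submit I/O to bdev ...
 *
 *		blkdev_put(bdev, mode);	// drops the exclusive claim and the reference
 *		return 0;
 *	}
 */
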
/**
 * blkdev_get_by_dev - open a block device by device number
 * @dev: device number of block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the blockdevice described by device number @dev.  @mode and
 * @holder are identical to blkdev_get().
 *
 * Use it ONLY if you really do not have anything better - i.e. when
 * you are behind a truly sucky interface and all you are given is a
 * device number.  _Never_ to be used for internal purposes.  If you
 * ever need it - reconsider your API.
 *
 * On success, the returned block_device has reference count of one.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{
	struct block_device *bdev;
	int err;

	bdev = bdget(dev);
	if (!bdev)
		return ERR_PTR(-ENOMEM);

	err = blkdev_get(bdev, mode, holder);
	if (err)
		return ERR_PTR(err);

	return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_dev);

static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bdev;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly.  Some mkfs
	 * binary needs it.  We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	bdev = bd_acquire(inode);
	if (bdev == NULL)
		return -ENOMEM;

	filp->f_mapping = bdev->bd_inode->i_mapping;

	return blkdev_get(bdev, filp->f_mode, filp);
}

static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
{
	struct gendisk *disk = bdev->bd_disk;
	struct block_device *victim = NULL;

	mutex_lock_nested(&bdev->bd_mutex, for_part);
	if (for_part)
		bdev->bd_part_count--;

	if (!--bdev->bd_openers) {
		WARN_ON_ONCE(bdev->bd_holders);
		sync_blockdev(bdev);
		kill_bdev(bdev);

		bdev_write_inode(bdev);
		/*
		 * Detaching bdev inode from its wb in __destroy_inode()
		 * is too late: the queue which embeds its bdi (along with
		 * root wb) can be gone as soon as we put_disk() below.
		 */
		inode_detach_wb(bdev->bd_inode);
	}
	if (bdev->bd_contains == bdev) {
		if (disk->fops->release)
			disk->fops->release(disk, mode);
	}
	if (!bdev->bd_openers) {
		struct module *owner = disk->fops->owner;

		disk_put_part(bdev->bd_part);
		bdev->bd_part = NULL;
		bdev->bd_disk = NULL;
		if (bdev != bdev->bd_contains)
			victim = bdev->bd_contains;
		bdev->bd_contains = NULL;

		put_disk(disk);
		module_put(owner);
	}
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	if (victim)
		__blkdev_put(victim, mode, 1);
}

void blkdev_put(struct block_device *bdev, fmode_t mode)
{
	mutex_lock(&bdev->bd_mutex);

	if (mode & FMODE_EXCL) {
		bool bdev_free;

		/*
		 * Release a claim on the device.  The holder fields
		 * are protected with bdev_lock.  bd_mutex is to
		 * synchronize disk_holder unlinking.
		 */
		spin_lock(&bdev_lock);

		WARN_ON_ONCE(--bdev->bd_holders < 0);
		WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);

		/* bd_contains might point to self, check in a separate step */
		if ((bdev_free = !bdev->bd_holders))
			bdev->bd_holder = NULL;
		if (!bdev->bd_contains->bd_holders)
			bdev->bd_contains->bd_holder = NULL;

		spin_unlock(&bdev_lock);

		/*
		 * If this was the last claim, remove holder link and
		 * unblock event polling if it was a write holder.
		 */
		if (bdev_free && bdev->bd_write_holder) {
			disk_unblock_events(bdev->bd_disk);
			bdev->bd_write_holder = false;
		}
	}

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event.  This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);

	mutex_unlock(&bdev->bd_mutex);

	__blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);

static int blkdev_close(struct inode * inode, struct file * filp)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
	blkdev_put(bdev, filp->f_mode);
	return 0;
}

static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	fmode_t mode = file->f_mode;

	/*
	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
	 * to update it before every ioctl.
	 */
	if (file->f_flags & O_NDELAY)
		mode |= FMODE_NDELAY;
	else
		mode &= ~FMODE_NDELAY;

	return blkdev_ioctl(bdev, mode, cmd, arg);
}

/*
 * Write data to the block device.  Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 */
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	struct blk_plug plug;
	ssize_t ret;

	if (bdev_read_only(I_BDEV(bd_inode)))
		return -EPERM;

	if (!iov_iter_count(from))
		return 0;

	if (iocb->ki_pos >= size)
		return -ENOSPC;

	iov_iter_truncate(from, size - iocb->ki_pos);

	blk_start_plug(&plug);
	ret = __generic_file_write_iter(iocb, from);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	blk_finish_plug(&plug);
	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_write_iter);

ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	loff_t pos = iocb->ki_pos;

	if (pos >= size)
		return 0;

	size -= pos;
	iov_iter_truncate(to, size);
	return generic_file_read_iter(iocb, to);
}
EXPORT_SYMBOL_GPL(blkdev_read_iter);

/*
 * Try to release a page associated with block device when the system
 * is under memory pressure.
 */
static int blkdev_releasepage(struct page *page, gfp_t wait)
{
	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;

	if (super && super->s_op->bdev_try_to_free_page)
		return super->s_op->bdev_try_to_free_page(super, page, wait);

	return try_to_free_buffers(page);
}

static int blkdev_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	if (dax_mapping(mapping)) {
		struct block_device *bdev = I_BDEV(mapping->host);

		return dax_writeback_mapping_range(mapping, bdev, wbc);
	}
	return generic_writepages(mapping, wbc);
}

static const struct address_space_operations def_blk_aops = {
	.readpage	= blkdev_readpage,
	.readpages	= blkdev_readpages,
	.writepage	= blkdev_writepage,
	.write_begin	= blkdev_write_begin,
	.write_end	= blkdev_write_end,
	.writepages	= blkdev_writepages,
	.releasepage	= blkdev_releasepage,
	.direct_IO	= blkdev_direct_IO,
	.is_dirty_writeback = buffer_check_dirty_writeback,
};

#define	BLKDEV_FALLOC_FL_SUPPORTED					\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)

static long blkdev_fallocate(struct file *file, int mode, loff_t start,
			     loff_t len)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	struct request_queue *q = bdev_get_queue(bdev);
	struct address_space *mapping;
	loff_t end = start + len - 1;
	loff_t isize;
	int error;

	/* Fail if we don't recognize the flags. */
	if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/* Don't go off the end of the device. */
	isize = i_size_read(bdev->bd_inode);
	if (start >= isize)
		return -EINVAL;
	if (end >= isize) {
		if (mode & FALLOC_FL_KEEP_SIZE) {
			len = isize - start;
			end = start + len - 1;
		} else
			return -EINVAL;
	}

	/*
	 * Don't allow IO that isn't aligned to logical block size.
	 */
	if ((start | len) & (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;

	/* Invalidate the page cache, including dirty pages. */
	mapping = bdev->bd_inode->i_mapping;
	truncate_inode_pages_range(mapping, start, end);

	switch (mode) {
	case FALLOC_FL_ZERO_RANGE:
	case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, false);
		break;
	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
		/* Only punch if the device can do zeroing discard. */
		if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
			return -EOPNOTSUPP;
		error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, 0);
		break;
	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
		if (!blk_queue_discard(q))
			return -EOPNOTSUPP;
		error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
					     GFP_KERNEL, 0);
		break;
	default:
		return -EOPNOTSUPP;
	}
	if (error)
		return error;

	/*
	 * Invalidate again; if someone wandered in and dirtied a page,
	 * the caller will be given -EBUSY.  The third argument is
	 * inclusive, so the rounding here is safe.
	 */
	return invalidate_inode_pages2_range(mapping,
					     start >> PAGE_SHIFT,
					     end >> PAGE_SHIFT);
}

const struct file_operations def_blk_fops = {
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= block_llseek,
	.read_iter	= blkdev_read_iter,
	.write_iter	= blkdev_write_iter,
	.mmap		= generic_file_mmap,
	.fsync		= blkdev_fsync,
	.unlocked_ioctl	= block_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= blkdev_fallocate,
};

int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
	int res;
	mm_segment_t old_fs = get_fs();
	set_fs(KERNEL_DS);
	res = blkdev_ioctl(bdev, 0, cmd, arg);
	set_fs(old_fs);
	return res;
}

EXPORT_SYMBOL(ioctl_by_bdev);

/**
 * lookup_bdev  - lookup a struct block_device by name
 * @pathname:	special file representing the block device
 *
 * Get a reference to the blockdevice at @pathname in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
struct block_device *lookup_bdev(const char *pathname)
{
	struct block_device *bdev;
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return ERR_PTR(-EINVAL);

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return ERR_PTR(error);

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto fail;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto fail;
	error = -ENOMEM;
	bdev = bd_acquire(inode);
	if (!bdev)
		goto fail;
out:
	path_put(&path);
	return bdev;
fail:
	bdev = ERR_PTR(error);
	goto out;
}
EXPORT_SYMBOL(lookup_bdev);
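
/*
 * Illustrative usage sketch, not part of the original file: lookup_bdev()
 * only resolves a path to a referenced struct block_device, it does not open
 * the device. A caller that just needs the device number drops the reference
 * again with bdput(). The function name is an assumption for this example.
 *
 *	static dev_t example_path_to_devt(const char *path)
 *	{
 *		struct block_device *bdev = lookup_bdev(path);
 *		dev_t dev;
 *
 *		if (IS_ERR(bdev))
 *			return 0;	// treat lookup failure as "no device"
 *		dev = bdev->bd_dev;
 *		bdput(bdev);
 *		return dev;
 *	}
 */
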
int __invalidate_device(struct block_device *bdev, bool kill_dirty)
{
	struct super_block *sb = get_super(bdev);
	int res = 0;

	if (sb) {
		/*
		 * no need to lock the super, get_super holds the
		 * read mutex so the filesystem cannot go away
		 * under us (->put_super runs with the write lock
		 * held).
		 */
		shrink_dcache_sb(sb);
		res = invalidate_inodes(sb, kill_dirty);
		drop_super(sb);
	}
	invalidate_bdev(bdev);
	return res;
}
EXPORT_SYMBOL(__invalidate_device);

void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from the s_inodes list while we dropped the
		 * s_inode_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock.  So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;

		func(I_BDEV(inode), arg);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}