fs/block_dev.c at v4.6
/*
 * linux/fs/block_dev.c
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static const struct address_space_operations def_blk_aops;

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret) {
			char name[BDEVNAME_SIZE];
			pr_warn_ratelimited("VFS: Dirty inode writeback failed "
					    "for block device %s (err=%d).\n",
					    bdevname(bdev, name), ret);
		}
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(kill_bdev);

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	lru_add_drain_all();	/* make sure all lru add caches are flushed */
	invalidate_mapping_pages(mapping, 0, -1);
	/* 99% of the time, we don't need to flush the cleancache on the bdev.
	 * But, for the strange corners, let's be cautious.
	 */
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static struct inode *bdev_file_inode(struct file *file)
{
	return file->f_mapping->host;
}

static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = bdev_file_inode(file);

	if (IS_DAX(inode))
		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
				NULL, DIO_SKIP_DIO_COUNT);
	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
				    blkdev_get_block, NULL, NULL,
				    DIO_SKIP_DIO_COUNT);
}

int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev -- lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 * counts down in thaw_bdev(). When it reaches 0, thaw_bdev() actually
 * unfreezes the filesystem.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1) {
		/*
		 * We don't even need to grab a reference - the first call
		 * to freeze_bdev grabs an active reference and only the last
		 * thaw_bdev drops it.
		 */
		sb = get_super(bdev);
		drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return sb;
	}

	sb = get_active_super(bdev);
	if (!sb)
		goto out;
	if (sb->s_op->freeze_super)
		error = sb->s_op->freeze_super(sb);
	else
		error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return ERR_PTR(error);
	}
	deactivate_super(sb);
 out:
	sync_blockdev(bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return sb;	/* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	if (!sb)
		goto out;

	if (sb->s_op->thaw_super)
		error = sb->s_op->thaw_super(sb);
	else
		error = thaw_super(sb);
	if (error) {
		bdev->bd_fsfreeze_count++;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return error;
	}
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return 0;
}
EXPORT_SYMBOL(thaw_bdev);

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_readpages(struct file *file, struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
	put_page(page);

	return ret;
}

/*
 * private llseek:
 * for a block special file file_inode(file)->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t retval;

	inode_lock(bd_inode);
	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
	inode_unlock(bd_inode);
	return retval;
}

int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *bd_inode = bdev_file_inode(filp);
	struct block_device *bdev = I_BDEV(bd_inode);
	int error;

	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
	if (error)
		return error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	if (error == -EOPNOTSUPP)
		error = 0;

	return error;
}
EXPORT_SYMBOL(blkdev_fsync);

/**
 * bdev_read_page() - Start reading a page from a block device
 * @bdev: The device to read the page from
 * @sector: The offset on the device to read the page to (need not be aligned)
 * @page: The page to read
 *
 * On entry, the page should be locked.  It will be unlocked when the page
 * has been read.  If the block driver implements rw_page synchronously,
 * that will be true on exit from this function, but it need not be.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to read this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_read_page(struct block_device *bdev, sector_t sector,
			struct page *page)
{
	const struct block_device_operations *ops = bdev->bd_disk->fops;
	int result = -EOPNOTSUPP;

	if (!ops->rw_page || bdev_get_integrity(bdev))
		return result;

	result = blk_queue_enter(bdev->bd_queue, false);
	if (result)
		return result;
	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
	blk_queue_exit(bdev->bd_queue);
	return result;
}
EXPORT_SYMBOL_GPL(bdev_read_page);

/**
 * bdev_write_page() - Start writing a page to a block device
 * @bdev: The device to write the page to
 * @sector: The offset on the device to write the page to (need not be aligned)
 * @page: The page to write
 * @wbc: The writeback_control for the write
 *
 * On entry, the page should be locked and not currently under writeback.
 * On exit, if the write started successfully, the page will be unlocked and
 * under writeback.  If the write failed already (eg the driver failed to
 * queue the page to the device), the page will still be locked.  If the
 * caller is a ->writepage implementation, it will need to unlock the page.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to write this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_write_page(struct block_device *bdev, sector_t sector,
			struct page *page, struct writeback_control *wbc)
{
	int result;
	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
	const struct block_device_operations *ops = bdev->bd_disk->fops;

	if (!ops->rw_page || bdev_get_integrity(bdev))
		return -EOPNOTSUPP;
	result = blk_queue_enter(bdev->bd_queue, false);
	if (result)
		return result;

	set_page_writeback(page);
	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
	if (result)
		end_page_writeback(page);
	else
		unlock_page(page);
	blk_queue_exit(bdev->bd_queue);
	return result;
}
EXPORT_SYMBOL_GPL(bdev_write_page);

/**
 * bdev_direct_access() - Get the address for directly-accessible memory
 * @bdev: The device containing the memory
 * @dax: control and output parameters for ->direct_access
 *
 * If a block device is made up of directly addressable memory, this function
 * will tell the caller the PFN and the address of the memory.  The address
 * may be directly dereferenced within the kernel without the need to call
 * ioremap(), kmap() or similar.  The PFN is suitable for inserting into
 * page tables.
 *
 * Return: negative errno if an error occurs, otherwise the number of bytes
 * accessible at this address.
 */
long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	sector_t sector = dax->sector;
	long avail, size = dax->size;
	const struct block_device_operations *ops = bdev->bd_disk->fops;

	/*
	 * The device driver is allowed to sleep, in order to make the
	 * memory directly accessible.
	 */
	might_sleep();

	if (size < 0)
		return size;
	if (!ops->direct_access)
		return -EOPNOTSUPP;
	if ((sector + DIV_ROUND_UP(size, 512)) >
					part_nr_sects_read(bdev->bd_part))
		return -ERANGE;
	sector += get_start_sect(bdev);
	if (sector % (PAGE_SIZE / 512))
		return -EINVAL;
	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
	if (!avail)
		return -ERANGE;
	if (avail > 0 && avail & ~PAGE_MASK)
		return -ENXIO;
	return min(avail, size);
}
EXPORT_SYMBOL_GPL(bdev_direct_access);

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	struct bdev_inode *bdi = BDEV_I(inode);

	kmem_cache_free(bdev_cachep, bdi);
}

static void bdev_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, bdev_i_callback);
}

static void init_once(void *foo)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	memset(bdev, 0, sizeof(*bdev));
	mutex_init(&bdev->bd_mutex);
	INIT_LIST_HEAD(&bdev->bd_inodes);
	INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
	INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
	inode_init_once(&ei->vfs_inode);
	/* Initialize mutex for freeze. */
	mutex_init(&bdev->bd_fsfreeze_mutex);
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

static void bdev_evict_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static struct dentry *bd_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	struct dentry *dent;
	dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
	if (dent)
		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
	return dent;
}

static struct file_system_type bd_type = {
	.name = "bdev",
	.mount = bd_mount,
	.kill_sb = kill_anon_super,
};

struct super_block *blockdev_superblock __read_mostly;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;
	static struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody needs a really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(blockdev_superblock, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_super = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);

/**
 * bdgrab -- Grab a reference to an already referenced block device
 * @bdev:	Block device to grab a reference to.
 */
struct block_device *bdgrab(struct block_device *bdev)
{
	ihold(bdev->bd_inode);
	return bdev;
}
EXPORT_SYMBOL(bdgrab);

long nr_blockdev_pages(void)
{
	struct block_device *bdev;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each_entry(bdev, &all_bdevs, bd_list) {
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);

static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		bdgrab(bdev);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional reference to bd_inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			bdgrab(bdev);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

/* Call when you free an inode. */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (!sb_is_blkdev_sb(inode->i_sb))
		bdev = inode->i_bdev;
	__bd_forget(inode);
	spin_unlock(&bdev_lock);

	if (bdev)
		bdput(bdev);
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
{
	if (bdev->bd_holder == holder)
		return true;	/* already a holder */
	else if (bdev->bd_holder != NULL)
		return false;	/* held by someone else */
	else if (bdev->bd_contains == bdev)
		return true;	/* is a whole device which isn't held */

	else if (whole->bd_holder == bd_may_claim)
		return true;	/* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	/* is a partition of a held device */
	else
		return true;	/* is a partition of an un-held device */
}

/**
 * bd_prepare_to_claim - prepare to claim a block device
 * @bdev: block device of interest
 * @whole: the whole device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Prepare to claim @bdev.  This function fails if @bdev is already
 * claimed by another holder and waits if another claiming is in
 * progress.  This function doesn't actually claim.  On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
 * it multiple times.
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
static int bd_prepare_to_claim(struct block_device *bdev,
			       struct block_device *whole, void *holder)
{
retry:
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder))
		return -EBUSY;

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		spin_lock(&bdev_lock);
		goto retry;
	}

	/* yay, all mine */
	return 0;
}

/**
 * bd_start_claiming - start claiming a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * @bdev is about to be opened exclusively.  Check @bdev can be opened
 * exclusively and mark that an exclusive open is in progress.  Each
 * successful call to this function must be matched with a call to
 * either bd_finish_claiming() or bd_abort_claiming() (which do not
 * fail).
 *
 * This function is used to gain exclusive access to the block device
 * without actually causing other exclusive open attempts to fail.  It
 * should be used when the open sequence itself requires exclusive
 * access but may subsequently fail.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to the block device containing @bdev on success, ERR_PTR()
 * value on failure.
 */
static struct block_device *bd_start_claiming(struct block_device *bdev,
					      void *holder)
{
	struct gendisk *disk;
	struct block_device *whole;
	int partno, err;

	might_sleep();

	/*
	 * @bdev might not have been initialized properly yet, look up
	 * and grab the outer block device the hard way.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		return ERR_PTR(-ENXIO);

	/*
	 * Normally, @bdev should equal what's returned from bdget_disk()
	 * if partno is 0; however, some drivers (floppy) use multiple
	 * bdev's for the same physical device and @bdev may be one of the
	 * aliases.  Keep @bdev if partno is 0.  This means claimer
	 * tracking is broken for those devices but it has always been that
	 * way.
	 */
	if (partno)
		whole = bdget_disk(disk, 0);
	else
		whole = bdgrab(bdev);

	module_put(disk->fops->owner);
	put_disk(disk);
	if (!whole)
		return ERR_PTR(-ENOMEM);

	/* prepare to claim, if successful, mark claiming in progress */
	spin_lock(&bdev_lock);

	err = bd_prepare_to_claim(bdev, whole, holder);
	if (err == 0) {
		whole->bd_claiming = holder;
		spin_unlock(&bdev_lock);
		return whole;
	} else {
		spin_unlock(&bdev_lock);
		bdput(whole);
		return ERR_PTR(err);
	}
}

#ifdef CONFIG_SYSFS
struct bd_holder_disk {
	struct list_head list;
	struct gendisk *disk;
	int refcnt;
};

static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
						  struct gendisk *disk)
{
	struct bd_holder_disk *holder;

	list_for_each_entry(holder, &bdev->bd_holder_disks, list)
		if (holder->disk == disk)
			return holder;
	return NULL;
}

static int add_symlink(struct kobject *from, struct kobject *to)
{
	return sysfs_create_link(from, to, kobject_name(to));
}

static void del_symlink(struct kobject *from, struct kobject *to)
{
	sysfs_remove_link(from, kobject_name(to));
}

/**
 * bd_link_disk_holder - create symlinks between holding disk and slave bdev
 * @bdev: the claimed slave bdev
 * @disk: the holding disk
 *
 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
 * This function creates the following sysfs symlinks.
 *
 * - from "slaves" directory of the holder @disk to the claimed @bdev
 * - from "holders" directory of the @bdev to the holder @disk
 *
 * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
 * passed to bd_link_disk_holder(), then:
 *
 *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
 *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 *
 * The caller must have claimed @bdev before calling this function and
 * ensure that both @bdev and @disk are valid during the creation and
 * lifetime of these symlinks.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
	struct bd_holder_disk *holder;
	int ret = 0;

	mutex_lock(&bdev->bd_mutex);

	WARN_ON_ONCE(!bdev->bd_holder);

	/* FIXME: remove the following once add_disk() handles errors */
	if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
		goto out_unlock;

	holder = bd_find_holder_disk(bdev, disk);
	if (holder) {
		holder->refcnt++;
		goto out_unlock;
	}

	holder = kzalloc(sizeof(*holder), GFP_KERNEL);
	if (!holder) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	INIT_LIST_HEAD(&holder->list);
	holder->disk = disk;
	holder->refcnt = 1;

	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	if (ret)
		goto out_free;

	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
	if (ret)
		goto out_del;
	/*
	 * bdev could be deleted beneath us which would implicitly destroy
	 * the holder directory.  Hold on to it.
	 */
	kobject_get(bdev->bd_part->holder_dir);

	list_add(&holder->list, &bdev->bd_holder_disks);
	goto out_unlock;

out_del:
	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
out_free:
	kfree(holder);
out_unlock:
	mutex_unlock(&bdev->bd_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(bd_link_disk_holder);

/**
 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
 * @bdev: the claimed slave bdev
 * @disk: the holding disk
 *
 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
 *
 * CONTEXT:
 * Might sleep.
 */
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
	struct bd_holder_disk *holder;

	mutex_lock(&bdev->bd_mutex);

	holder = bd_find_holder_disk(bdev, disk);

	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
		del_symlink(bdev->bd_part->holder_dir,
			    &disk_to_dev(disk)->kobj);
		kobject_put(bdev->bd_part->holder_dir);
		list_del_init(&holder->list);
		kfree(holder);
	}

	mutex_unlock(&bdev->bd_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif

/**
 * flush_disk - invalidates all buffer-cache entries on a disk
 *
 * @bdev:      struct block device to be flushed
 * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Invalidates all buffer-cache entries on a disk. It should be called
 * when a disk has been changed -- either by a media change or online
 * resize.
 */
static void flush_disk(struct block_device *bdev, bool kill_dirty)
{
	if (__invalidate_device(bdev, kill_dirty)) {
		printk(KERN_WARNING "VFS: busy inodes on changed media or "
		       "resized disk %s\n",
		       bdev->bd_disk ? bdev->bd_disk->disk_name : "");
	}

	if (!bdev->bd_disk)
		return;
	if (disk_part_scan_enabled(bdev->bd_disk))
		bdev->bd_invalidated = 1;
}

/**
 * check_disk_size_change - checks for disk size change and adjusts bdev size.
 * @disk: struct gendisk to check
 * @bdev: struct bdev to adjust.
 *
 * This routine checks whether the bdev size matches the disk size and
 * adjusts it if it differs.
 */
void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
{
	loff_t disk_size, bdev_size;

	disk_size = (loff_t)get_capacity(disk) << 9;
	bdev_size = i_size_read(bdev->bd_inode);
	if (disk_size != bdev_size) {
		printk(KERN_INFO
		       "%s: detected capacity change from %lld to %lld\n",
		       disk->disk_name, bdev_size, disk_size);
		i_size_write(bdev->bd_inode, disk_size);
		flush_disk(bdev, false);
	}
}
EXPORT_SYMBOL(check_disk_size_change);

/**
 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
 * @disk: struct gendisk to be revalidated
 *
 * This routine is a wrapper for lower-level driver's revalidate_disk
 * call-backs.  It is used to do common pre and post operations needed
 * for all revalidate_disk operations.
 */
int revalidate_disk(struct gendisk *disk)
{
	struct block_device *bdev;
	int ret = 0;

	if (disk->fops->revalidate_disk)
		ret = disk->fops->revalidate_disk(disk);
	blk_integrity_revalidate(disk);
	bdev = bdget_disk(disk, 0);
	if (!bdev)
		return ret;

	mutex_lock(&bdev->bd_mutex);
	check_disk_size_change(disk, bdev);
	bdev->bd_invalidated = 0;
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	return ret;
}
EXPORT_SYMBOL(revalidate_disk);

/*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	const struct block_device_operations *bdops = disk->fops;
	unsigned int events;

	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
				   DISK_EVENT_EJECT_REQUEST);
	if (!(events & DISK_EVENT_MEDIA_CHANGE))
		return 0;

	flush_disk(bdev, true);
	if (bdops->revalidate_disk)
		bdops->revalidate_disk(bdev->bd_disk);
	return 1;
}

EXPORT_SYMBOL(check_disk_change);

void bd_set_size(struct block_device *bdev, loff_t size)
{
	unsigned bsize = bdev_logical_block_size(bdev);

	inode_lock(bdev->bd_inode);
	i_size_write(bdev->bd_inode, size);
	inode_unlock(bdev->bd_inode);
	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_block_size = bsize;
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);

static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);

/*
 * bd_mutex locking:
 *
 *  mutex_lock(part->bd_mutex)
 *    mutex_lock_nested(whole->bd_mutex, 1)
 */

static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
	struct gendisk *disk;
	struct module *owner;
	int ret;
	int partno;
	int perm = 0;

	if (mode & FMODE_READ)
		perm |= MAY_READ;
	if (mode & FMODE_WRITE)
		perm |= MAY_WRITE;
	/*
	 * hooks: /n/, see "layering violations".
	 */
	if (!for_part) {
		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
		if (ret != 0) {
			bdput(bdev);
			return ret;
		}
	}

 restart:

	ret = -ENXIO;
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		goto out;
	owner = disk->fops->owner;

	disk_block_events(disk);
	mutex_lock_nested(&bdev->bd_mutex, for_part);
	if (!bdev->bd_openers) {
		bdev->bd_disk = disk;
		bdev->bd_queue = disk->queue;
		bdev->bd_contains = bdev;
		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
			bdev->bd_inode->i_flags = S_DAX;
		else
			bdev->bd_inode->i_flags = 0;

		if (!partno) {
			ret = -ENXIO;
			bdev->bd_part = disk_get_part(disk, partno);
			if (!bdev->bd_part)
				goto out_clear;

			ret = 0;
			if (disk->fops->open) {
				ret = disk->fops->open(bdev, mode);
				if (ret == -ERESTARTSYS) {
					/* Lost a race with 'disk' being
					 * deleted, try again.
					 * See md.c
					 */
					disk_put_part(bdev->bd_part);
					bdev->bd_part = NULL;
					bdev->bd_disk = NULL;
					bdev->bd_queue = NULL;
					mutex_unlock(&bdev->bd_mutex);
					disk_unblock_events(disk);
					put_disk(disk);
					module_put(owner);
					goto restart;
				}
			}

			if (!ret) {
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
				if (!blkdev_dax_capable(bdev))
					bdev->bd_inode->i_flags &= ~S_DAX;
			}

			/*
			 * If the device is invalidated, rescan partition
			 * if open succeeded or failed with -ENOMEDIUM.
			 * The latter is necessary to prevent ghost
			 * partitions on a removed medium.
			 */
			if (bdev->bd_invalidated) {
				if (!ret)
					rescan_partitions(disk, bdev);
				else if (ret == -ENOMEDIUM)
					invalidate_partitions(disk, bdev);
			}

			if (ret)
				goto out_clear;
		} else {
			struct block_device *whole;
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
				goto out_clear;
			BUG_ON(for_part);
			ret = __blkdev_get(whole, mode, 1);
			if (ret)
				goto out_clear;
			bdev->bd_contains = whole;
			bdev->bd_part = disk_get_part(disk, partno);
			if (!(disk->flags & GENHD_FL_UP) ||
			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
				ret = -ENXIO;
				goto out_clear;
			}
			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
			if (!blkdev_dax_capable(bdev))
				bdev->bd_inode->i_flags &= ~S_DAX;
		}
	} else {
		if (bdev->bd_contains == bdev) {
			ret = 0;
			if (bdev->bd_disk->fops->open)
				ret = bdev->bd_disk->fops->open(bdev, mode);
			/* the same as first opener case, read comment there */
			if (bdev->bd_invalidated) {
				if (!ret)
					rescan_partitions(bdev->bd_disk, bdev);
				else if (ret == -ENOMEDIUM)
					invalidate_partitions(bdev->bd_disk, bdev);
			}
			if (ret)
				goto out_unlock_bdev;
		}
		/* only one opener holds refs to the module and disk */
		put_disk(disk);
		module_put(owner);
	}
	bdev->bd_openers++;
	if (for_part)
		bdev->bd_part_count++;
	mutex_unlock(&bdev->bd_mutex);
	disk_unblock_events(disk);
	return 0;

 out_clear:
	disk_put_part(bdev->bd_part);
	bdev->bd_disk = NULL;
	bdev->bd_part = NULL;
	bdev->bd_queue = NULL;
	if (bdev != bdev->bd_contains)
		__blkdev_put(bdev->bd_contains, mode, 1);
	bdev->bd_contains = NULL;
 out_unlock_bdev:
	mutex_unlock(&bdev->bd_mutex);
	disk_unblock_events(disk);
	put_disk(disk);
	module_put(owner);
 out:
	bdput(bdev);

	return ret;
}

/**
 * blkdev_get - open a block device
 * @bdev: block_device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
 * open with exclusive access.  Specifying %FMODE_EXCL with %NULL
 * @holder is invalid.  Exclusive opens may nest for the same @holder.
 *
 * On success, the reference count of @bdev is unchanged.  On failure,
 * @bdev is put.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
	struct block_device *whole = NULL;
	int res;

	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);

	if ((mode & FMODE_EXCL) && holder) {
		whole = bd_start_claiming(bdev, holder);
		if (IS_ERR(whole)) {
			bdput(bdev);
			return PTR_ERR(whole);
		}
	}

	res = __blkdev_get(bdev, mode, 0);

	if (whole) {
		struct gendisk *disk = whole->bd_disk;

		/* finish claiming */
		mutex_lock(&bdev->bd_mutex);
		spin_lock(&bdev_lock);

		if (!res) {
			BUG_ON(!bd_may_claim(bdev, whole, holder));
			/*
			 * Note that for a whole device bd_holders
			 * will be incremented twice, and bd_holder
			 * will be set to bd_may_claim before being
			 * set to holder
			 */
			whole->bd_holders++;
			whole->bd_holder = bd_may_claim;
			bdev->bd_holders++;
			bdev->bd_holder = holder;
		}

		/* tell others that we're done */
		BUG_ON(whole->bd_claiming != holder);
		whole->bd_claiming = NULL;
		wake_up_bit(&whole->bd_claiming, 0);

		spin_unlock(&bdev_lock);

		/*
		 * Block event polling for write claims if requested.  Any
		 * write holder makes the write_holder state stick until
		 * all are released.  This is good enough and tracking
		 * individual writeable references is too fragile given the
		 * way @mode is used in blkdev_get/put().
		 */
		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
			bdev->bd_write_holder = true;
			disk_block_events(disk);
		}

		mutex_unlock(&bdev->bd_mutex);
		bdput(whole);
	}

	return res;
}
EXPORT_SYMBOL(blkdev_get);

/**
 * blkdev_get_by_path - open a block device by name
 * @path: path to the block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the blockdevice described by the device file at @path.  @mode
 * and @holder are identical to blkdev_get().
 *
 * On success, the returned block_device has reference count of one.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
					void *holder)
{
	struct block_device *bdev;
	int err;

	bdev = lookup_bdev(path);
	if (IS_ERR(bdev))
		return bdev;

	err = blkdev_get(bdev, mode, holder);
	if (err)
		return ERR_PTR(err);

	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
		blkdev_put(bdev, mode);
		return ERR_PTR(-EACCES);
	}

	return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_path);

/**
 * blkdev_get_by_dev - open a block device by device number
 * @dev: device number of block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the blockdevice described by device number @dev.  @mode and
 * @holder are identical to blkdev_get().
 *
 * Use it ONLY if you really do not have anything better - i.e. when
 * you are behind a truly sucky interface and all you are given is a
 * device number.  _Never_ to be used for internal purposes.  If you
 * ever need it - reconsider your API.
 *
 * On success, the returned block_device has reference count of one.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{
	struct block_device *bdev;
	int err;

	bdev = bdget(dev);
	if (!bdev)
		return ERR_PTR(-ENOMEM);

	err = blkdev_get(bdev, mode, holder);
	if (err)
		return ERR_PTR(err);

	return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_dev);

static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bdev;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	bdev = bd_acquire(inode);
	if (bdev == NULL)
		return -ENOMEM;

	filp->f_mapping = bdev->bd_inode->i_mapping;

	return blkdev_get(bdev, filp->f_mode, filp);
}

static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
{
	struct gendisk *disk = bdev->bd_disk;
	struct block_device *victim = NULL;

	mutex_lock_nested(&bdev->bd_mutex, for_part);
	if (for_part)
		bdev->bd_part_count--;

	if (!--bdev->bd_openers) {
		WARN_ON_ONCE(bdev->bd_holders);
		sync_blockdev(bdev);
		kill_bdev(bdev);

		bdev_write_inode(bdev);
		/*
		 * Detaching bdev inode from its wb in __destroy_inode()
		 * is too late: the queue which embeds its bdi (along with
		 * root wb) can be gone as soon as we put_disk() below.
		 */
		inode_detach_wb(bdev->bd_inode);
	}
	if (bdev->bd_contains == bdev) {
		if (disk->fops->release)
			disk->fops->release(disk, mode);
	}
	if (!bdev->bd_openers) {
		struct module *owner = disk->fops->owner;

		disk_put_part(bdev->bd_part);
		bdev->bd_part = NULL;
		bdev->bd_disk = NULL;
		if (bdev != bdev->bd_contains)
			victim = bdev->bd_contains;
		bdev->bd_contains = NULL;

		put_disk(disk);
		module_put(owner);
	}
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	if (victim)
		__blkdev_put(victim, mode, 1);
}

void blkdev_put(struct block_device *bdev, fmode_t mode)
{
	mutex_lock(&bdev->bd_mutex);

	if (mode & FMODE_EXCL) {
		bool bdev_free;

		/*
		 * Release a claim on the device.  The holder fields
		 * are protected with bdev_lock.  bd_mutex is to
		 * synchronize disk_holder unlinking.
		 */
		spin_lock(&bdev_lock);

		WARN_ON_ONCE(--bdev->bd_holders < 0);
		WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);

		/* bd_contains might point to self, check in a separate step */
		if ((bdev_free = !bdev->bd_holders))
			bdev->bd_holder = NULL;
		if (!bdev->bd_contains->bd_holders)
			bdev->bd_contains->bd_holder = NULL;

		spin_unlock(&bdev_lock);

		/*
		 * If this was the last claim, remove holder link and
		 * unblock evpoll if it was a write holder.
		 */
		if (bdev_free && bdev->bd_write_holder) {
			disk_unblock_events(bdev->bd_disk);
			bdev->bd_write_holder = false;
		}
	}

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event.  This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);

	mutex_unlock(&bdev->bd_mutex);

	__blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);

static int blkdev_close(struct inode * inode, struct file * filp)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
	blkdev_put(bdev, filp->f_mode);
	return 0;
}

static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	fmode_t mode = file->f_mode;

	/*
	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
	 * to update it before every ioctl.
	 */
	if (file->f_flags & O_NDELAY)
		mode |= FMODE_NDELAY;
	else
		mode &= ~FMODE_NDELAY;

	return blkdev_ioctl(bdev, mode, cmd, arg);
}

/*
 * Write data to the block device.  Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 */
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	struct blk_plug plug;
	ssize_t ret;

	if (bdev_read_only(I_BDEV(bd_inode)))
		return -EPERM;

	if (!iov_iter_count(from))
		return 0;

	if (iocb->ki_pos >= size)
		return -ENOSPC;

	iov_iter_truncate(from, size - iocb->ki_pos);

	blk_start_plug(&plug);
	ret = __generic_file_write_iter(iocb, from);
	if (ret > 0) {
		ssize_t err;
		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
		if (err < 0)
			ret = err;
	}
	blk_finish_plug(&plug);
	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_write_iter);

ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *bd_inode = bdev_file_inode(file);
	loff_t size = i_size_read(bd_inode);
	loff_t pos = iocb->ki_pos;

	if (pos >= size)
		return 0;

	size -= pos;
	iov_iter_truncate(to, size);
	return generic_file_read_iter(iocb, to);
}
EXPORT_SYMBOL_GPL(blkdev_read_iter);

/*
 * Try to release a page associated with block device when the system
 * is under memory pressure.
 */
static int blkdev_releasepage(struct page *page, gfp_t wait)
{
	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;

	if (super && super->s_op->bdev_try_to_free_page)
		return super->s_op->bdev_try_to_free_page(super, page, wait);

	return try_to_free_buffers(page);
}

static int blkdev_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	if (dax_mapping(mapping)) {
		struct block_device *bdev = I_BDEV(mapping->host);

		return dax_writeback_mapping_range(mapping, bdev, wbc);
	}
	return generic_writepages(mapping, wbc);
}

static const struct address_space_operations def_blk_aops = {
	.readpage = blkdev_readpage,
	.readpages = blkdev_readpages,
	.writepage = blkdev_writepage,
	.write_begin = blkdev_write_begin,
	.write_end = blkdev_write_end,
	.writepages = blkdev_writepages,
	.releasepage = blkdev_releasepage,
	.direct_IO = blkdev_direct_IO,
	.is_dirty_writeback = buffer_check_dirty_writeback,
};

#ifdef CONFIG_FS_DAX
/*
 * In the raw block case we do not need to contend with truncation nor
 * unwritten file extents.  Without those concerns there is no need for
 * additional locking beyond the mmap_sem context that these routines
 * are already executing under.
 *
 * Note, there is no protection if the block device is dynamically
 * resized (partition grow/shrink) during a fault. A stable block device
 * size is already not enforced in the blkdev_direct_IO path.
 *
 * For DAX, it is the responsibility of the block device driver to
 * ensure the whole-disk device size is stable while requests are in
 * flight.
 *
 * Finally, unlike the filemap_page_mkwrite() case there is no
 * filesystem superblock to sync against freezing.  We still include a
 * pfn_mkwrite callback for dax drivers to receive write fault
 * notifications.
 */
static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
}

static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
		struct vm_fault *vmf)
{
	return dax_pfn_mkwrite(vma, vmf);
}

static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, unsigned int flags)
{
	return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
}

static const struct vm_operations_struct blkdev_dax_vm_ops = {
	.fault = blkdev_dax_fault,
	.pmd_fault = blkdev_dax_pmd_fault,
	.pfn_mkwrite = blkdev_dax_pfn_mkwrite,
};

static const struct vm_operations_struct blkdev_default_vm_ops = {
	.fault = filemap_fault,
	.map_pages = filemap_map_pages,
};

static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *bd_inode = bdev_file_inode(file);

	file_accessed(file);
	if (IS_DAX(bd_inode)) {
		vma->vm_ops = &blkdev_dax_vm_ops;
		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	} else {
		vma->vm_ops = &blkdev_default_vm_ops;
	}

	return 0;
}
#else
#define blkdev_mmap generic_file_mmap
#endif

const struct file_operations def_blk_fops = {
	.open = blkdev_open,
	.release = blkdev_close,
	.llseek = block_llseek,
	.read_iter = blkdev_read_iter,
	.write_iter = blkdev_write_iter,
	.mmap = blkdev_mmap,
	.fsync = blkdev_fsync,
	.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_blkdev_ioctl,
#endif
	.splice_read = generic_file_splice_read,
	.splice_write = iter_file_splice_write,
};

int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
	int res;
	mm_segment_t old_fs = get_fs();
	set_fs(KERNEL_DS);
	res = blkdev_ioctl(bdev, 0, cmd, arg);
	set_fs(old_fs);
	return res;
}

EXPORT_SYMBOL(ioctl_by_bdev);

/**
 * lookup_bdev - lookup a struct block_device by name
 * @pathname:	special file representing the block device
 *
 * Get a reference to the blockdevice at @pathname in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
struct block_device *lookup_bdev(const char *pathname)
{
	struct block_device *bdev;
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return ERR_PTR(-EINVAL);

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return ERR_PTR(error);

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto fail;
	error = -EACCES;
	if (path.mnt->mnt_flags & MNT_NODEV)
		goto fail;
	error = -ENOMEM;
	bdev = bd_acquire(inode);
	if (!bdev)
		goto fail;
out:
	path_put(&path);
	return bdev;
fail:
	bdev = ERR_PTR(error);
	goto out;
}
EXPORT_SYMBOL(lookup_bdev);

int __invalidate_device(struct block_device *bdev, bool kill_dirty)
{
	struct super_block *sb = get_super(bdev);
	int res = 0;

	if (sb) {
		/*
		 * no need to lock the super, get_super holds the
		 * read mutex so the filesystem cannot go away
		 * under us (->put_super runs with the write lock
		 * held).
		 */
		shrink_dcache_sb(sb);
		res = invalidate_inodes(sb, kill_dirty);
		drop_super(sb);
	}
	invalidate_bdev(bdev);
	return res;
}
EXPORT_SYMBOL(__invalidate_device);

void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * s_inode_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;

		func(I_BDEV(inode), arg);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}
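
/*
 * Illustrative usage sketch: how a typical in-kernel caller might open and
 * release a block device exclusively through the interfaces defined above
 * (blkdev_get_by_path(), blkdev_put()).  The function name, the @path and
 * the @holder value here are illustrative placeholders, not something this
 * file or any driver actually defines.
 */
static int __maybe_unused example_open_backing_bdev(const char *path,
						    void *holder)
{
	struct block_device *bdev;

	/* FMODE_EXCL requires a non-NULL @holder; see blkdev_get(). */
	bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* ... submit I/O against bdev here ... */

	/* Release with the same mode flags that were used to open it. */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	return 0;
}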