fs/block_dev.c at v2.6.37-rc2
/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/smp_lock.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/kmemleak.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static const struct address_space_operations def_blk_aops;

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}

EXPORT_SYMBOL(I_BDEV);

/*
 * move the inode from its current bdi to a new bdi. if the inode is dirty we
 * need to move it onto the dirty list of @dst so that the inode is always on
 * the right list.
 */
static void bdev_inode_switch_bdi(struct inode *inode,
			struct backing_dev_info *dst)
{
	spin_lock(&inode_lock);
	inode->i_data.backing_dev_info = dst;
	if (inode->i_state & I_DIRTY)
		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
	spin_unlock(&inode_lock);
}

static sector_t max_block(struct block_device *bdev)
{
	sector_t retval = ~((sector_t)0);
	loff_t sz = i_size_read(bdev->bd_inode);

	if (sz) {
		unsigned int size = block_size(bdev);
		unsigned int sizebits = blksize_bits(size);
		retval = (sz >> sizebits);
	}
	return retval;
}

/* Kill _all_ buffers and pagecache, dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
	if (bdev->bd_inode->i_mapping->nrpages == 0)
		return;
	invalidate_bh_lrus();
	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
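/*
 * Illustrative sketch (not part of this file): a filesystem's fill_super()
 * typically negotiates its soft block size with the helpers above.  The
 * "examplefs" name and the 1024-byte request are made up for illustration;
 * sb_min_blocksize() returns 0 when the device's minimum block size cannot
 * be used.
 *
 *	static int examplefs_fill_super(struct super_block *sb, void *data,
 *					int silent)
 *	{
 *		int blocksize = sb_min_blocksize(sb, 1024);
 *
 *		if (!blocksize)
 *			return -EINVAL;
 *		...
 *	}
 */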
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	if (iblock >= max_block(I_BDEV(inode))) {
		if (create)
			return -EIO;

		/*
		 * for reads, we're just trying to fill a partial page.
		 * return a hole, they will have to call get_block again
		 * before they can fill it, and they will get -EIO at that
		 * time
		 */
		return 0;
	}
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static int
blkdev_get_blocks(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	sector_t end_block = max_block(I_BDEV(inode));
	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;

	if ((iblock + max_blocks) > end_block) {
		max_blocks = end_block - iblock;
		if ((long)max_blocks <= 0) {
			if (create)
				return -EIO;	/* write fully beyond EOF */
			/*
			 * It is a read which is fully beyond EOF. We return
			 * a !buffer_mapped buffer
			 */
			max_blocks = 0;
		}
	}

	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	bh->b_size = max_blocks << inode->i_blkbits;
	if (max_blocks)
		set_buffer_mapped(bh);
	return 0;
}

static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
}
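/*
 * Worked example (added for illustration): with a 512-byte block size
 * (i_blkbits == 9) and a 1000-block device, a direct-I/O request mapping
 * iblock 998 with bh->b_size == 4096 enters blkdev_get_blocks() with
 * max_blocks == 8.  Since 998 + 8 > 1000, max_blocks is clamped to 2 and
 * b_size is trimmed to 1024, so the transfer stops exactly at the end of
 * the device.  A request starting at or past block 1000 gets max_blocks == 0
 * and an unmapped buffer, or -EIO if it is a write.
 */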
int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously.  It counts up in freeze_bdev() and
 * down in thaw_bdev().  When it becomes 0, thaw_bdev() actually unfreezes.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1) {
		/*
		 * We don't even need to grab a reference - the first call
		 * to freeze_bdev grabs an active reference and only the last
		 * thaw_bdev drops it.
		 */
		sb = get_super(bdev);
		drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return sb;
	}

	sb = get_active_super(bdev);
	if (!sb)
		goto out;
	error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return ERR_PTR(error);
	}
	deactivate_super(sb);
out:
	sync_blockdev(bdev);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return sb;	/* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);
/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	if (!sb)
		goto out;

	error = thaw_super(sb);
	if (error) {
		bdev->bd_fsfreeze_count++;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return error;
	}
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return 0;
}
EXPORT_SYMBOL(thaw_bdev);
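/*
 * Illustrative sketch (not part of this file): a snapshot implementation
 * would typically bracket its work with freeze_bdev()/thaw_bdev().  The
 * take_snapshot() call is a placeholder for the actual copy-out.
 *
 *	struct super_block *sb;
 *
 *	sb = freeze_bdev(bdev);
 *	if (IS_ERR(sb))
 *		return PTR_ERR(sb);
 *	take_snapshot(bdev);
 *	thaw_bdev(bdev, sb);	(sb may be NULL if nothing was mounted)
 */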
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	int ret;
	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	unlock_page(page);
	page_cache_release(page);

	return ret;
}

/*
 * private llseek:
 * for a block special file file->f_path.dentry->d_inode->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *bd_inode = file->f_mapping->host;
	loff_t size;
	loff_t retval;

	mutex_lock(&bd_inode->i_mutex);
	size = i_size_read(bd_inode);

	switch (origin) {
		case 2:
			offset += size;
			break;
		case 1:
			offset += file->f_pos;
	}
	retval = -EINVAL;
	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
		}
		retval = offset;
	}
	mutex_unlock(&bd_inode->i_mutex);
	return retval;
}

int blkdev_fsync(struct file *filp, int datasync)
{
	struct inode *bd_inode = filp->f_mapping->host;
	struct block_device *bdev = I_BDEV(bd_inode);
	int error;

	/*
	 * There is no need to serialise calls to blkdev_issue_flush with
	 * i_mutex and doing so causes performance issues with concurrent
	 * O_SYNC writers to a block device.
	 */
	mutex_unlock(&bd_inode->i_mutex);

	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	if (error == -EOPNOTSUPP)
		error = 0;

	mutex_lock(&bd_inode->i_mutex);

	return error;
}
EXPORT_SYMBOL(blkdev_fsync);
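/*
 * Userspace view (illustration only, not part of this file): because
 * block_llseek() computes the size by hand, an ordinary lseek() to the end
 * of a block device node reports the device size.  "/dev/sda" is just an
 * example device node.
 *
 *	int fd = open("/dev/sda", O_RDONLY);
 *	off_t size = lseek(fd, 0, SEEK_END);
 */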
/*
 * pseudo-fs
 */

static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_destroy_inode(struct inode *inode)
{
	struct bdev_inode *bdi = BDEV_I(inode);

	kmem_cache_free(bdev_cachep, bdi);
}

static void init_once(void *foo)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	memset(bdev, 0, sizeof(*bdev));
	mutex_init(&bdev->bd_mutex);
	INIT_LIST_HEAD(&bdev->bd_inodes);
	INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
	INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
	inode_init_once(&ei->vfs_inode);
	/* Initialize mutex for freeze. */
	mutex_init(&bdev->bd_fsfreeze_mutex);
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

static void bdev_evict_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
	truncate_inode_pages(&inode->i_data, 0);
	invalidate_inode_buffers(inode); /* is it needed here? */
	end_writeback(inode);
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static struct dentry *bd_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.mount		= bd_mount,
	.kill_sb	= kill_anon_super,
};

struct super_block *blockdev_superblock __read_mostly;

void __init bdev_cache_init(void)
{
	int err;
	struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	/*
	 * This vfsmount structure is only used to obtain the
	 * blockdev_superblock, so tell kmemleak not to report it.
	 */
	kmemleak_not_leak(bd_mnt);
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely a _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody needs a really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(blockdev_superblock, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		inode->i_data.backing_dev_info = &default_backing_dev_info;
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);

/**
 * bdgrab -- Grab a reference to an already referenced block device
 * @bdev:	Block device to grab a reference to.
 */
struct block_device *bdgrab(struct block_device *bdev)
{
	ihold(bdev->bd_inode);
	return bdev;
}

long nr_blockdev_pages(void)
{
	struct block_device *bdev;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each_entry(bdev, &all_bdevs, bd_list) {
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);
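/*
 * Illustrative sketch (not part of this file): the usual lifetime of these
 * references, much as open_by_devnum() further down uses them.  "dev" is a
 * dev_t obtained elsewhere; note that blkdev_get() drops the bdget()
 * reference itself on failure.
 *
 *	struct block_device *bdev = bdget(dev);
 *
 *	if (!bdev)
 *		return -ENOMEM;
 *	err = blkdev_get(bdev, FMODE_READ);
 *	if (err)
 *		return err;
 *	...
 *	blkdev_put(bdev, FMODE_READ);
 */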
static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		ihold(bdev->bd_inode);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional reference to bd_inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			ihold(bdev->bd_inode);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

/* Call when you free inode */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		if (!sb_is_blkdev_sb(inode->i_sb))
			bdev = inode->i_bdev;
		__bd_forget(inode);
	}
	spin_unlock(&bdev_lock);

	if (bdev)
		iput(bdev->bd_inode);
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
{
	if (bdev->bd_holder == holder)
		return true;	/* already a holder */
	else if (bdev->bd_holder != NULL)
		return false;	/* held by someone else */
	else if (bdev->bd_contains == bdev)
		return true;	/* is a whole device which isn't held */

	else if (whole->bd_holder == bd_claim)
		return true;	/* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	/* is a partition of a held device */
	else
		return true;	/* is a partition of an un-held device */
}

/**
 * bd_prepare_to_claim - prepare to claim a block device
 * @bdev: block device of interest
 * @whole: the whole device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Prepare to claim @bdev.  This function fails if @bdev is already
 * claimed by another holder and waits if another claiming is in
 * progress.  This function doesn't actually claim.  On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
 * it multiple times.
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
static int bd_prepare_to_claim(struct block_device *bdev,
			       struct block_device *whole, void *holder)
{
retry:
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder))
		return -EBUSY;

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		spin_lock(&bdev_lock);
		goto retry;
	}

	/* yay, all mine */
	return 0;
}
/**
 * bd_start_claiming - start claiming a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * @bdev is about to be opened exclusively.  Check @bdev can be opened
 * exclusively and mark that an exclusive open is in progress.  Each
 * successful call to this function must be matched with a call to
 * either bd_finish_claiming() or bd_abort_claiming() (which do not
 * fail).
 *
 * This function is used to gain exclusive access to the block device
 * without actually causing other exclusive open attempts to fail. It
 * should be used when the open sequence itself requires exclusive
 * access but may subsequently fail.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to the block device containing @bdev on success, ERR_PTR()
 * value on failure.
 */
static struct block_device *bd_start_claiming(struct block_device *bdev,
					      void *holder)
{
	struct gendisk *disk;
	struct block_device *whole;
	int partno, err;

	might_sleep();

	/*
	 * @bdev might not have been initialized properly yet, look up
	 * and grab the outer block device the hard way.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		return ERR_PTR(-ENXIO);

	whole = bdget_disk(disk, 0);
	module_put(disk->fops->owner);
	put_disk(disk);
	if (!whole)
		return ERR_PTR(-ENOMEM);

	/* prepare to claim, if successful, mark claiming in progress */
	spin_lock(&bdev_lock);

	err = bd_prepare_to_claim(bdev, whole, holder);
	if (err == 0) {
		whole->bd_claiming = holder;
		spin_unlock(&bdev_lock);
		return whole;
	} else {
		spin_unlock(&bdev_lock);
		bdput(whole);
		return ERR_PTR(err);
	}
}

/* releases bdev_lock */
static void __bd_abort_claiming(struct block_device *whole, void *holder)
{
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);

	spin_unlock(&bdev_lock);
	bdput(whole);
}

/**
 * bd_abort_claiming - abort claiming a block device
 * @whole: whole block device returned by bd_start_claiming()
 * @holder: holder trying to claim @bdev
 *
 * Abort a block device claim started by bd_start_claiming().  Note that
 * @whole is not the block device to be claimed but the whole device
 * returned by bd_start_claiming().
 *
 * CONTEXT:
 * Grabs and releases bdev_lock.
 */
static void bd_abort_claiming(struct block_device *whole, void *holder)
{
	spin_lock(&bdev_lock);
	__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
}

/* increment holders when we have a legitimate claim. requires bdev_lock */
static void __bd_claim(struct block_device *bdev, struct block_device *whole,
					void *holder)
{
	/* note that for a whole device bd_holders
	 * will be incremented twice, and bd_holder will
	 * be set to bd_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_claim;
	bdev->bd_holders++;
	bdev->bd_holder = holder;
}

/**
 * bd_finish_claiming - finish claiming a block device
 * @bdev: block device of interest (passed to bd_start_claiming())
 * @whole: whole block device returned by bd_start_claiming()
 * @holder: holder trying to claim @bdev
 *
 * Finish a block device claim started by bd_start_claiming().
 *
 * CONTEXT:
 * Grabs and releases bdev_lock.
 */
static void bd_finish_claiming(struct block_device *bdev,
				struct block_device *whole, void *holder)
{
	spin_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, whole, holder));
	__bd_claim(bdev, whole, holder);
	__bd_abort_claiming(whole, holder); /* not actually an abort */
}
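/*
 * Illustrative sketch (not part of this file): the three-step claiming
 * sequence, as used by blkdev_open() and open_bdev_exclusive() further down,
 * for an exclusive open that may still fail after the claim has been started.
 *
 *	whole = bd_start_claiming(bdev, holder);
 *	if (IS_ERR(whole))
 *		return PTR_ERR(whole);
 *	err = blkdev_get(bdev, mode);
 *	if (err)
 *		bd_abort_claiming(whole, holder);
 *	else
 *		bd_finish_claiming(bdev, whole, holder);
 */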
/**
 * bd_claim - claim a block device
 * @bdev: block device to claim
 * @holder: holder trying to claim @bdev
 *
 * Try to claim @bdev which must have been opened successfully.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * 0 if successful, -EBUSY if @bdev is already claimed.
 */
int bd_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev->bd_contains;
	int res;

	might_sleep();

	spin_lock(&bdev_lock);
	res = bd_prepare_to_claim(bdev, whole, holder);
	if (res == 0)
		__bd_claim(bdev, whole, holder);
	spin_unlock(&bdev_lock);

	return res;
}
EXPORT_SYMBOL(bd_claim);

void bd_release(struct block_device *bdev)
{
	spin_lock(&bdev_lock);
	if (!--bdev->bd_contains->bd_holders)
		bdev->bd_contains->bd_holder = NULL;
	if (!--bdev->bd_holders)
		bdev->bd_holder = NULL;
	spin_unlock(&bdev_lock);
}

EXPORT_SYMBOL(bd_release);
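/*
 * Illustrative sketch (not part of this file): a driver that has already
 * opened @bdev and wants to mark it busy, using its own object as the holder
 * cookie.  "my_driver" is a placeholder for any stable pointer identifying
 * the owner; bd_claim() returns 0 on success and -EBUSY otherwise.
 *
 *	if (bd_claim(bdev, my_driver))
 *		goto err_busy;
 *	...
 *	bd_release(bdev);
 */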
#ifdef CONFIG_SYSFS
/*
 * Functions for bd_claim_by_kobject / bd_release_from_kobject
 *
 * If a kobject is passed to bd_claim_by_kobject()
 * and the kobject has a parent directory,
 * the following symlinks are created:
 *   o from the kobject to the claimed bdev
 *   o from "holders" directory of the bdev to the parent of the kobject
 * bd_release_from_kobject() removes these symlinks.
 *
 * Example:
 *   If /dev/dm-0 maps to /dev/sda, kobject corresponding to
 *   /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
 *      /sys/block/dm-0/slaves/sda --> /sys/block/sda
 *      /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 */

static int add_symlink(struct kobject *from, struct kobject *to)
{
	if (!from || !to)
		return 0;
	return sysfs_create_link(from, to, kobject_name(to));
}

static void del_symlink(struct kobject *from, struct kobject *to)
{
	if (!from || !to)
		return;
	sysfs_remove_link(from, kobject_name(to));
}

/*
 * 'struct bd_holder' contains pointers to kobjects symlinked by
 * bd_claim_by_kobject.
 * It's connected to bd_holder_list which is protected by bdev->bd_mutex.
 */
struct bd_holder {
	struct list_head list;	/* chain of holders of the bdev */
	int count;		/* references from the holder */
	struct kobject *sdir;	/* holder object, e.g. "/block/dm-0/slaves" */
	struct kobject *hdev;	/* e.g. "/block/dm-0" */
	struct kobject *hdir;	/* e.g. "/block/sda/holders" */
	struct kobject *sdev;	/* e.g. "/block/sda" */
};

/*
 * Get references of related kobjects at once.
 * Returns 1 on success, 0 on failure.
 *
 * Should call bd_holder_release_dirs() after successful use.
 */
static int bd_holder_grab_dirs(struct block_device *bdev,
			struct bd_holder *bo)
{
	if (!bdev || !bo)
		return 0;

	bo->sdir = kobject_get(bo->sdir);
	if (!bo->sdir)
		return 0;

	bo->hdev = kobject_get(bo->sdir->parent);
	if (!bo->hdev)
		goto fail_put_sdir;

	bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
	if (!bo->sdev)
		goto fail_put_hdev;

	bo->hdir = kobject_get(bdev->bd_part->holder_dir);
	if (!bo->hdir)
		goto fail_put_sdev;

	return 1;

fail_put_sdev:
	kobject_put(bo->sdev);
fail_put_hdev:
	kobject_put(bo->hdev);
fail_put_sdir:
	kobject_put(bo->sdir);

	return 0;
}

/* Put references of related kobjects at once. */
static void bd_holder_release_dirs(struct bd_holder *bo)
{
	kobject_put(bo->hdir);
	kobject_put(bo->sdev);
	kobject_put(bo->hdev);
	kobject_put(bo->sdir);
}

static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
{
	struct bd_holder *bo;

	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
	if (!bo)
		return NULL;

	bo->count = 1;
	bo->sdir = kobj;

	return bo;
}

static void free_bd_holder(struct bd_holder *bo)
{
	kfree(bo);
}

/**
 * find_bd_holder - find matching struct bd_holder from the block device
 *
 * @bdev:	struct block device to be searched
 * @bo:		target struct bd_holder
 *
 * Returns matching entry with @bo in @bdev->bd_holder_list.
 * If found, increment the reference count and return the pointer.
 * If not found, returns NULL.
 */
static struct bd_holder *find_bd_holder(struct block_device *bdev,
					struct bd_holder *bo)
{
	struct bd_holder *tmp;

	list_for_each_entry(tmp, &bdev->bd_holder_list, list)
		if (tmp->sdir == bo->sdir) {
			tmp->count++;
			return tmp;
		}

	return NULL;
}

/**
 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
 *
 * @bdev:	block device to be bd_claimed
 * @bo:		preallocated and initialized by alloc_bd_holder()
 *
 * Add @bo to @bdev->bd_holder_list, create symlinks.
 *
 * Returns 0 if symlinks are created.
 * Returns -ve if something fails.
 */
static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
{
	int err;

	if (!bo)
		return -EINVAL;

	if (!bd_holder_grab_dirs(bdev, bo))
		return -EBUSY;

	err = add_symlink(bo->sdir, bo->sdev);
	if (err)
		return err;

	err = add_symlink(bo->hdir, bo->hdev);
	if (err) {
		del_symlink(bo->sdir, bo->sdev);
		return err;
	}

	list_add_tail(&bo->list, &bdev->bd_holder_list);
	return 0;
}

/**
 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
 *
 * @bdev:	block device to be bd_claimed
 * @kobj:	holder's kobject
 *
 * If there is matching entry with @kobj in @bdev->bd_holder_list
 * and no other bd_claim() from the same kobject,
 * remove the struct bd_holder from the list, delete symlinks for it.
 *
 * Returns a pointer to the struct bd_holder when it's removed from the list
 * and ready to be freed.
 * Returns NULL if matching claim isn't found or there is other bd_claim()
 * by the same kobject.
 */
static struct bd_holder *del_bd_holder(struct block_device *bdev,
					struct kobject *kobj)
{
	struct bd_holder *bo;

	list_for_each_entry(bo, &bdev->bd_holder_list, list) {
		if (bo->sdir == kobj) {
			bo->count--;
			BUG_ON(bo->count < 0);
			if (!bo->count) {
				list_del(&bo->list);
				del_symlink(bo->sdir, bo->sdev);
				del_symlink(bo->hdir, bo->hdev);
				bd_holder_release_dirs(bo);
				return bo;
			}
			break;
		}
	}

	return NULL;
}
/**
 * bd_claim_by_kobject - bd_claim() with additional kobject signature
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @kobj:	holder's kobject
 *
 * Do bd_claim() and if it succeeds, create sysfs symlinks between
 * the bdev and the holder's kobject.
 * Use bd_release_from_kobject() when releasing the claimed bdev.
 *
 * Returns 0 on success. (same as bd_claim())
 * Returns errno on failure.
 */
static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
				struct kobject *kobj)
{
	int err;
	struct bd_holder *bo, *found;

	if (!kobj)
		return -EINVAL;

	bo = alloc_bd_holder(kobj);
	if (!bo)
		return -ENOMEM;

	mutex_lock(&bdev->bd_mutex);

	err = bd_claim(bdev, holder);
	if (err)
		goto fail;

	found = find_bd_holder(bdev, bo);
	if (found)
		goto fail;

	err = add_bd_holder(bdev, bo);
	if (err)
		bd_release(bdev);
	else
		bo = NULL;
fail:
	mutex_unlock(&bdev->bd_mutex);
	free_bd_holder(bo);
	return err;
}

/**
 * bd_release_from_kobject - bd_release() with additional kobject signature
 *
 * @bdev:	block device to be released
 * @kobj:	holder's kobject
 *
 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
 */
static void bd_release_from_kobject(struct block_device *bdev,
					struct kobject *kobj)
{
	if (!kobj)
		return;

	mutex_lock(&bdev->bd_mutex);
	bd_release(bdev);
	free_bd_holder(del_bd_holder(bdev, kobj));
	mutex_unlock(&bdev->bd_mutex);
}

/**
 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @disk:	holder's gendisk
 *
 * Call bd_claim_by_kobject() with getting @disk->slave_dir.
 */
int bd_claim_by_disk(struct block_device *bdev, void *holder,
			struct gendisk *disk)
{
	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
}
EXPORT_SYMBOL_GPL(bd_claim_by_disk);

/**
 * bd_release_from_disk - wrapper function for bd_release_from_kobject()
 *
 * @bdev:	block device to be released
 * @disk:	holder's gendisk
 *
 * Call bd_release_from_kobject() and put @disk->slave_dir.
 */
void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
{
	bd_release_from_kobject(bdev, disk->slave_dir);
	kobject_put(disk->slave_dir);
}
EXPORT_SYMBOL_GPL(bd_release_from_disk);
#endif
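/*
 * Illustrative sketch (not part of this file): device-mapper style usage.
 * When a stacking driver claims /dev/sda on behalf of dm-0 and passes dm-0's
 * gendisk, the slaves/holders symlinks described above are created.
 * "sda_bdev", "dm_target" and "dm_disk" are placeholders for the claimed
 * block device, the holder cookie and the claiming disk respectively.
 *
 *	err = bd_claim_by_disk(sda_bdev, dm_target, dm_disk);
 *	...
 *	bd_release_from_disk(sda_bdev, dm_disk);
 */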
/*
 * Tries to open a block device by device number.  Use it ONLY if you
 * really do not have anything better - i.e. when you are behind a
 * truly sucky interface and all you are given is a device number.  _Never_
 * to be used for internal purposes.  If you ever need it - reconsider
 * your API.
 */
struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
{
	struct block_device *bdev = bdget(dev);
	int err = -ENOMEM;
	if (bdev)
		err = blkdev_get(bdev, mode);
	return err ? ERR_PTR(err) : bdev;
}

EXPORT_SYMBOL(open_by_devnum);

/**
 * flush_disk - invalidates all buffer-cache entries on a disk
 *
 * @bdev:	struct block device to be flushed
 *
 * Invalidates all buffer-cache entries on a disk. It should be called
 * when a disk has been changed -- either by a media change or online
 * resize.
 */
static void flush_disk(struct block_device *bdev)
{
	if (__invalidate_device(bdev)) {
		char name[BDEVNAME_SIZE] = "";

		if (bdev->bd_disk)
			disk_name(bdev->bd_disk, 0, name);
		printk(KERN_WARNING "VFS: busy inodes on changed media or "
		       "resized disk %s\n", name);
	}

	if (!bdev->bd_disk)
		return;
	if (disk_partitionable(bdev->bd_disk))
		bdev->bd_invalidated = 1;
}

/**
 * check_disk_size_change - checks for disk size change and adjusts bdev size.
 * @disk: struct gendisk to check
 * @bdev: struct bdev to adjust.
 *
 * This routine checks whether the bdev size matches the disk size and
 * adjusts it if it differs.
 */
void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
{
	loff_t disk_size, bdev_size;

	disk_size = (loff_t)get_capacity(disk) << 9;
	bdev_size = i_size_read(bdev->bd_inode);
	if (disk_size != bdev_size) {
		char name[BDEVNAME_SIZE];

		disk_name(disk, 0, name);
		printk(KERN_INFO
		       "%s: detected capacity change from %lld to %lld\n",
		       name, bdev_size, disk_size);
		i_size_write(bdev->bd_inode, disk_size);
		flush_disk(bdev);
	}
}
EXPORT_SYMBOL(check_disk_size_change);

/**
 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
 * @disk: struct gendisk to be revalidated
 *
 * This routine is a wrapper for lower-level driver's revalidate_disk
 * call-backs.  It is used to do common pre and post operations needed
 * for all revalidate_disk operations.
 */
int revalidate_disk(struct gendisk *disk)
{
	struct block_device *bdev;
	int ret = 0;

	if (disk->fops->revalidate_disk)
		ret = disk->fops->revalidate_disk(disk);

	bdev = bdget_disk(disk, 0);
	if (!bdev)
		return ret;

	mutex_lock(&bdev->bd_mutex);
	check_disk_size_change(disk, bdev);
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	return ret;
}
EXPORT_SYMBOL(revalidate_disk);
/*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	const struct block_device_operations *bdops = disk->fops;

	if (!bdops->media_changed)
		return 0;
	if (!bdops->media_changed(bdev->bd_disk))
		return 0;

	flush_disk(bdev);
	if (bdops->revalidate_disk)
		bdops->revalidate_disk(bdev->bd_disk);
	return 1;
}

EXPORT_SYMBOL(check_disk_change);

void bd_set_size(struct block_device *bdev, loff_t size)
{
	unsigned bsize = bdev_logical_block_size(bdev);

	bdev->bd_inode->i_size = size;
	while (bsize < PAGE_CACHE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_block_size = bsize;
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);
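/*
 * Worked example (added for illustration, assuming a 4 KiB PAGE_CACHE_SIZE):
 * with a 512-byte logical block size, a 1 GiB device (size == 0x40000000)
 * has none of bits 9..11 set, so the loop in bd_set_size() above doubles
 * bsize up to PAGE_CACHE_SIZE and the soft block size becomes 4096.  A
 * device of 255 sectors (size == 0x1FE00) has bit 9 set, the loop breaks
 * immediately, and the soft block size stays 512.
 */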
static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);

/*
 * bd_mutex locking:
 *
 *  mutex_lock(part->bd_mutex)
 *    mutex_lock_nested(whole->bd_mutex, 1)
 */

static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
	struct gendisk *disk;
	int ret;
	int partno;
	int perm = 0;

	if (mode & FMODE_READ)
		perm |= MAY_READ;
	if (mode & FMODE_WRITE)
		perm |= MAY_WRITE;
	/*
	 * hooks: /n/, see "layering violations".
	 */
	if (!for_part) {
		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
		if (ret != 0) {
			bdput(bdev);
			return ret;
		}
	}

 restart:

	ret = -ENXIO;
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		goto out;

	mutex_lock_nested(&bdev->bd_mutex, for_part);
	if (!bdev->bd_openers) {
		bdev->bd_disk = disk;
		bdev->bd_contains = bdev;
		if (!partno) {
			struct backing_dev_info *bdi;

			ret = -ENXIO;
			bdev->bd_part = disk_get_part(disk, partno);
			if (!bdev->bd_part)
				goto out_clear;

			if (disk->fops->open) {
				ret = disk->fops->open(bdev, mode);
				if (ret == -ERESTARTSYS) {
					/* Lost a race with 'disk' being
					 * deleted, try again.
					 * See md.c
					 */
					disk_put_part(bdev->bd_part);
					bdev->bd_part = NULL;
					module_put(disk->fops->owner);
					put_disk(disk);
					bdev->bd_disk = NULL;
					mutex_unlock(&bdev->bd_mutex);
					goto restart;
				}
				if (ret)
					goto out_clear;
			}
			if (!bdev->bd_openers) {
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
				bdi = blk_get_backing_dev_info(bdev);
				if (bdi == NULL)
					bdi = &default_backing_dev_info;
				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
			}
			if (bdev->bd_invalidated)
				rescan_partitions(disk, bdev);
		} else {
			struct block_device *whole;
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
				goto out_clear;
			BUG_ON(for_part);
			ret = __blkdev_get(whole, mode, 1);
			if (ret)
				goto out_clear;
			bdev->bd_contains = whole;
			bdev_inode_switch_bdi(bdev->bd_inode,
				whole->bd_inode->i_data.backing_dev_info);
			bdev->bd_part = disk_get_part(disk, partno);
			if (!(disk->flags & GENHD_FL_UP) ||
			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
				ret = -ENXIO;
				goto out_clear;
			}
			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
		}
	} else {
		module_put(disk->fops->owner);
		put_disk(disk);
		disk = NULL;
		if (bdev->bd_contains == bdev) {
			if (bdev->bd_disk->fops->open) {
				ret = bdev->bd_disk->fops->open(bdev, mode);
				if (ret)
					goto out_unlock_bdev;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(bdev->bd_disk, bdev);
		}
	}
	bdev->bd_openers++;
	if (for_part)
		bdev->bd_part_count++;
	mutex_unlock(&bdev->bd_mutex);
	return 0;

 out_clear:
	disk_put_part(bdev->bd_part);
	bdev->bd_disk = NULL;
	bdev->bd_part = NULL;
	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
	if (bdev != bdev->bd_contains)
		__blkdev_put(bdev->bd_contains, mode, 1);
	bdev->bd_contains = NULL;
 out_unlock_bdev:
	mutex_unlock(&bdev->bd_mutex);
 out:
	if (disk)
		module_put(disk->fops->owner);
	put_disk(disk);
	bdput(bdev);

	return ret;
}

int blkdev_get(struct block_device *bdev, fmode_t mode)
{
	return __blkdev_get(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_get);
static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *whole = NULL;
	struct block_device *bdev;
	int res;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	bdev = bd_acquire(inode);
	if (bdev == NULL)
		return -ENOMEM;

	if (filp->f_mode & FMODE_EXCL) {
		whole = bd_start_claiming(bdev, filp);
		if (IS_ERR(whole)) {
			bdput(bdev);
			return PTR_ERR(whole);
		}
	}

	filp->f_mapping = bdev->bd_inode->i_mapping;

	res = blkdev_get(bdev, filp->f_mode);

	if (whole) {
		if (res == 0)
			bd_finish_claiming(bdev, whole, filp);
		else
			bd_abort_claiming(whole, filp);
	}

	return res;
}

static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
{
	int ret = 0;
	struct gendisk *disk = bdev->bd_disk;
	struct block_device *victim = NULL;

	mutex_lock_nested(&bdev->bd_mutex, for_part);
	if (for_part)
		bdev->bd_part_count--;

	if (!--bdev->bd_openers) {
		sync_blockdev(bdev);
		kill_bdev(bdev);
	}
	if (bdev->bd_contains == bdev) {
		if (disk->fops->release)
			ret = disk->fops->release(disk, mode);
	}
	if (!bdev->bd_openers) {
		struct module *owner = disk->fops->owner;

		put_disk(disk);
		module_put(owner);
		disk_put_part(bdev->bd_part);
		bdev->bd_part = NULL;
		bdev->bd_disk = NULL;
		bdev_inode_switch_bdi(bdev->bd_inode,
					&default_backing_dev_info);
		if (bdev != bdev->bd_contains)
			victim = bdev->bd_contains;
		bdev->bd_contains = NULL;
	}
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	if (victim)
		__blkdev_put(victim, mode, 1);
	return ret;
}

int blkdev_put(struct block_device *bdev, fmode_t mode)
{
	return __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);

static int blkdev_close(struct inode * inode, struct file * filp)
{
	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
	if (bdev->bd_holder == filp)
		bd_release(bdev);
	return blkdev_put(bdev, filp->f_mode);
}

static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct block_device *bdev = I_BDEV(file->f_mapping->host);
	fmode_t mode = file->f_mode;

	/*
	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
	 * to update it before every ioctl.
	 */
	if (file->f_flags & O_NDELAY)
		mode |= FMODE_NDELAY;
	else
		mode &= ~FMODE_NDELAY;

	return blkdev_ioctl(bdev, mode, cmd, arg);
}
/*
 * Write data to the block device.  Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 */
ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
			 unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	ssize_t ret;

	BUG_ON(iocb->ki_pos != pos);

	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	if (ret > 0 || ret == -EIOCBQUEUED) {
		ssize_t err;

		err = generic_write_sync(file, pos, ret);
		if (err < 0 && ret > 0)
			ret = err;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_aio_write);

/*
 * Try to release a page associated with block device when the system
 * is under memory pressure.
 */
static int blkdev_releasepage(struct page *page, gfp_t wait)
{
	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;

	if (super && super->s_op->bdev_try_to_free_page)
		return super->s_op->bdev_try_to_free_page(super, page, wait);

	return try_to_free_buffers(page);
}

static const struct address_space_operations def_blk_aops = {
	.readpage	= blkdev_readpage,
	.writepage	= blkdev_writepage,
	.sync_page	= block_sync_page,
	.write_begin	= blkdev_write_begin,
	.write_end	= blkdev_write_end,
	.writepages	= generic_writepages,
	.releasepage	= blkdev_releasepage,
	.direct_IO	= blkdev_direct_IO,
};

const struct file_operations def_blk_fops = {
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= block_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= blkdev_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= blkdev_fsync,
	.unlocked_ioctl	= block_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
	.splice_read	= generic_file_splice_read,
	.splice_write	= generic_file_splice_write,
};

int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
	int res;
	mm_segment_t old_fs = get_fs();
	set_fs(KERNEL_DS);
	res = blkdev_ioctl(bdev, 0, cmd, arg);
	set_fs(old_fs);
	return res;
}

EXPORT_SYMBOL(ioctl_by_bdev);

/**
 * lookup_bdev  - lookup a struct block_device by name
 * @pathname:	special file representing the block device
 *
 * Get a reference to the blockdevice at @pathname in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
struct block_device *lookup_bdev(const char *pathname)
{
	struct block_device *bdev;
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return ERR_PTR(-EINVAL);

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return ERR_PTR(error);

	inode = path.dentry->d_inode;
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto fail;
	error = -EACCES;
	if (path.mnt->mnt_flags & MNT_NODEV)
		goto fail;
	error = -ENOMEM;
	bdev = bd_acquire(inode);
	if (!bdev)
		goto fail;
out:
	path_put(&path);
	return bdev;
fail:
	bdev = ERR_PTR(error);
	goto out;
}
EXPORT_SYMBOL(lookup_bdev);

/**
 * open_bdev_exclusive  -  open a block device by name and set it up for use
 *
 * @path:	special file representing the block device
 * @mode:	FMODE_... combination to be used
 * @holder:	owner for exclusion
 *
 * Open the blockdevice described by the special file at @path, claim it
 * for @holder.
 */
struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
{
	struct block_device *bdev, *whole;
	int error;

	bdev = lookup_bdev(path);
	if (IS_ERR(bdev))
		return bdev;

	whole = bd_start_claiming(bdev, holder);
	if (IS_ERR(whole)) {
		bdput(bdev);
		return whole;
	}

	error = blkdev_get(bdev, mode);
	if (error)
		goto out_abort_claiming;

	error = -EACCES;
	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
		goto out_blkdev_put;

	bd_finish_claiming(bdev, whole, holder);
	return bdev;

out_blkdev_put:
	blkdev_put(bdev, mode);
out_abort_claiming:
	bd_abort_claiming(whole, holder);
	return ERR_PTR(error);
}

EXPORT_SYMBOL(open_bdev_exclusive);

/**
 * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
 *
 * @bdev:	blockdevice to close
 * @mode:	mode, must match that used to open.
 *
 * This is the counterpart to open_bdev_exclusive().
 */
void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
{
	bd_release(bdev);
	blkdev_put(bdev, mode);
}

EXPORT_SYMBOL(close_bdev_exclusive);
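/*
 * Illustrative sketch (not part of this file): the pattern a filesystem
 * mount helper would use with the pair above.  "/dev/sdb1" and "fs_type"
 * are placeholders; the holder cookie only has to be a stable pointer
 * identifying the owner.
 *
 *	bdev = open_bdev_exclusive("/dev/sdb1", FMODE_READ | FMODE_WRITE,
 *				   fs_type);
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	...
 *	close_bdev_exclusive(bdev, FMODE_READ | FMODE_WRITE);
 */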
int __invalidate_device(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int res = 0;

	if (sb) {
		/*
		 * no need to lock the super, get_super holds the
		 * read mutex so the filesystem cannot go away
		 * under us (->put_super runs with the write lock
		 * held).
		 */
		shrink_dcache_sb(sb);
		res = invalidate_inodes(sb);
		drop_super(sb);
	}
	invalidate_bdev(bdev);
	return res;
}
EXPORT_SYMBOL(__invalidate_device);