fs/block_dev.c at v2.6.19-rc2
/*
 * linux/fs/block_dev.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/smp_lock.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}

EXPORT_SYMBOL(I_BDEV);

static sector_t max_block(struct block_device *bdev)
{
	sector_t retval = ~((sector_t)0);
	loff_t sz = i_size_read(bdev->bd_inode);

	if (sz) {
		unsigned int size = block_size(bdev);
		unsigned int sizebits = blksize_bits(size);
		retval = (sz >> sizebits);
	}
	return retval;
}

/* Kill _all_ buffers, dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
	invalidate_bdev(bdev, 1);
	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_hardsect_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_hardsect_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
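/*
 * Illustrative sketch of how the block-size helpers above are meant to
 * be used.  A filesystem's fill_super() typically asks for its
 * preferred block size via sb_min_blocksize(), which clamps the request
 * up to the device's hard sector size.  The function name and the
 * 1024-byte preference here are hypothetical; only sb_min_blocksize()
 * and its return convention (the resulting size, or 0 on failure) come
 * from this file.
 */
#if 0
static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	/* Ask for 1024-byte blocks; may come back larger, or 0 on error. */
	int blocksize = sb_min_blocksize(sb, 1024);

	if (!blocksize) {
		if (!silent)
			printk(KERN_ERR "example: unable to set blocksize\n");
		return -EINVAL;
	}
	/* ... read the on-disk superblock with sb_bread(sb, ...) etc. ... */
	return 0;
}
#endif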
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	if (iblock >= max_block(I_BDEV(inode))) {
		if (create)
			return -EIO;

		/*
		 * for reads, we're just trying to fill a partial page.
		 * return a hole, they will have to call get_block again
		 * before they can fill it, and they will get -EIO at that
		 * time
		 */
		return 0;
	}
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static int
blkdev_get_blocks(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	sector_t end_block = max_block(I_BDEV(inode));
	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;

	if ((iblock + max_blocks) > end_block) {
		max_blocks = end_block - iblock;
		if ((long)max_blocks <= 0) {
			if (create)
				return -EIO;	/* write fully beyond EOF */
			/*
			 * It is a read which is fully beyond EOF.  We return
			 * a !buffer_mapped buffer
			 */
			max_blocks = 0;
		}
	}

	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	bh->b_size = max_blocks << inode->i_blkbits;
	if (max_blocks)
		set_buffer_mapped(bh);
	return 0;
}

static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
				iov, offset, nr_segs, blkdev_get_blocks, NULL);
}

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, blkdev_get_block);
}

static int blkdev_commit_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_commit_write(page, from, to);
}

/*
 * private llseek:
 * for a block special file file->f_dentry->d_inode->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *bd_inode = file->f_mapping->host;
	loff_t size;
	loff_t retval;

	mutex_lock(&bd_inode->i_mutex);
	size = i_size_read(bd_inode);

	switch (origin) {
		case 2:
			offset += size;
			break;
		case 1:
			offset += file->f_pos;
	}
	retval = -EINVAL;
	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
		}
		retval = offset;
	}
	mutex_unlock(&bd_inode->i_mutex);
	return retval;
}

/*
 * Filp is never NULL; the only case when ->fsync() is called with
 * NULL first argument is nfsd_sync_dir() and that's not a directory.
 */

static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	return sync_blockdev(I_BDEV(filp->f_mapping->host));
}

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static kmem_cache_t * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, SLAB_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_destroy_inode(struct inode *inode)
{
	struct bdev_inode *bdi = BDEV_I(inode);

	bdi->bdev.bd_inode_backing_dev_info = NULL;
	kmem_cache_free(bdev_cachep, bdi);
}

static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR)
	{
		memset(bdev, 0, sizeof(*bdev));
		mutex_init(&bdev->bd_mutex);
		mutex_init(&bdev->bd_mount_mutex);
		INIT_LIST_HEAD(&bdev->bd_inodes);
		INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
		INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
		inode_init_once(&ei->vfs_inode);
	}
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

static void bdev_clear_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.clear_inode = bdev_clear_inode,
};

static int bd_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.get_sb		= bd_get_sb,
	.kill_sb	= kill_anon_super,
};

static struct vfsmount *bd_mnt __read_mostly;
struct super_block *blockdev_superblock;

void __init bdev_cache_init(void)
{
	int err;
	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_PANIC),
			init_once, NULL);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	err = PTR_ERR(bd_mnt);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody will need a really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		inode->i_data.backing_dev_info = &default_backing_dev_info;
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);

long nr_blockdev_pages(void)
{
	struct list_head *p;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each(p, &all_bdevs) {
		struct block_device *bdev;
		bdev = list_entry(p, struct block_device, bd_list);
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);
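/*
 * Illustrative sketch: bdget()/bdput() pair up like any other get/put
 * reference API.  Code that only needs the struct block_device for a
 * known dev_t could do something like the following; the function name
 * and the choice of MKDEV(8, 0) (conventionally sda) are hypothetical,
 * only bdget()/bdput() come from this file.
 */
#if 0
static int example_peek_bdev(void)
{
	struct block_device *bdev = bdget(MKDEV(8, 0));

	if (!bdev)
		return -ENOMEM;
	printk(KERN_INFO "block size: %u\n", bdev->bd_block_size);
	bdput(bdev);	/* drop the inode reference taken by bdget() */
	return 0;
}
#endif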
static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		atomic_inc(&bdev->bd_inode->i_count);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional bd_inode->i_count for inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			atomic_inc(&bdev->bd_inode->i_count);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

/* Call when you free inode */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		if (inode->i_sb != blockdev_superblock)
			bdev = inode->i_bdev;
		__bd_forget(inode);
	}
	spin_unlock(&bdev_lock);

	if (bdev)
		iput(bdev->bd_inode);
}

int bd_claim(struct block_device *bdev, void *holder)
{
	int res;
	spin_lock(&bdev_lock);

	/* first decide result */
	if (bdev->bd_holder == holder)
		res = 0;	/* already a holder */
	else if (bdev->bd_holder != NULL)
		res = -EBUSY;	/* held by someone else */
	else if (bdev->bd_contains == bdev)
		res = 0;	/* is a whole device which isn't held */

	else if (bdev->bd_contains->bd_holder == bd_claim)
		res = 0;	/* is a partition of a device that is being partitioned */
	else if (bdev->bd_contains->bd_holder != NULL)
		res = -EBUSY;	/* is a partition of a held device */
	else
		res = 0;	/* is a partition of an un-held device */

	/* now impose change */
	if (res==0) {
		/* note that for a whole device bd_holders
		 * will be incremented twice, and bd_holder will
		 * be set to bd_claim before being set to holder
		 */
		bdev->bd_contains->bd_holders ++;
		bdev->bd_contains->bd_holder = bd_claim;
		bdev->bd_holders++;
		bdev->bd_holder = holder;
	}
	spin_unlock(&bdev_lock);
	return res;
}

EXPORT_SYMBOL(bd_claim);

void bd_release(struct block_device *bdev)
{
	spin_lock(&bdev_lock);
	if (!--bdev->bd_contains->bd_holders)
		bdev->bd_contains->bd_holder = NULL;
	if (!--bdev->bd_holders)
		bdev->bd_holder = NULL;
	spin_unlock(&bdev_lock);
}

EXPORT_SYMBOL(bd_release);
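/*
 * Illustrative sketch: bd_claim() establishes ownership of an
 * already-open bdev and bd_release() undoes it.  Any unique pointer
 * works as the holder cookie; blkdev_open() below passes the struct
 * file, filesystems commonly pass their super_block.  The function
 * here is hypothetical; bd_claim()/bd_release() and the -EBUSY error
 * convention are from this file.
 */
#if 0
static int example_claim(struct block_device *bdev, struct super_block *sb)
{
	int err = bd_claim(bdev, sb);	/* -EBUSY if someone else holds it */

	if (err)
		return err;
	/* ... use the device exclusively ... */
	bd_release(bdev);
	return 0;
}
#endif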
#ifdef CONFIG_SYSFS
/*
 * Functions for bd_claim_by_kobject / bd_release_from_kobject
 *
 * If a kobject is passed to bd_claim_by_kobject()
 * and the kobject has a parent directory,
 * following symlinks are created:
 *   o from the kobject to the claimed bdev
 *   o from "holders" directory of the bdev to the parent of the kobject
 * bd_release_from_kobject() removes these symlinks.
 *
 * Example:
 *   If /dev/dm-0 maps to /dev/sda, kobject corresponding to
 *   /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
 *     /sys/block/dm-0/slaves/sda --> /sys/block/sda
 *     /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 */

static struct kobject *bdev_get_kobj(struct block_device *bdev)
{
	if (bdev->bd_contains != bdev)
		return kobject_get(&bdev->bd_part->kobj);
	else
		return kobject_get(&bdev->bd_disk->kobj);
}

static struct kobject *bdev_get_holder(struct block_device *bdev)
{
	if (bdev->bd_contains != bdev)
		return kobject_get(bdev->bd_part->holder_dir);
	else
		return kobject_get(bdev->bd_disk->holder_dir);
}

static int add_symlink(struct kobject *from, struct kobject *to)
{
	if (!from || !to)
		return 0;
	return sysfs_create_link(from, to, kobject_name(to));
}

static void del_symlink(struct kobject *from, struct kobject *to)
{
	if (!from || !to)
		return;
	sysfs_remove_link(from, kobject_name(to));
}

/*
 * 'struct bd_holder' contains pointers to kobjects symlinked by
 * bd_claim_by_kobject.
 * It's connected to bd_holder_list which is protected by bdev->bd_sem.
 */
struct bd_holder {
	struct list_head list;	/* chain of holders of the bdev */
	int count;		/* references from the holder */
	struct kobject *sdir;	/* holder object, e.g. "/block/dm-0/slaves" */
	struct kobject *hdev;	/* e.g. "/block/dm-0" */
	struct kobject *hdir;	/* e.g. "/block/sda/holders" */
	struct kobject *sdev;	/* e.g. "/block/sda" */
};

/*
 * Get references of related kobjects at once.
 * Returns 1 on success. 0 on failure.
 *
 * Should call bd_holder_release_dirs() after successful use.
 */
static int bd_holder_grab_dirs(struct block_device *bdev,
			struct bd_holder *bo)
{
	if (!bdev || !bo)
		return 0;

	bo->sdir = kobject_get(bo->sdir);
	if (!bo->sdir)
		return 0;

	bo->hdev = kobject_get(bo->sdir->parent);
	if (!bo->hdev)
		goto fail_put_sdir;

	bo->sdev = bdev_get_kobj(bdev);
	if (!bo->sdev)
		goto fail_put_hdev;

	bo->hdir = bdev_get_holder(bdev);
	if (!bo->hdir)
		goto fail_put_sdev;

	return 1;

fail_put_sdev:
	kobject_put(bo->sdev);
fail_put_hdev:
	kobject_put(bo->hdev);
fail_put_sdir:
	kobject_put(bo->sdir);

	return 0;
}

/* Put references of related kobjects at once. */
static void bd_holder_release_dirs(struct bd_holder *bo)
{
	kobject_put(bo->hdir);
	kobject_put(bo->sdev);
	kobject_put(bo->hdev);
	kobject_put(bo->sdir);
}

static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
{
	struct bd_holder *bo;

	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
	if (!bo)
		return NULL;

	bo->count = 1;
	bo->sdir = kobj;

	return bo;
}

static void free_bd_holder(struct bd_holder *bo)
{
	kfree(bo);
}

/**
 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
 *
 * @bdev:	block device to be bd_claimed
 * @bo:		preallocated and initialized by alloc_bd_holder()
 *
 * If there is no matching entry with @bo in @bdev->bd_holder_list,
 * add @bo to the list, create symlinks.
 *
 * Returns 0 if symlinks are created or already there.
 * Returns -ve if something fails and @bo can be freed.
 */
static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
{
	struct bd_holder *tmp;
	int ret;

	if (!bo)
		return -EINVAL;

	list_for_each_entry(tmp, &bdev->bd_holder_list, list) {
		if (tmp->sdir == bo->sdir) {
			tmp->count++;
			/* We've already done what we need to do here. */
			free_bd_holder(bo);
			return 0;
		}
	}

	if (!bd_holder_grab_dirs(bdev, bo))
		return -EBUSY;

	ret = add_symlink(bo->sdir, bo->sdev);
	if (ret == 0) {
		ret = add_symlink(bo->hdir, bo->hdev);
		if (ret)
			del_symlink(bo->sdir, bo->sdev);
	}
	if (ret == 0)
		list_add_tail(&bo->list, &bdev->bd_holder_list);
	return ret;
}

/**
 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
 *
 * @bdev:	block device to be bd_claimed
 * @kobj:	holder's kobject
 *
 * If there is a matching entry with @kobj in @bdev->bd_holder_list
 * and no other bd_claim() from the same kobject,
 * remove the struct bd_holder from the list, delete symlinks for it.
 *
 * Returns a pointer to the struct bd_holder when it's removed from the list
 * and ready to be freed.
 * Returns NULL if matching claim isn't found or there is another bd_claim()
 * by the same kobject.
 */
static struct bd_holder *del_bd_holder(struct block_device *bdev,
					struct kobject *kobj)
{
	struct bd_holder *bo;

	list_for_each_entry(bo, &bdev->bd_holder_list, list) {
		if (bo->sdir == kobj) {
			bo->count--;
			BUG_ON(bo->count < 0);
			if (!bo->count) {
				list_del(&bo->list);
				del_symlink(bo->sdir, bo->sdev);
				del_symlink(bo->hdir, bo->hdev);
				bd_holder_release_dirs(bo);
				return bo;
			}
			break;
		}
	}

	return NULL;
}

/**
 * bd_claim_by_kobject - bd_claim() with additional kobject signature
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @kobj:	holder's kobject
 *
 * Do bd_claim() and if it succeeds, create sysfs symlinks between
 * the bdev and the holder's kobject.
 * Use bd_release_from_kobject() when releasing the claimed bdev.
 *
 * Returns 0 on success. (same as bd_claim())
 * Returns errno on failure.
 */
static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
				struct kobject *kobj)
{
	int res;
	struct bd_holder *bo;

	if (!kobj)
		return -EINVAL;

	bo = alloc_bd_holder(kobj);
	if (!bo)
		return -ENOMEM;

	mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
	res = bd_claim(bdev, holder);
	if (res == 0)
		res = add_bd_holder(bdev, bo);
	if (res)
		free_bd_holder(bo);
	mutex_unlock(&bdev->bd_mutex);

	return res;
}

/**
 * bd_release_from_kobject - bd_release() with additional kobject signature
 *
 * @bdev:	block device to be released
 * @kobj:	holder's kobject
 *
 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
 */
static void bd_release_from_kobject(struct block_device *bdev,
					struct kobject *kobj)
{
	struct bd_holder *bo;

	if (!kobj)
		return;

	mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
	bd_release(bdev);
	if ((bo = del_bd_holder(bdev, kobj)))
		free_bd_holder(bo);
	mutex_unlock(&bdev->bd_mutex);
}

/**
 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @disk:	holder's gendisk
 *
 * Call bd_claim_by_kobject() with a reference to @disk->slave_dir.
 */
int bd_claim_by_disk(struct block_device *bdev, void *holder,
			struct gendisk *disk)
{
	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
}
EXPORT_SYMBOL_GPL(bd_claim_by_disk);

/**
 * bd_release_from_disk - wrapper function for bd_release_from_kobject()
 *
 * @bdev:	block device to be claimed
 * @disk:	holder's gendisk
 *
 * Call bd_release_from_kobject() and put @disk->slave_dir.
 */
void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
{
	bd_release_from_kobject(bdev, disk->slave_dir);
	kobject_put(disk->slave_dir);
}
EXPORT_SYMBOL_GPL(bd_release_from_disk);
#endif

/*
 * Tries to open block device by device number.  Use it ONLY if you
 * really do not have anything better - i.e. when you are behind a
 * truly sucky interface and all you are given is a device number.  _Never_
 * to be used for internal purposes.  If you ever need it - reconsider
 * your API.
 */
struct block_device *open_by_devnum(dev_t dev, unsigned mode)
{
	struct block_device *bdev = bdget(dev);
	int err = -ENOMEM;
	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
	if (bdev)
		err = blkdev_get(bdev, mode, flags);
	return err ? ERR_PTR(err) : bdev;
}

EXPORT_SYMBOL(open_by_devnum);

static int
blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags);

struct block_device *open_partition_by_devnum(dev_t dev, unsigned mode)
{
	struct block_device *bdev = bdget(dev);
	int err = -ENOMEM;
	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
	if (bdev)
		err = blkdev_get_partition(bdev, mode, flags);
	return err ? ERR_PTR(err) : bdev;
}

EXPORT_SYMBOL(open_partition_by_devnum);
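/*
 * Illustrative sketch: open_by_devnum() returns either a live bdev or
 * an ERR_PTR(), so callers must test with IS_ERR() and balance the
 * open with blkdev_put().  The wrapper below is hypothetical; only
 * open_by_devnum()/blkdev_put() come from this file.
 */
#if 0
static int example_open_by_devnum(dev_t dev)
{
	struct block_device *bdev = open_by_devnum(dev, FMODE_READ);

	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... read from the device ... */
	return blkdev_put(bdev);
}
#endif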
/*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	struct block_device_operations * bdops = disk->fops;

	if (!bdops->media_changed)
		return 0;
	if (!bdops->media_changed(bdev->bd_disk))
		return 0;

	if (__invalidate_device(bdev))
		printk("VFS: busy inodes on changed media.\n");

	if (bdops->revalidate_disk)
		bdops->revalidate_disk(bdev->bd_disk);
	if (bdev->bd_disk->minors > 1)
		bdev->bd_invalidated = 1;
	return 1;
}

EXPORT_SYMBOL(check_disk_change);

void bd_set_size(struct block_device *bdev, loff_t size)
{
	unsigned bsize = bdev_hardsect_size(bdev);

	bdev->bd_inode->i_size = size;
	while (bsize < PAGE_CACHE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_block_size = bsize;
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);

static int __blkdev_put(struct block_device *bdev, unsigned int subclass)
{
	int ret = 0;
	struct inode *bd_inode = bdev->bd_inode;
	struct gendisk *disk = bdev->bd_disk;

	mutex_lock_nested(&bdev->bd_mutex, subclass);
	lock_kernel();
	if (!--bdev->bd_openers) {
		sync_blockdev(bdev);
		kill_bdev(bdev);
	}
	if (bdev->bd_contains == bdev) {
		if (disk->fops->release)
			ret = disk->fops->release(bd_inode, NULL);
	} else {
		mutex_lock_nested(&bdev->bd_contains->bd_mutex,
				  subclass + 1);
		bdev->bd_contains->bd_part_count--;
		mutex_unlock(&bdev->bd_contains->bd_mutex);
	}
	if (!bdev->bd_openers) {
		struct module *owner = disk->fops->owner;

		put_disk(disk);
		module_put(owner);

		if (bdev->bd_contains != bdev) {
			kobject_put(&bdev->bd_part->kobj);
			bdev->bd_part = NULL;
		}
		bdev->bd_disk = NULL;
		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
		if (bdev != bdev->bd_contains)
			__blkdev_put(bdev->bd_contains, subclass + 1);
		bdev->bd_contains = NULL;
	}
	unlock_kernel();
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	return ret;
}

int blkdev_put(struct block_device *bdev)
{
	return __blkdev_put(bdev, BD_MUTEX_NORMAL);
}
EXPORT_SYMBOL(blkdev_put);

int blkdev_put_partition(struct block_device *bdev)
{
	return __blkdev_put(bdev, BD_MUTEX_PARTITION);
}
EXPORT_SYMBOL(blkdev_put_partition);

static int
blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags);

static int
do_open(struct block_device *bdev, struct file *file, unsigned int subclass)
{
	struct module *owner = NULL;
	struct gendisk *disk;
	int ret = -ENXIO;
	int part;

	file->f_mapping = bdev->bd_inode->i_mapping;
	lock_kernel();
	disk = get_gendisk(bdev->bd_dev, &part);
	if (!disk) {
		unlock_kernel();
		bdput(bdev);
		return ret;
	}
	owner = disk->fops->owner;

	mutex_lock_nested(&bdev->bd_mutex, subclass);

	if (!bdev->bd_openers) {
		bdev->bd_disk = disk;
		bdev->bd_contains = bdev;
		if (!part) {
			struct backing_dev_info *bdi;
			if (disk->fops->open) {
				ret = disk->fops->open(bdev->bd_inode, file);
				if (ret)
					goto out_first;
			}
			if (!bdev->bd_openers) {
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
				bdi = blk_get_backing_dev_info(bdev);
				if (bdi == NULL)
					bdi = &default_backing_dev_info;
				bdev->bd_inode->i_data.backing_dev_info = bdi;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(disk, bdev);
		} else {
			struct hd_struct *p;
			struct block_device *whole;
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
				goto out_first;
			ret = blkdev_get_whole(whole, file->f_mode, file->f_flags);
			if (ret)
				goto out_first;
			bdev->bd_contains = whole;
			mutex_lock_nested(&whole->bd_mutex, BD_MUTEX_WHOLE);
			whole->bd_part_count++;
			p = disk->part[part - 1];
			bdev->bd_inode->i_data.backing_dev_info =
			   whole->bd_inode->i_data.backing_dev_info;
			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
				whole->bd_part_count--;
				mutex_unlock(&whole->bd_mutex);
				ret = -ENXIO;
				goto out_first;
			}
			kobject_get(&p->kobj);
			bdev->bd_part = p;
			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
			mutex_unlock(&whole->bd_mutex);
		}
	} else {
		put_disk(disk);
		module_put(owner);
		if (bdev->bd_contains == bdev) {
			if (bdev->bd_disk->fops->open) {
				ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
				if (ret)
					goto out;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(bdev->bd_disk, bdev);
		} else {
			mutex_lock_nested(&bdev->bd_contains->bd_mutex,
					  BD_MUTEX_WHOLE);
			bdev->bd_contains->bd_part_count++;
			mutex_unlock(&bdev->bd_contains->bd_mutex);
		}
	}
	bdev->bd_openers++;
	mutex_unlock(&bdev->bd_mutex);
	unlock_kernel();
	return 0;

out_first:
	bdev->bd_disk = NULL;
	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
	if (bdev != bdev->bd_contains)
		__blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE);
	bdev->bd_contains = NULL;
	put_disk(disk);
	module_put(owner);
out:
	mutex_unlock(&bdev->bd_mutex);
	unlock_kernel();
	if (ret)
		bdput(bdev);
	return ret;
}

int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, &fake_file, BD_MUTEX_NORMAL);
}

EXPORT_SYMBOL(blkdev_get);

static int
blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, &fake_file, BD_MUTEX_WHOLE);
}

static int
blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, &fake_file, BD_MUTEX_PARTITION);
}

static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bdev;
	int res;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	bdev = bd_acquire(inode);

	res = do_open(bdev, filp, BD_MUTEX_NORMAL);
	if (res)
		return res;

	if (!(filp->f_flags & O_EXCL) )
		return 0;

	if (!(res = bd_claim(bdev, filp)))
		return 0;

	blkdev_put(bdev);
	return res;
}

static int blkdev_close(struct inode * inode, struct file * filp)
{
	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
	if (bdev->bd_holder == filp)
		bd_release(bdev);
	return blkdev_put(bdev);
}

static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
}

const struct address_space_operations def_blk_aops = {
	.readpage	= blkdev_readpage,
	.writepage	= blkdev_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= blkdev_prepare_write,
	.commit_write	= blkdev_commit_write,
	.writepages	= generic_writepages,
	.direct_IO	= blkdev_direct_IO,
};

const struct file_operations def_blk_fops = {
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= block_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write_nolock,
	.mmap		= generic_file_mmap,
	.fsync		= block_fsync,
	.unlocked_ioctl	= block_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
	.sendfile	= generic_file_sendfile,
	.splice_read	= generic_file_splice_read,
	.splice_write	= generic_file_splice_write,
};

int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
	int res;
	mm_segment_t old_fs = get_fs();
	set_fs(KERNEL_DS);
	res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg);
	set_fs(old_fs);
	return res;
}

EXPORT_SYMBOL(ioctl_by_bdev);
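/*
 * Illustrative sketch: ioctl_by_bdev() lets kernel code issue a block
 * ioctl with a kernel-space argument, since it temporarily widens the
 * address limit with set_fs(KERNEL_DS).  The wrapper below is
 * hypothetical; BLKGETSIZE64 is a real ioctl that stores the device
 * size in bytes through the pointer argument.
 */
#if 0
static int example_bdev_size(struct block_device *bdev, u64 *bytes)
{
	/* &bytes[0] is a kernel pointer; KERNEL_DS makes that legal here */
	return ioctl_by_bdev(bdev, BLKGETSIZE64, (unsigned long)bytes);
}
#endif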
/**
 * lookup_bdev - lookup a struct block_device by name
 *
 * @path:	special file representing the block device
 *
 * Get a reference to the blockdevice at @path in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
struct block_device *lookup_bdev(const char *path)
{
	struct block_device *bdev;
	struct inode *inode;
	struct nameidata nd;
	int error;

	if (!path || !*path)
		return ERR_PTR(-EINVAL);

	error = path_lookup(path, LOOKUP_FOLLOW, &nd);
	if (error)
		return ERR_PTR(error);

	inode = nd.dentry->d_inode;
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto fail;
	error = -EACCES;
	if (nd.mnt->mnt_flags & MNT_NODEV)
		goto fail;
	error = -ENOMEM;
	bdev = bd_acquire(inode);
	if (!bdev)
		goto fail;
out:
	path_release(&nd);
	return bdev;
fail:
	bdev = ERR_PTR(error);
	goto out;
}

/**
 * open_bdev_excl - open a block device by name and set it up for use
 *
 * @path:	special file representing the block device
 * @flags:	%MS_RDONLY for opening read-only
 * @holder:	owner for exclusion
 *
 * Open the blockdevice described by the special file at @path, claim it
 * for the @holder.
 */
struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
{
	struct block_device *bdev;
	mode_t mode = FMODE_READ;
	int error = 0;

	bdev = lookup_bdev(path);
	if (IS_ERR(bdev))
		return bdev;

	if (!(flags & MS_RDONLY))
		mode |= FMODE_WRITE;
	error = blkdev_get(bdev, mode, 0);
	if (error)
		return ERR_PTR(error);
	error = -EACCES;
	if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
		goto blkdev_put;
	error = bd_claim(bdev, holder);
	if (error)
		goto blkdev_put;

	return bdev;

blkdev_put:
	blkdev_put(bdev);
	return ERR_PTR(error);
}

EXPORT_SYMBOL(open_bdev_excl);

/**
 * close_bdev_excl - release a blockdevice opened by open_bdev_excl()
 *
 * @bdev:	blockdevice to close
 *
 * This is the counterpart to open_bdev_excl().
 */
void close_bdev_excl(struct block_device *bdev)
{
	bd_release(bdev);
	blkdev_put(bdev);
}

EXPORT_SYMBOL(close_bdev_excl);

int __invalidate_device(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int res = 0;

	if (sb) {
		/*
		 * no need to lock the super, get_super holds the
		 * read mutex so the filesystem cannot go away
		 * under us (->put_super runs with the write lock
		 * held).
		 */
		shrink_dcache_sb(sb);
		res = invalidate_inodes(sb);
		drop_super(sb);
	}
	invalidate_bdev(bdev, 0);
	return res;
}
EXPORT_SYMBOL(__invalidate_device);
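/*
 * Illustrative sketch: open_bdev_excl()/close_bdev_excl() are the usual
 * way for filesystem-like code to open a device node by path and hold
 * it exclusively.  The function name and holder below are hypothetical;
 * the call pair and the ERR_PTR() return convention come from this
 * file.
 */
#if 0
static struct block_device *example_grab(const char *path, void *holder)
{
	struct block_device *bdev = open_bdev_excl(path, MS_RDONLY, holder);

	if (IS_ERR(bdev))
		return bdev;	/* e.g. -ENOTBLK, -EACCES or -EBUSY */
	/* ... use the device; later: close_bdev_excl(bdev); */
	return bdev;
}
#endif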