/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/smp_lock.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <asm/uaccess.h>
#include "internal.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}

EXPORT_SYMBOL(I_BDEV);

static sector_t max_block(struct block_device *bdev)
{
	sector_t retval = ~((sector_t)0);
	loff_t sz = i_size_read(bdev->bd_inode);

	if (sz) {
		unsigned int size = block_size(bdev);
		unsigned int sizebits = blksize_bits(size);
		retval = (sz >> sizebits);
	}
	return retval;
}

/* Kill _all_ buffers, dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
	invalidate_bdev(bdev, 1);
	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_hardsect_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is the same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_hardsect_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
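
/*
 * Usage sketch (annotation, not part of the original file): a
 * filesystem's fill_super typically bounds its block size by the
 * hardware sector size first and then switches to the size recorded
 * on disk, roughly:
 *
 *	sb_min_blocksize(sb, BLOCK_SIZE);
 *	... read the on-disk superblock ...
 *	sb_set_blocksize(sb, blocksize_from_disk);
 *
 * Both helpers return the new block size on success and 0 on failure,
 * so callers must check the return value.  "blocksize_from_disk" is a
 * made-up name for whatever size the filesystem found on disk.
 */
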
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	if (iblock >= max_block(I_BDEV(inode))) {
		if (create)
			return -EIO;

		/*
		 * for reads, we're just trying to fill a partial page.
		 * return a hole, they will have to call get_block again
		 * before they can fill it, and they will get -EIO at that
		 * time
		 */
		return 0;
	}
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static int
blkdev_get_blocks(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	sector_t end_block = max_block(I_BDEV(inode));
	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;

	if ((iblock + max_blocks) > end_block) {
		max_blocks = end_block - iblock;
		if ((long)max_blocks <= 0) {
			if (create)
				return -EIO;	/* write fully beyond EOF */
			/*
			 * It is a read which is fully beyond EOF.  We return
			 * a !buffer_mapped buffer
			 */
			max_blocks = 0;
		}
	}

	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	bh->b_size = max_blocks << inode->i_blkbits;
	if (max_blocks)
		set_buffer_mapped(bh);
	return 0;
}

static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
				iov, offset, nr_segs, blkdev_get_blocks, NULL);
}

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, blkdev_get_block);
}

static int blkdev_commit_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_commit_write(page, from, to);
}

/*
 * private llseek:
 * for a block special file file->f_dentry->d_inode->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *bd_inode = file->f_mapping->host;
	loff_t size;
	loff_t retval;

	mutex_lock(&bd_inode->i_mutex);
	size = i_size_read(bd_inode);

	switch (origin) {
		case 2:
			offset += size;
			break;
		case 1:
			offset += file->f_pos;
	}
	retval = -EINVAL;
	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
		}
		retval = offset;
	}
	mutex_unlock(&bd_inode->i_mutex);
	return retval;
}
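
/*
 * Annotation (not part of the original file): the bare constants in
 * block_llseek() above are the classic whence values: 0 is SEEK_SET
 * (offset from the start), 1 is SEEK_CUR (offset from the current
 * position) and 2 is SEEK_END (offset from the device size obtained
 * via i_size_read()).
 */
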
/*
 * Filp is never NULL; the only case when ->fsync() is called with
 * NULL first argument is nfsd_sync_dir() and that's not a directory.
 */

static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	return sync_blockdev(I_BDEV(filp->f_mapping->host));
}

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static kmem_cache_t * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, SLAB_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_destroy_inode(struct inode *inode)
{
	struct bdev_inode *bdi = BDEV_I(inode);

	bdi->bdev.bd_inode_backing_dev_info = NULL;
	kmem_cache_free(bdev_cachep, bdi);
}

static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR)
	{
		memset(bdev, 0, sizeof(*bdev));
		mutex_init(&bdev->bd_mutex);
		mutex_init(&bdev->bd_mount_mutex);
		INIT_LIST_HEAD(&bdev->bd_inodes);
		INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
		INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
		inode_init_once(&ei->vfs_inode);
	}
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

static void bdev_clear_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.clear_inode = bdev_clear_inode,
};

static int bd_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.get_sb		= bd_get_sb,
	.kill_sb	= kill_anon_super,
};

static struct vfsmount *bd_mnt __read_mostly;
struct super_block *blockdev_superblock;

void __init bdev_cache_init(void)
{
	int err;
	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_PANIC),
			init_once, NULL);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	err = PTR_ERR(bd_mnt);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}
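
/*
 * Annotation (not part of the original file): the magic number
 * 0x62646576 passed to get_sb_pseudo() above is just the ASCII bytes
 * 'b' 'd' 'e' 'v'.  The pseudo-filesystem is never mounted from
 * userspace; it exists so that every block device has a backing inode
 * and address_space for the page cache, looked up later via bdget().
 */
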
/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody needs a really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		inode->i_data.backing_dev_info = &default_backing_dev_info;
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);

long nr_blockdev_pages(void)
{
	struct list_head *p;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each(p, &all_bdevs) {
		struct block_device *bdev;
		bdev = list_entry(p, struct block_device, bd_list);
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);
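
/*
 * Usage sketch (annotation, not part of the original file): bdget()
 * returns the block_device with a reference held on its backing
 * inode, so every successful call must eventually be balanced by
 * bdput():
 *
 *	struct block_device *bdev = bdget(dev);
 *	if (!bdev)
 *		return -ENOMEM;
 *	... use bdev ...
 *	bdput(bdev);
 */
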
static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		atomic_inc(&bdev->bd_inode->i_count);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional bd_inode->i_count for inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			atomic_inc(&bdev->bd_inode->i_count);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

/* Call when you free an inode */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		if (inode->i_sb != blockdev_superblock)
			bdev = inode->i_bdev;
		__bd_forget(inode);
	}
	spin_unlock(&bdev_lock);

	if (bdev)
		iput(bdev->bd_inode);
}

int bd_claim(struct block_device *bdev, void *holder)
{
	int res;
	spin_lock(&bdev_lock);

	/* first decide result */
	if (bdev->bd_holder == holder)
		res = 0;	 /* already a holder */
	else if (bdev->bd_holder != NULL)
		res = -EBUSY;	 /* held by someone else */
	else if (bdev->bd_contains == bdev)
		res = 0;	 /* is a whole device which isn't held */

	else if (bdev->bd_contains->bd_holder == bd_claim)
		res = 0;	 /* is a partition of a device that is being partitioned */
	else if (bdev->bd_contains->bd_holder != NULL)
		res = -EBUSY;	 /* is a partition of a held device */
	else
		res = 0;	 /* is a partition of an un-held device */

	/* now impose change */
	if (res == 0) {
		/* note that for a whole device bd_holders
		 * will be incremented twice, and bd_holder will
		 * be set to bd_claim before being set to holder
		 */
		bdev->bd_contains->bd_holders++;
		bdev->bd_contains->bd_holder = bd_claim;
		bdev->bd_holders++;
		bdev->bd_holder = holder;
	}
	spin_unlock(&bdev_lock);
	return res;
}

EXPORT_SYMBOL(bd_claim);

void bd_release(struct block_device *bdev)
{
	spin_lock(&bdev_lock);
	if (!--bdev->bd_contains->bd_holders)
		bdev->bd_contains->bd_holder = NULL;
	if (!--bdev->bd_holders)
		bdev->bd_holder = NULL;
	spin_unlock(&bdev_lock);
}

EXPORT_SYMBOL(bd_release);
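
/*
 * Annotation (not part of the original file): the holder passed to
 * bd_claim() is an opaque cookie used only for matching.  In this
 * file, blkdev_open() below claims with the struct file when the
 * device is opened O_EXCL, and blkdev_close() releases only when
 * bd_holder still matches that file; mount-time callers claim with a
 * different cookie, which is why an O_EXCL open and a mount exclude
 * each other.
 */
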
#ifdef CONFIG_SYSFS
/*
 * Functions for bd_claim_by_kobject / bd_release_from_kobject
 *
 *     If a kobject is passed to bd_claim_by_kobject()
 *     and the kobject has a parent directory,
 *     the following symlinks are created:
 *        o from the kobject to the claimed bdev
 *        o from "holders" directory of the bdev to the parent of the kobject
 *     bd_release_from_kobject() removes these symlinks.
 *
 *     Example:
 *        If /dev/dm-0 maps to /dev/sda, kobject corresponding to
 *        /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
 *           /sys/block/dm-0/slaves/sda --> /sys/block/sda
 *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 */

static struct kobject *bdev_get_kobj(struct block_device *bdev)
{
	if (bdev->bd_contains != bdev)
		return kobject_get(&bdev->bd_part->kobj);
	else
		return kobject_get(&bdev->bd_disk->kobj);
}

static struct kobject *bdev_get_holder(struct block_device *bdev)
{
	if (bdev->bd_contains != bdev)
		return kobject_get(bdev->bd_part->holder_dir);
	else
		return kobject_get(bdev->bd_disk->holder_dir);
}

static int add_symlink(struct kobject *from, struct kobject *to)
{
	if (!from || !to)
		return 0;
	return sysfs_create_link(from, to, kobject_name(to));
}

static void del_symlink(struct kobject *from, struct kobject *to)
{
	if (!from || !to)
		return;
	sysfs_remove_link(from, kobject_name(to));
}

/*
 * 'struct bd_holder' contains pointers to kobjects symlinked by
 * bd_claim_by_kobject.
 * It's connected to bd_holder_list which is protected by bdev->bd_mutex.
 */
struct bd_holder {
	struct list_head list;	/* chain of holders of the bdev */
	int count;		/* references from the holder */
	struct kobject *sdir;	/* holder object, e.g. "/block/dm-0/slaves" */
	struct kobject *hdev;	/* e.g. "/block/dm-0" */
	struct kobject *hdir;	/* e.g. "/block/sda/holders" */
	struct kobject *sdev;	/* e.g. "/block/sda" */
};

/*
 * Get references of related kobjects at once.
 * Returns 1 on success, 0 on failure.
 *
 * Should call bd_holder_release_dirs() after successful use.
 */
static int bd_holder_grab_dirs(struct block_device *bdev,
			struct bd_holder *bo)
{
	if (!bdev || !bo)
		return 0;

	bo->sdir = kobject_get(bo->sdir);
	if (!bo->sdir)
		return 0;

	bo->hdev = kobject_get(bo->sdir->parent);
	if (!bo->hdev)
		goto fail_put_sdir;

	bo->sdev = bdev_get_kobj(bdev);
	if (!bo->sdev)
		goto fail_put_hdev;

	bo->hdir = bdev_get_holder(bdev);
	if (!bo->hdir)
		goto fail_put_sdev;

	return 1;

fail_put_sdev:
	kobject_put(bo->sdev);
fail_put_hdev:
	kobject_put(bo->hdev);
fail_put_sdir:
	kobject_put(bo->sdir);

	return 0;
}

/* Put references of related kobjects at once. */
static void bd_holder_release_dirs(struct bd_holder *bo)
{
	kobject_put(bo->hdir);
	kobject_put(bo->sdev);
	kobject_put(bo->hdev);
	kobject_put(bo->sdir);
}

static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
{
	struct bd_holder *bo;

	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
	if (!bo)
		return NULL;

	bo->count = 1;
	bo->sdir = kobj;

	return bo;
}

static void free_bd_holder(struct bd_holder *bo)
{
	kfree(bo);
}
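
/*
 * Annotation (not part of the original file): in struct bd_holder the
 * "s" fields describe the claimed (slave) side and the "h" fields the
 * holding side.  For the dm-0/sda example above, sdir is
 * /sys/block/dm-0/slaves, hdev is /sys/block/dm-0, hdir is
 * /sys/block/sda/holders and sdev is /sys/block/sda, so add_symlink()
 * creates sdir/sda -> sdev and hdir/dm-0 -> hdev.
 */
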
/**
 * find_bd_holder - find matching struct bd_holder from the block device
 *
 * @bdev:	struct block device to be searched
 * @bo:		target struct bd_holder
 *
 * Returns matching entry with @bo in @bdev->bd_holder_list.
 * If found, increment the reference count and return the pointer.
 * If not found, returns NULL.
 */
static struct bd_holder *find_bd_holder(struct block_device *bdev,
					struct bd_holder *bo)
{
	struct bd_holder *tmp;

	list_for_each_entry(tmp, &bdev->bd_holder_list, list)
		if (tmp->sdir == bo->sdir) {
			tmp->count++;
			return tmp;
		}

	return NULL;
}

/**
 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
 *
 * @bdev:	block device to be bd_claimed
 * @bo:		preallocated and initialized by alloc_bd_holder()
 *
 * Add @bo to @bdev->bd_holder_list, create symlinks.
 *
 * Returns 0 if symlinks are created.
 * Returns a negative errno if something fails.
 */
static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
{
	int ret;

	if (!bo)
		return -EINVAL;

	if (!bd_holder_grab_dirs(bdev, bo))
		return -EBUSY;

	ret = add_symlink(bo->sdir, bo->sdev);
	if (ret == 0) {
		ret = add_symlink(bo->hdir, bo->hdev);
		if (ret)
			del_symlink(bo->sdir, bo->sdev);
	}
	if (ret == 0)
		list_add_tail(&bo->list, &bdev->bd_holder_list);
	return ret;
}

/**
 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
 *
 * @bdev:	block device to be bd_claimed
 * @kobj:	holder's kobject
 *
 * If there is a matching entry with @kobj in @bdev->bd_holder_list
 * and no other bd_claim() from the same kobject,
 * remove the struct bd_holder from the list, delete symlinks for it.
 *
 * Returns a pointer to the struct bd_holder when it's removed from the list
 * and ready to be freed.
 * Returns NULL if a matching claim isn't found or there is another bd_claim()
 * by the same kobject.
 */
static struct bd_holder *del_bd_holder(struct block_device *bdev,
					struct kobject *kobj)
{
	struct bd_holder *bo;

	list_for_each_entry(bo, &bdev->bd_holder_list, list) {
		if (bo->sdir == kobj) {
			bo->count--;
			BUG_ON(bo->count < 0);
			if (!bo->count) {
				list_del(&bo->list);
				del_symlink(bo->sdir, bo->sdev);
				del_symlink(bo->hdir, bo->hdev);
				bd_holder_release_dirs(bo);
				return bo;
			}
			break;
		}
	}

	return NULL;
}

/**
 * bd_claim_by_kobject - bd_claim() with additional kobject signature
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @kobj:	holder's kobject
 *
 * Do bd_claim() and if it succeeds, create sysfs symlinks between
 * the bdev and the holder's kobject.
 * Use bd_release_from_kobject() when releasing the claimed bdev.
 *
 * Returns 0 on success. (same as bd_claim())
 * Returns errno on failure.
 */
static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
				struct kobject *kobj)
{
	int res;
	struct bd_holder *bo, *found;

	if (!kobj)
		return -EINVAL;

	bo = alloc_bd_holder(kobj);
	if (!bo)
		return -ENOMEM;

	mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
	res = bd_claim(bdev, holder);
	if (res == 0) {
		found = find_bd_holder(bdev, bo);
		if (found == NULL) {
			res = add_bd_holder(bdev, bo);
			if (res)
				bd_release(bdev);
		}
	}

	if (res || found)
		free_bd_holder(bo);
	mutex_unlock(&bdev->bd_mutex);

	return res;
}
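
/*
 * Annotation (not part of the original file): note the cleanup
 * pairing in bd_claim_by_kobject() above.  The preallocated bd_holder
 * is freed both when an existing entry was found (its count was
 * bumped by find_bd_holder() instead) and when anything failed, and
 * if the symlinks could not be created after a successful bd_claim(),
 * the claim itself is rolled back with bd_release().
 */
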
/**
 * bd_release_from_kobject - bd_release() with additional kobject signature
 *
 * @bdev:	block device to be released
 * @kobj:	holder's kobject
 *
 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
 */
static void bd_release_from_kobject(struct block_device *bdev,
					struct kobject *kobj)
{
	struct bd_holder *bo;

	if (!kobj)
		return;

	mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
	bd_release(bdev);
	if ((bo = del_bd_holder(bdev, kobj)))
		free_bd_holder(bo);
	mutex_unlock(&bdev->bd_mutex);
}

/**
 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @disk:	holder's gendisk
 *
 * Call bd_claim_by_kobject() with a reference to @disk->slave_dir.
 */
int bd_claim_by_disk(struct block_device *bdev, void *holder,
			struct gendisk *disk)
{
	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
}
EXPORT_SYMBOL_GPL(bd_claim_by_disk);

/**
 * bd_release_from_disk - wrapper function for bd_release_from_kobject()
 *
 * @bdev:	block device to be released
 * @disk:	holder's gendisk
 *
 * Call bd_release_from_kobject() and put @disk->slave_dir.
 */
void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
{
	bd_release_from_kobject(bdev, disk->slave_dir);
	kobject_put(disk->slave_dir);
}
EXPORT_SYMBOL_GPL(bd_release_from_disk);
#endif

/*
 * Tries to open block device by device number.  Use it ONLY if you
 * really do not have anything better - i.e. when you are behind a
 * truly sucky interface and all you are given is a device number.  _Never_
 * to be used for internal purposes.  If you ever need it - reconsider
 * your API.
 */
struct block_device *open_by_devnum(dev_t dev, unsigned mode)
{
	struct block_device *bdev = bdget(dev);
	int err = -ENOMEM;
	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
	if (bdev)
		err = blkdev_get(bdev, mode, flags);
	return err ? ERR_PTR(err) : bdev;
}

EXPORT_SYMBOL(open_by_devnum);

static int
blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags);

struct block_device *open_partition_by_devnum(dev_t dev, unsigned mode)
{
	struct block_device *bdev = bdget(dev);
	int err = -ENOMEM;
	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
	if (bdev)
		err = blkdev_get_partition(bdev, mode, flags);
	return err ? ERR_PTR(err) : bdev;
}

EXPORT_SYMBOL(open_partition_by_devnum);
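
/*
 * Usage sketch (annotation, not part of the original file): a caller
 * that really only knows a device number would do something like
 *
 *	struct block_device *bdev;
 *
 *	bdev = open_by_devnum(MKDEV(8, 0), FMODE_READ);
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	...
 *	blkdev_put(bdev);
 *
 * where MKDEV(8, 0), the first SCSI disk, is only an example number.
 */
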
/*
 * This routine checks whether a removable medium has been changed,
 * and invalidates all buffer-cache entries in that case.  This
 * is a relatively slow routine, so we have to try to minimize using
 * it.  Thus it is called only upon a 'mount' or 'open'.  This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	struct block_device_operations * bdops = disk->fops;

	if (!bdops->media_changed)
		return 0;
	if (!bdops->media_changed(bdev->bd_disk))
		return 0;

	if (__invalidate_device(bdev))
		printk("VFS: busy inodes on changed media.\n");

	if (bdops->revalidate_disk)
		bdops->revalidate_disk(bdev->bd_disk);
	if (bdev->bd_disk->minors > 1)
		bdev->bd_invalidated = 1;
	return 1;
}

EXPORT_SYMBOL(check_disk_change);

void bd_set_size(struct block_device *bdev, loff_t size)
{
	unsigned bsize = bdev_hardsect_size(bdev);

	bdev->bd_inode->i_size = size;
	while (bsize < PAGE_CACHE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_block_size = bsize;
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);
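
/*
 * Annotation (not part of the original file): bd_set_size() picks the
 * largest power-of-two block size, capped at PAGE_CACHE_SIZE, that
 * still divides the device size.  Two worked examples with 512-byte
 * hard sectors and 4K pages: a device of 1000000 sectors is 512000000
 * bytes, which is divisible by 4096, so bd_block_size becomes 4096;
 * a device of 1000001 sectors has bit 9 set in its byte size, so the
 * loop breaks immediately and bd_block_size stays 512.
 */
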
static int __blkdev_put(struct block_device *bdev, unsigned int subclass)
{
	int ret = 0;
	struct inode *bd_inode = bdev->bd_inode;
	struct gendisk *disk = bdev->bd_disk;

	mutex_lock_nested(&bdev->bd_mutex, subclass);
	lock_kernel();
	if (!--bdev->bd_openers) {
		sync_blockdev(bdev);
		kill_bdev(bdev);
	}
	if (bdev->bd_contains == bdev) {
		if (disk->fops->release)
			ret = disk->fops->release(bd_inode, NULL);
	} else {
		mutex_lock_nested(&bdev->bd_contains->bd_mutex,
				  subclass + 1);
		bdev->bd_contains->bd_part_count--;
		mutex_unlock(&bdev->bd_contains->bd_mutex);
	}
	if (!bdev->bd_openers) {
		struct module *owner = disk->fops->owner;

		put_disk(disk);
		module_put(owner);

		if (bdev->bd_contains != bdev) {
			kobject_put(&bdev->bd_part->kobj);
			bdev->bd_part = NULL;
		}
		bdev->bd_disk = NULL;
		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
		if (bdev != bdev->bd_contains)
			__blkdev_put(bdev->bd_contains, subclass + 1);
		bdev->bd_contains = NULL;
	}
	unlock_kernel();
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	return ret;
}

int blkdev_put(struct block_device *bdev)
{
	return __blkdev_put(bdev, BD_MUTEX_NORMAL);
}
EXPORT_SYMBOL(blkdev_put);

int blkdev_put_partition(struct block_device *bdev)
{
	return __blkdev_put(bdev, BD_MUTEX_PARTITION);
}
EXPORT_SYMBOL(blkdev_put_partition);

static int
blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags);

static int
do_open(struct block_device *bdev, struct file *file, unsigned int subclass)
{
	struct module *owner = NULL;
	struct gendisk *disk;
	int ret = -ENXIO;
	int part;

	file->f_mapping = bdev->bd_inode->i_mapping;
	lock_kernel();
	disk = get_gendisk(bdev->bd_dev, &part);
	if (!disk) {
		unlock_kernel();
		bdput(bdev);
		return ret;
	}
	owner = disk->fops->owner;

	mutex_lock_nested(&bdev->bd_mutex, subclass);

	if (!bdev->bd_openers) {
		bdev->bd_disk = disk;
		bdev->bd_contains = bdev;
		if (!part) {
			struct backing_dev_info *bdi;
			if (disk->fops->open) {
				ret = disk->fops->open(bdev->bd_inode, file);
				if (ret)
					goto out_first;
			}
			if (!bdev->bd_openers) {
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
				bdi = blk_get_backing_dev_info(bdev);
				if (bdi == NULL)
					bdi = &default_backing_dev_info;
				bdev->bd_inode->i_data.backing_dev_info = bdi;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(disk, bdev);
		} else {
			struct hd_struct *p;
			struct block_device *whole;
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
				goto out_first;
			ret = blkdev_get_whole(whole, file->f_mode, file->f_flags);
			if (ret)
				goto out_first;
			bdev->bd_contains = whole;
			mutex_lock_nested(&whole->bd_mutex, BD_MUTEX_WHOLE);
			whole->bd_part_count++;
			p = disk->part[part - 1];
			bdev->bd_inode->i_data.backing_dev_info =
			   whole->bd_inode->i_data.backing_dev_info;
			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
				whole->bd_part_count--;
				mutex_unlock(&whole->bd_mutex);
				ret = -ENXIO;
				goto out_first;
			}
			kobject_get(&p->kobj);
			bdev->bd_part = p;
			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
			mutex_unlock(&whole->bd_mutex);
		}
	} else {
		put_disk(disk);
		module_put(owner);
		if (bdev->bd_contains == bdev) {
			if (bdev->bd_disk->fops->open) {
				ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
				if (ret)
					goto out;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(bdev->bd_disk, bdev);
		} else {
			mutex_lock_nested(&bdev->bd_contains->bd_mutex,
					  BD_MUTEX_WHOLE);
			bdev->bd_contains->bd_part_count++;
			mutex_unlock(&bdev->bd_contains->bd_mutex);
		}
	}
	bdev->bd_openers++;
	mutex_unlock(&bdev->bd_mutex);
	unlock_kernel();
	return 0;

out_first:
	bdev->bd_disk = NULL;
	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
	if (bdev != bdev->bd_contains)
		__blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE);
	bdev->bd_contains = NULL;
	put_disk(disk);
	module_put(owner);
out:
	mutex_unlock(&bdev->bd_mutex);
	unlock_kernel();
	if (ret)
		bdput(bdev);
	return ret;
}

int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, &fake_file, BD_MUTEX_NORMAL);
}

EXPORT_SYMBOL(blkdev_get);

static int
blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, &fake_file, BD_MUTEX_WHOLE);
}
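
/*
 * Annotation (not part of the original file): blkdev_get() and
 * blkdev_get_whole() above, and blkdev_get_partition() below, are
 * identical except for the lockdep subclass handed to do_open().
 * The BD_MUTEX_NORMAL/BD_MUTEX_WHOLE/BD_MUTEX_PARTITION subclasses
 * let lockdep accept the legitimate cases where the bd_mutex of a
 * partition and of the whole device containing it are held at the
 * same time, as in do_open() and __blkdev_put() above.
 */
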
1129 */ 1130 struct file fake_file = {}; 1131 struct dentry fake_dentry = {}; 1132 fake_file.f_mode = mode; 1133 fake_file.f_flags = flags; 1134 fake_file.f_dentry = &fake_dentry; 1135 fake_dentry.d_inode = bdev->bd_inode; 1136 1137 return do_open(bdev, &fake_file, BD_MUTEX_PARTITION); 1138} 1139 1140static int blkdev_open(struct inode * inode, struct file * filp) 1141{ 1142 struct block_device *bdev; 1143 int res; 1144 1145 /* 1146 * Preserve backwards compatibility and allow large file access 1147 * even if userspace doesn't ask for it explicitly. Some mkfs 1148 * binary needs it. We might want to drop this workaround 1149 * during an unstable branch. 1150 */ 1151 filp->f_flags |= O_LARGEFILE; 1152 1153 bdev = bd_acquire(inode); 1154 if (bdev == NULL) 1155 return -ENOMEM; 1156 1157 res = do_open(bdev, filp, BD_MUTEX_NORMAL); 1158 if (res) 1159 return res; 1160 1161 if (!(filp->f_flags & O_EXCL) ) 1162 return 0; 1163 1164 if (!(res = bd_claim(bdev, filp))) 1165 return 0; 1166 1167 blkdev_put(bdev); 1168 return res; 1169} 1170 1171static int blkdev_close(struct inode * inode, struct file * filp) 1172{ 1173 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1174 if (bdev->bd_holder == filp) 1175 bd_release(bdev); 1176 return blkdev_put(bdev); 1177} 1178 1179static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) 1180{ 1181 return blkdev_ioctl(file->f_mapping->host, file, cmd, arg); 1182} 1183 1184const struct address_space_operations def_blk_aops = { 1185 .readpage = blkdev_readpage, 1186 .writepage = blkdev_writepage, 1187 .sync_page = block_sync_page, 1188 .prepare_write = blkdev_prepare_write, 1189 .commit_write = blkdev_commit_write, 1190 .writepages = generic_writepages, 1191 .direct_IO = blkdev_direct_IO, 1192}; 1193 1194const struct file_operations def_blk_fops = { 1195 .open = blkdev_open, 1196 .release = blkdev_close, 1197 .llseek = block_llseek, 1198 .read = do_sync_read, 1199 .write = do_sync_write, 1200 .aio_read = generic_file_aio_read, 1201 .aio_write = generic_file_aio_write_nolock, 1202 .mmap = generic_file_mmap, 1203 .fsync = block_fsync, 1204 .unlocked_ioctl = block_ioctl, 1205#ifdef CONFIG_COMPAT 1206 .compat_ioctl = compat_blkdev_ioctl, 1207#endif 1208 .sendfile = generic_file_sendfile, 1209 .splice_read = generic_file_splice_read, 1210 .splice_write = generic_file_splice_write, 1211}; 1212 1213int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1214{ 1215 int res; 1216 mm_segment_t old_fs = get_fs(); 1217 set_fs(KERNEL_DS); 1218 res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg); 1219 set_fs(old_fs); 1220 return res; 1221} 1222 1223EXPORT_SYMBOL(ioctl_by_bdev); 1224 1225/** 1226 * lookup_bdev - lookup a struct block_device by name 1227 * 1228 * @path: special file representing the block device 1229 * 1230 * Get a reference to the blockdevice at @path in the current 1231 * namespace if possible and return it. Return ERR_PTR(error) 1232 * otherwise. 
1233 */ 1234struct block_device *lookup_bdev(const char *path) 1235{ 1236 struct block_device *bdev; 1237 struct inode *inode; 1238 struct nameidata nd; 1239 int error; 1240 1241 if (!path || !*path) 1242 return ERR_PTR(-EINVAL); 1243 1244 error = path_lookup(path, LOOKUP_FOLLOW, &nd); 1245 if (error) 1246 return ERR_PTR(error); 1247 1248 inode = nd.dentry->d_inode; 1249 error = -ENOTBLK; 1250 if (!S_ISBLK(inode->i_mode)) 1251 goto fail; 1252 error = -EACCES; 1253 if (nd.mnt->mnt_flags & MNT_NODEV) 1254 goto fail; 1255 error = -ENOMEM; 1256 bdev = bd_acquire(inode); 1257 if (!bdev) 1258 goto fail; 1259out: 1260 path_release(&nd); 1261 return bdev; 1262fail: 1263 bdev = ERR_PTR(error); 1264 goto out; 1265} 1266 1267/** 1268 * open_bdev_excl - open a block device by name and set it up for use 1269 * 1270 * @path: special file representing the block device 1271 * @flags: %MS_RDONLY for opening read-only 1272 * @holder: owner for exclusion 1273 * 1274 * Open the blockdevice described by the special file at @path, claim it 1275 * for the @holder. 1276 */ 1277struct block_device *open_bdev_excl(const char *path, int flags, void *holder) 1278{ 1279 struct block_device *bdev; 1280 mode_t mode = FMODE_READ; 1281 int error = 0; 1282 1283 bdev = lookup_bdev(path); 1284 if (IS_ERR(bdev)) 1285 return bdev; 1286 1287 if (!(flags & MS_RDONLY)) 1288 mode |= FMODE_WRITE; 1289 error = blkdev_get(bdev, mode, 0); 1290 if (error) 1291 return ERR_PTR(error); 1292 error = -EACCES; 1293 if (!(flags & MS_RDONLY) && bdev_read_only(bdev)) 1294 goto blkdev_put; 1295 error = bd_claim(bdev, holder); 1296 if (error) 1297 goto blkdev_put; 1298 1299 return bdev; 1300 1301blkdev_put: 1302 blkdev_put(bdev); 1303 return ERR_PTR(error); 1304} 1305 1306EXPORT_SYMBOL(open_bdev_excl); 1307 1308/** 1309 * close_bdev_excl - release a blockdevice openen by open_bdev_excl() 1310 * 1311 * @bdev: blockdevice to close 1312 * 1313 * This is the counterpart to open_bdev_excl(). 1314 */ 1315void close_bdev_excl(struct block_device *bdev) 1316{ 1317 bd_release(bdev); 1318 blkdev_put(bdev); 1319} 1320 1321EXPORT_SYMBOL(close_bdev_excl); 1322 1323int __invalidate_device(struct block_device *bdev) 1324{ 1325 struct super_block *sb = get_super(bdev); 1326 int res = 0; 1327 1328 if (sb) { 1329 /* 1330 * no need to lock the super, get_super holds the 1331 * read mutex so the filesystem cannot go away 1332 * under us (->put_super runs with the write lock 1333 * hold). 1334 */ 1335 shrink_dcache_sb(sb); 1336 res = invalidate_inodes(sb); 1337 drop_super(sb); 1338 } 1339 invalidate_bdev(bdev, 0); 1340 return res; 1341} 1342EXPORT_SYMBOL(__invalidate_device);