linux/fs/block_dev.c at v2.6.18
/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/smp_lock.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <asm/uaccess.h>

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}

EXPORT_SYMBOL(I_BDEV);

static sector_t max_block(struct block_device *bdev)
{
	sector_t retval = ~((sector_t)0);
	loff_t sz = i_size_read(bdev->bd_inode);

	if (sz) {
		unsigned int size = block_size(bdev);
		unsigned int sizebits = blksize_bits(size);
		retval = (sz >> sizebits);
	}
	return retval;
}

/* Kill _all_ buffers, dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
	invalidate_bdev(bdev, 1);
	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_hardsect_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (bdev->bd_block_size != size) {
		sync_blockdev(bdev);
		bdev->bd_block_size = size;
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_hardsect_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	if (iblock >= max_block(I_BDEV(inode))) {
		if (create)
			return -EIO;

		/*
		 * for reads, we're just trying to fill a partial page.
		 * return a hole, they will have to call get_block again
		 * before they can fill it, and they will get -EIO at that
		 * time
		 */
		return 0;
	}
	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static int
blkdev_get_blocks(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	sector_t end_block = max_block(I_BDEV(inode));
	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;

	if ((iblock + max_blocks) > end_block) {
		max_blocks = end_block - iblock;
		if ((long)max_blocks <= 0) {
			if (create)
				return -EIO;	/* write fully beyond EOF */
			/*
			 * It is a read which is fully beyond EOF.  We return
			 * a !buffer_mapped buffer
			 */
			max_blocks = 0;
		}
	}

	bh->b_bdev = I_BDEV(inode);
	bh->b_blocknr = iblock;
	bh->b_size = max_blocks << inode->i_blkbits;
	if (max_blocks)
		set_buffer_mapped(bh);
	return 0;
}

static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
				iov, offset, nr_segs, blkdev_get_blocks, NULL);
}

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, blkdev_get_block);
}

static int blkdev_commit_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_commit_write(page, from, to);
}

/*
 * private llseek:
 * for a block special file file->f_dentry->d_inode->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *bd_inode = file->f_mapping->host;
	loff_t size;
	loff_t retval;

	mutex_lock(&bd_inode->i_mutex);
	size = i_size_read(bd_inode);

	switch (origin) {
		case 2:
			offset += size;
			break;
		case 1:
			offset += file->f_pos;
	}
	retval = -EINVAL;
	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
		}
		retval = offset;
	}
	mutex_unlock(&bd_inode->i_mutex);
	return retval;
}

/*
 *	Filp is never NULL; the only case when ->fsync() is called with
 *	NULL first argument is nfsd_sync_dir() and that's not a directory.
 */

static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	return sync_blockdev(I_BDEV(filp->f_mapping->host));
}

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static kmem_cache_t * bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, SLAB_KERNEL);
	if (!ei)
		return NULL;
	return &ei->vfs_inode;
}

static void bdev_destroy_inode(struct inode *inode)
{
	struct bdev_inode *bdi = BDEV_I(inode);

	bdi->bdev.bd_inode_backing_dev_info = NULL;
	kmem_cache_free(bdev_cachep, bdi);
}

static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
	struct bdev_inode *ei = (struct bdev_inode *) foo;
	struct block_device *bdev = &ei->bdev;

	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR)
	{
		memset(bdev, 0, sizeof(*bdev));
		mutex_init(&bdev->bd_mutex);
		mutex_init(&bdev->bd_mount_mutex);
		INIT_LIST_HEAD(&bdev->bd_inodes);
		INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
		INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
		inode_init_once(&ei->vfs_inode);
	}
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

static void bdev_clear_inode(struct inode *inode)
{
	struct block_device *bdev = &BDEV_I(inode)->bdev;
	struct list_head *p;
	spin_lock(&bdev_lock);
	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
		__bd_forget(list_entry(p, struct inode, i_devices));
	}
	list_del_init(&bdev->bd_list);
	spin_unlock(&bdev_lock);
}

static struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.clear_inode = bdev_clear_inode,
};

static int bd_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.get_sb		= bd_get_sb,
	.kill_sb	= kill_anon_super,
};

static struct vfsmount *bd_mnt __read_mostly;
struct super_block *blockdev_superblock;

void __init bdev_cache_init(void)
{
	int err;
	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_PANIC),
			init_once, NULL);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	err = PTR_ERR(bd_mnt);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody will need really large one.
 * Keep in mind that it will be fed through icache hash function too.
 */
static inline unsigned long hash(dev_t dev)
{
	return MAJOR(dev)+MINOR(dev);
}

static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}

static LIST_HEAD(all_bdevs);

struct block_device *bdget(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
			bdev_test, bdev_set, &dev);

	if (!inode)
		return NULL;

	bdev = &BDEV_I(inode)->bdev;

	if (inode->i_state & I_NEW) {
		bdev->bd_contains = NULL;
		bdev->bd_inode = inode;
		bdev->bd_block_size = (1 << inode->i_blkbits);
		bdev->bd_part_count = 0;
		bdev->bd_invalidated = 0;
		inode->i_mode = S_IFBLK;
		inode->i_rdev = dev;
		inode->i_bdev = bdev;
		inode->i_data.a_ops = &def_blk_aops;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		inode->i_data.backing_dev_info = &default_backing_dev_info;
		spin_lock(&bdev_lock);
		list_add(&bdev->bd_list, &all_bdevs);
		spin_unlock(&bdev_lock);
		unlock_new_inode(inode);
	}
	return bdev;
}

EXPORT_SYMBOL(bdget);

long nr_blockdev_pages(void)
{
	struct list_head *p;
	long ret = 0;
	spin_lock(&bdev_lock);
	list_for_each(p, &all_bdevs) {
		struct block_device *bdev;
		bdev = list_entry(p, struct block_device, bd_list);
		ret += bdev->bd_inode->i_mapping->nrpages;
	}
	spin_unlock(&bdev_lock);
	return ret;
}

void bdput(struct block_device *bdev)
{
	iput(bdev->bd_inode);
}

EXPORT_SYMBOL(bdput);

static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		atomic_inc(&bdev->bd_inode->i_count);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional bd_inode->i_count for inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			atomic_inc(&bdev->bd_inode->i_count);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}

/* Call when you free inode */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		if (inode->i_sb != blockdev_superblock)
			bdev = inode->i_bdev;
		__bd_forget(inode);
	}
	spin_unlock(&bdev_lock);

	if (bdev)
		iput(bdev->bd_inode);
}

int bd_claim(struct block_device *bdev, void *holder)
{
	int res;
	spin_lock(&bdev_lock);

	/* first decide result */
	if (bdev->bd_holder == holder)
		res = 0;	 /* already a holder */
	else if (bdev->bd_holder != NULL)
		res = -EBUSY;	 /* held by someone else */
	else if (bdev->bd_contains == bdev)
		res = 0;	 /* is a whole device which isn't held */

	else if (bdev->bd_contains->bd_holder == bd_claim)
		res = 0;	 /* is a partition of a device that is being partitioned */
	else if (bdev->bd_contains->bd_holder != NULL)
		res = -EBUSY;	 /* is a partition of a held device */
	else
		res = 0;	 /* is a partition of an un-held device */

	/* now impose change */
	if (res==0) {
		/* note that for a whole device bd_holders
		 * will be incremented twice, and bd_holder will
		 * be set to bd_claim before being set to holder
		 */
		bdev->bd_contains->bd_holders ++;
		bdev->bd_contains->bd_holder = bd_claim;
		bdev->bd_holders++;
		bdev->bd_holder = holder;
	}
	spin_unlock(&bdev_lock);
	return res;
}

EXPORT_SYMBOL(bd_claim);

void bd_release(struct block_device *bdev)
{
	spin_lock(&bdev_lock);
	if (!--bdev->bd_contains->bd_holders)
		bdev->bd_contains->bd_holder = NULL;
	if (!--bdev->bd_holders)
		bdev->bd_holder = NULL;
	spin_unlock(&bdev_lock);
}

EXPORT_SYMBOL(bd_release);

#ifdef CONFIG_SYSFS
/*
 * Functions for bd_claim_by_kobject / bd_release_from_kobject
 *
 *     If a kobject is passed to bd_claim_by_kobject()
 *     and the kobject has a parent directory,
 *     following symlinks are created:
 *        o from the kobject to the claimed bdev
 *        o from "holders" directory of the bdev to the parent of the kobject
 *     bd_release_from_kobject() removes these symlinks.
 *
 *     Example:
 *        If /dev/dm-0 maps to /dev/sda, kobject corresponding to
 *        /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
 *           /sys/block/dm-0/slaves/sda --> /sys/block/sda
 *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 */

static struct kobject *bdev_get_kobj(struct block_device *bdev)
{
	if (bdev->bd_contains != bdev)
		return kobject_get(&bdev->bd_part->kobj);
	else
		return kobject_get(&bdev->bd_disk->kobj);
}

static struct kobject *bdev_get_holder(struct block_device *bdev)
{
	if (bdev->bd_contains != bdev)
		return kobject_get(bdev->bd_part->holder_dir);
	else
		return kobject_get(bdev->bd_disk->holder_dir);
}

static void add_symlink(struct kobject *from, struct kobject *to)
{
	if (!from || !to)
		return;
	sysfs_create_link(from, to, kobject_name(to));
}

static void del_symlink(struct kobject *from, struct kobject *to)
{
	if (!from || !to)
		return;
	sysfs_remove_link(from, kobject_name(to));
}

/*
 * 'struct bd_holder' contains pointers to kobjects symlinked by
 * bd_claim_by_kobject.
 * It's connected to bd_holder_list which is protected by bdev->bd_sem.
 */
struct bd_holder {
	struct list_head list;	/* chain of holders of the bdev */
	int count;		/* references from the holder */
	struct kobject *sdir;	/* holder object, e.g. "/block/dm-0/slaves" */
	struct kobject *hdev;	/* e.g. "/block/dm-0" */
	struct kobject *hdir;	/* e.g. "/block/sda/holders" */
	struct kobject *sdev;	/* e.g. "/block/sda" */
};

/*
 * Get references of related kobjects at once.
 * Returns 1 on success. 0 on failure.
 *
 * Should call bd_holder_release_dirs() after successful use.
 */
static int bd_holder_grab_dirs(struct block_device *bdev,
			struct bd_holder *bo)
{
	if (!bdev || !bo)
		return 0;

	bo->sdir = kobject_get(bo->sdir);
	if (!bo->sdir)
		return 0;

	bo->hdev = kobject_get(bo->sdir->parent);
	if (!bo->hdev)
		goto fail_put_sdir;

	bo->sdev = bdev_get_kobj(bdev);
	if (!bo->sdev)
		goto fail_put_hdev;

	bo->hdir = bdev_get_holder(bdev);
	if (!bo->hdir)
		goto fail_put_sdev;

	return 1;

fail_put_sdev:
	kobject_put(bo->sdev);
fail_put_hdev:
	kobject_put(bo->hdev);
fail_put_sdir:
	kobject_put(bo->sdir);

	return 0;
}

/* Put references of related kobjects at once. */
static void bd_holder_release_dirs(struct bd_holder *bo)
{
	kobject_put(bo->hdir);
	kobject_put(bo->sdev);
	kobject_put(bo->hdev);
	kobject_put(bo->sdir);
}

static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
{
	struct bd_holder *bo;

	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
	if (!bo)
		return NULL;

	bo->count = 1;
	bo->sdir = kobj;

	return bo;
}

static void free_bd_holder(struct bd_holder *bo)
{
	kfree(bo);
}

/**
 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
 *
 * @bdev:	block device to be bd_claimed
 * @bo:		preallocated and initialized by alloc_bd_holder()
 *
 * If there is no matching entry with @bo in @bdev->bd_holder_list,
 * add @bo to the list, create symlinks.
 *
 * Returns 1 if @bo was added to the list.
 * Returns 0 if @bo wasn't used for any reason and should be freed.
 */
static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
{
	struct bd_holder *tmp;

	if (!bo)
		return 0;

	list_for_each_entry(tmp, &bdev->bd_holder_list, list) {
		if (tmp->sdir == bo->sdir) {
			tmp->count++;
			return 0;
		}
	}

	if (!bd_holder_grab_dirs(bdev, bo))
		return 0;

	add_symlink(bo->sdir, bo->sdev);
	add_symlink(bo->hdir, bo->hdev);
	list_add_tail(&bo->list, &bdev->bd_holder_list);
	return 1;
}

/**
 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
 *
 * @bdev:	block device to be bd_claimed
 * @kobj:	holder's kobject
 *
 * If there is a matching entry with @kobj in @bdev->bd_holder_list
 * and no other bd_claim() from the same kobject,
 * remove the struct bd_holder from the list, delete symlinks for it.
 *
 * Returns a pointer to the struct bd_holder when it's removed from the list
 * and ready to be freed.
 * Returns NULL if a matching claim isn't found or there is another bd_claim()
 * by the same kobject.
 */
static struct bd_holder *del_bd_holder(struct block_device *bdev,
					struct kobject *kobj)
{
	struct bd_holder *bo;

	list_for_each_entry(bo, &bdev->bd_holder_list, list) {
		if (bo->sdir == kobj) {
			bo->count--;
			BUG_ON(bo->count < 0);
			if (!bo->count) {
				list_del(&bo->list);
				del_symlink(bo->sdir, bo->sdev);
				del_symlink(bo->hdir, bo->hdev);
				bd_holder_release_dirs(bo);
				return bo;
			}
			break;
		}
	}

	return NULL;
}

/**
 * bd_claim_by_kobject - bd_claim() with additional kobject signature
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @kobj:	holder's kobject
 *
 * Do bd_claim() and if it succeeds, create sysfs symlinks between
 * the bdev and the holder's kobject.
 * Use bd_release_from_kobject() when releasing the claimed bdev.
 *
 * Returns 0 on success. (same as bd_claim())
 * Returns errno on failure.
 */
static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
				struct kobject *kobj)
{
	int res;
	struct bd_holder *bo;

	if (!kobj)
		return -EINVAL;

	bo = alloc_bd_holder(kobj);
	if (!bo)
		return -ENOMEM;

	mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
	res = bd_claim(bdev, holder);
	if (res || !add_bd_holder(bdev, bo))
		free_bd_holder(bo);
	mutex_unlock(&bdev->bd_mutex);

	return res;
}

/**
 * bd_release_from_kobject - bd_release() with additional kobject signature
 *
 * @bdev:	block device to be released
 * @kobj:	holder's kobject
 *
 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
 */
static void bd_release_from_kobject(struct block_device *bdev,
					struct kobject *kobj)
{
	struct bd_holder *bo;

	if (!kobj)
		return;

	mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
	bd_release(bdev);
	if ((bo = del_bd_holder(bdev, kobj)))
		free_bd_holder(bo);
	mutex_unlock(&bdev->bd_mutex);
}

/**
 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @disk:	holder's gendisk
 *
 * Call bd_claim_by_kobject() with a reference to @disk->slave_dir.
 */
int bd_claim_by_disk(struct block_device *bdev, void *holder,
			struct gendisk *disk)
{
	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
}
EXPORT_SYMBOL_GPL(bd_claim_by_disk);

/**
 * bd_release_from_disk - wrapper function for bd_release_from_kobject()
 *
 * @bdev:	block device to be released
 * @disk:	holder's gendisk
 *
 * Call bd_release_from_kobject() and put @disk->slave_dir.
 */
void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
{
	bd_release_from_kobject(bdev, disk->slave_dir);
	kobject_put(disk->slave_dir);
}
EXPORT_SYMBOL_GPL(bd_release_from_disk);
#endif

/*
 * Tries to open block device by device number.  Use it ONLY if you
 * really do not have anything better - i.e. when you are behind a
 * truly sucky interface and all you are given is a device number.  _Never_
 * to be used for internal purposes.  If you ever need it - reconsider
 * your API.
 */
struct block_device *open_by_devnum(dev_t dev, unsigned mode)
{
	struct block_device *bdev = bdget(dev);
	int err = -ENOMEM;
	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
	if (bdev)
		err = blkdev_get(bdev, mode, flags);
	return err ? ERR_PTR(err) : bdev;
}

EXPORT_SYMBOL(open_by_devnum);

static int
blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags);

struct block_device *open_partition_by_devnum(dev_t dev, unsigned mode)
{
	struct block_device *bdev = bdget(dev);
	int err = -ENOMEM;
	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
	if (bdev)
		err = blkdev_get_partition(bdev, mode, flags);
	return err ? ERR_PTR(err) : bdev;
}

EXPORT_SYMBOL(open_partition_by_devnum);


/*
 * This routine checks whether removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'.  This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	struct block_device_operations * bdops = disk->fops;

	if (!bdops->media_changed)
		return 0;
	if (!bdops->media_changed(bdev->bd_disk))
		return 0;

	if (__invalidate_device(bdev))
		printk("VFS: busy inodes on changed media.\n");

	if (bdops->revalidate_disk)
		bdops->revalidate_disk(bdev->bd_disk);
	if (bdev->bd_disk->minors > 1)
		bdev->bd_invalidated = 1;
	return 1;
}

EXPORT_SYMBOL(check_disk_change);

void bd_set_size(struct block_device *bdev, loff_t size)
{
	unsigned bsize = bdev_hardsect_size(bdev);

	bdev->bd_inode->i_size = size;
	while (bsize < PAGE_CACHE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_block_size = bsize;
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);

static int __blkdev_put(struct block_device *bdev, unsigned int subclass)
{
	int ret = 0;
	struct inode *bd_inode = bdev->bd_inode;
	struct gendisk *disk = bdev->bd_disk;

	mutex_lock_nested(&bdev->bd_mutex, subclass);
	lock_kernel();
	if (!--bdev->bd_openers) {
		sync_blockdev(bdev);
		kill_bdev(bdev);
	}
	if (bdev->bd_contains == bdev) {
		if (disk->fops->release)
			ret = disk->fops->release(bd_inode, NULL);
	} else {
		mutex_lock_nested(&bdev->bd_contains->bd_mutex,
				  subclass + 1);
		bdev->bd_contains->bd_part_count--;
		mutex_unlock(&bdev->bd_contains->bd_mutex);
	}
	if (!bdev->bd_openers) {
		struct module *owner = disk->fops->owner;

		put_disk(disk);
		module_put(owner);

		if (bdev->bd_contains != bdev) {
			kobject_put(&bdev->bd_part->kobj);
			bdev->bd_part = NULL;
		}
		bdev->bd_disk = NULL;
		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
		if (bdev != bdev->bd_contains)
			__blkdev_put(bdev->bd_contains, subclass + 1);
		bdev->bd_contains = NULL;
	}
	unlock_kernel();
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	return ret;
}

int blkdev_put(struct block_device *bdev)
{
	return __blkdev_put(bdev, BD_MUTEX_NORMAL);
}
EXPORT_SYMBOL(blkdev_put);

int blkdev_put_partition(struct block_device *bdev)
{
	return __blkdev_put(bdev, BD_MUTEX_PARTITION);
}
EXPORT_SYMBOL(blkdev_put_partition);

static int
blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags);

static int
do_open(struct block_device *bdev, struct file *file, unsigned int subclass)
{
	struct module *owner = NULL;
	struct gendisk *disk;
	int ret = -ENXIO;
	int part;

	file->f_mapping = bdev->bd_inode->i_mapping;
	lock_kernel();
	disk = get_gendisk(bdev->bd_dev, &part);
	if (!disk) {
		unlock_kernel();
		bdput(bdev);
		return ret;
	}
	owner = disk->fops->owner;

	mutex_lock_nested(&bdev->bd_mutex, subclass);

	if (!bdev->bd_openers) {
		bdev->bd_disk = disk;
		bdev->bd_contains = bdev;
		if (!part) {
			struct backing_dev_info *bdi;
			if (disk->fops->open) {
				ret = disk->fops->open(bdev->bd_inode, file);
				if (ret)
					goto out_first;
			}
			if (!bdev->bd_openers) {
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
				bdi = blk_get_backing_dev_info(bdev);
				if (bdi == NULL)
					bdi = &default_backing_dev_info;
				bdev->bd_inode->i_data.backing_dev_info = bdi;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(disk, bdev);
		} else {
			struct hd_struct *p;
			struct block_device *whole;
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
				goto out_first;
			ret = blkdev_get_whole(whole, file->f_mode, file->f_flags);
			if (ret)
				goto out_first;
			bdev->bd_contains = whole;
			mutex_lock_nested(&whole->bd_mutex, BD_MUTEX_WHOLE);
			whole->bd_part_count++;
			p = disk->part[part - 1];
			bdev->bd_inode->i_data.backing_dev_info =
			   whole->bd_inode->i_data.backing_dev_info;
			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
				whole->bd_part_count--;
				mutex_unlock(&whole->bd_mutex);
				ret = -ENXIO;
				goto out_first;
			}
			kobject_get(&p->kobj);
			bdev->bd_part = p;
			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
			mutex_unlock(&whole->bd_mutex);
		}
	} else {
		put_disk(disk);
		module_put(owner);
		if (bdev->bd_contains == bdev) {
			if (bdev->bd_disk->fops->open) {
				ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
				if (ret)
					goto out;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(bdev->bd_disk, bdev);
		} else {
			mutex_lock_nested(&bdev->bd_contains->bd_mutex,
					  BD_MUTEX_PARTITION);
			bdev->bd_contains->bd_part_count++;
			mutex_unlock(&bdev->bd_contains->bd_mutex);
		}
	}
	bdev->bd_openers++;
	mutex_unlock(&bdev->bd_mutex);
	unlock_kernel();
	return 0;

out_first:
	bdev->bd_disk = NULL;
	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
	if (bdev != bdev->bd_contains)
		__blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE);
	bdev->bd_contains = NULL;
	put_disk(disk);
	module_put(owner);
out:
	mutex_unlock(&bdev->bd_mutex);
	unlock_kernel();
	if (ret)
		bdput(bdev);
	return ret;
}

int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, &fake_file, BD_MUTEX_NORMAL);
}

EXPORT_SYMBOL(blkdev_get);

static int
blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, &fake_file, BD_MUTEX_WHOLE);
}

static int
blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, &fake_file, BD_MUTEX_PARTITION);
}

static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bdev;
	int res;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	bdev = bd_acquire(inode);

	res = do_open(bdev, filp, BD_MUTEX_NORMAL);
	if (res)
		return res;

	if (!(filp->f_flags & O_EXCL) )
		return 0;

	if (!(res = bd_claim(bdev, filp)))
		return 0;

	blkdev_put(bdev);
	return res;
}

static int blkdev_close(struct inode * inode, struct file * filp)
{
	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
	if (bdev->bd_holder == filp)
		bd_release(bdev);
	return blkdev_put(bdev);
}

static ssize_t blkdev_file_write(struct file *file, const char __user *buf,
				   size_t count, loff_t *ppos)
{
	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };

	return generic_file_write_nolock(file, &local_iov, 1, ppos);
}

static ssize_t blkdev_file_aio_write(struct kiocb *iocb, const char __user *buf,
				   size_t count, loff_t pos)
{
	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };

	return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
}

static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
}

const struct address_space_operations def_blk_aops = {
	.readpage	= blkdev_readpage,
	.writepage	= blkdev_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= blkdev_prepare_write,
	.commit_write	= blkdev_commit_write,
	.writepages	= generic_writepages,
	.direct_IO	= blkdev_direct_IO,
};

const struct file_operations def_blk_fops = {
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= block_llseek,
	.read		= generic_file_read,
	.write		= blkdev_file_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= blkdev_file_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= block_fsync,
	.unlocked_ioctl	= block_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
	.readv		= generic_file_readv,
	.writev		= generic_file_write_nolock,
	.sendfile	= generic_file_sendfile,
	.splice_read	= generic_file_splice_read,
	.splice_write	= generic_file_splice_write,
};

int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
	int res;
	mm_segment_t old_fs = get_fs();
	set_fs(KERNEL_DS);
	res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg);
	set_fs(old_fs);
	return res;
}

EXPORT_SYMBOL(ioctl_by_bdev);

/**
 * lookup_bdev - lookup a struct block_device by name
 *
 * @path:	special file representing the block device
 *
 * Get a reference to the blockdevice at @path in the current
 * namespace if possible and return it.
 * Return ERR_PTR(error) otherwise.
 */
struct block_device *lookup_bdev(const char *path)
{
	struct block_device *bdev;
	struct inode *inode;
	struct nameidata nd;
	int error;

	if (!path || !*path)
		return ERR_PTR(-EINVAL);

	error = path_lookup(path, LOOKUP_FOLLOW, &nd);
	if (error)
		return ERR_PTR(error);

	inode = nd.dentry->d_inode;
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto fail;
	error = -EACCES;
	if (nd.mnt->mnt_flags & MNT_NODEV)
		goto fail;
	error = -ENOMEM;
	bdev = bd_acquire(inode);
	if (!bdev)
		goto fail;
out:
	path_release(&nd);
	return bdev;
fail:
	bdev = ERR_PTR(error);
	goto out;
}

/**
 * open_bdev_excl - open a block device by name and set it up for use
 *
 * @path:	special file representing the block device
 * @flags:	%MS_RDONLY for opening read-only
 * @holder:	owner for exclusion
 *
 * Open the blockdevice described by the special file at @path, claim it
 * for the @holder.
 */
struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
{
	struct block_device *bdev;
	mode_t mode = FMODE_READ;
	int error = 0;

	bdev = lookup_bdev(path);
	if (IS_ERR(bdev))
		return bdev;

	if (!(flags & MS_RDONLY))
		mode |= FMODE_WRITE;
	error = blkdev_get(bdev, mode, 0);
	if (error)
		return ERR_PTR(error);
	error = -EACCES;
	if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
		goto blkdev_put;
	error = bd_claim(bdev, holder);
	if (error)
		goto blkdev_put;

	return bdev;

blkdev_put:
	blkdev_put(bdev);
	return ERR_PTR(error);
}

EXPORT_SYMBOL(open_bdev_excl);

/**
 * close_bdev_excl - release a blockdevice opened by open_bdev_excl()
 *
 * @bdev:	blockdevice to close
 *
 * This is the counterpart to open_bdev_excl().
 */
void close_bdev_excl(struct block_device *bdev)
{
	bd_release(bdev);
	blkdev_put(bdev);
}

EXPORT_SYMBOL(close_bdev_excl);
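
Editor's note: the listing above exports the helpers a filesystem typically uses to take exclusive ownership of its backing device (lookup_bdev, open_bdev_excl, bd_claim, set_blocksize, close_bdev_excl). The short sketch below is not part of block_dev.c; it only illustrates how a caller might combine those exported functions. The helper name my_open_backing_dev and the constant MY_BLKSIZE are hypothetical, introduced solely for this example.

/* Illustrative sketch, assuming a 2.6.18-era kernel and the exports above. */
#include <linux/fs.h>
#include <linux/err.h>

#define MY_BLKSIZE 4096		/* hypothetical; must be a power of two, */
				/* >= hardware sector size, <= PAGE_SIZE */

static struct block_device *my_open_backing_dev(const char *path, void *holder)
{
	/*
	 * open_bdev_excl() looks the device up by path, opens it read-write
	 * (flags without MS_RDONLY add FMODE_WRITE) and bd_claim()s it for
	 * @holder; it returns ERR_PTR() on failure.
	 */
	struct block_device *bdev = open_bdev_excl(path, 0, holder);

	if (IS_ERR(bdev))
		return bdev;

	/* set_blocksize() returns -EINVAL for sizes the device can't use. */
	if (set_blocksize(bdev, MY_BLKSIZE)) {
		close_bdev_excl(bdev);	/* bd_release() + blkdev_put() */
		return ERR_PTR(-EINVAL);
	}
	return bdev;
}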