fs/block_dev.c at v2.6.30 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / block_dev.c
at v2.6.30 1556 lines 37 kB view raw
   1/*
   2 *  linux/fs/block_dev.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
   6 */
   7
   8#include <linux/init.h>
   9#include <linux/mm.h>
  10#include <linux/fcntl.h>
  11#include <linux/slab.h>
  12#include <linux/kmod.h>
  13#include <linux/major.h>
  14#include <linux/smp_lock.h>
  15#include <linux/device_cgroup.h>
  16#include <linux/highmem.h>
  17#include <linux/blkdev.h>
  18#include <linux/module.h>
  19#include <linux/blkpg.h>
  20#include <linux/buffer_head.h>
  21#include <linux/pagevec.h>
  22#include <linux/writeback.h>
  23#include <linux/mpage.h>
  24#include <linux/mount.h>
  25#include <linux/uio.h>
  26#include <linux/namei.h>
  27#include <linux/log2.h>
  28#include <asm/uaccess.h>
  29#include "internal.h"
  30
/*
 * Container pairing a block_device with the inode that represents it on
 * the bdev pseudo-filesystem.  BDEV_I() recovers the container from the
 * embedded inode via container_of().
 */
struct bdev_inode {
	struct block_device bdev;	/* the block device itself */
	struct inode vfs_inode;		/* inode handed out by bdev_alloc_inode() */
};
  35
/* Declared early so bdget() can install it as the mapping's a_ops. */
static const struct address_space_operations def_blk_aops;
  37
  38static inline struct bdev_inode *BDEV_I(struct inode *inode)
  39{
  40	return container_of(inode, struct bdev_inode, vfs_inode);
  41}
  42
  43inline struct block_device *I_BDEV(struct inode *inode)
  44{
  45	return &BDEV_I(inode)->bdev;
  46}
  47
  48EXPORT_SYMBOL(I_BDEV);
  49
  50static sector_t max_block(struct block_device *bdev)
  51{
  52	sector_t retval = ~((sector_t)0);
  53	loff_t sz = i_size_read(bdev->bd_inode);
  54
  55	if (sz) {
  56		unsigned int size = block_size(bdev);
  57		unsigned int sizebits = blksize_bits(size);
  58		retval = (sz >> sizebits);
  59	}
  60	return retval;
  61}
  62
  63/* Kill _all_ buffers and pagecache , dirty or not.. */
  64static void kill_bdev(struct block_device *bdev)
  65{
  66	if (bdev->bd_inode->i_mapping->nrpages == 0)
  67		return;
  68	invalidate_bh_lrus();
  69	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
  70}	
  71
  72int set_blocksize(struct block_device *bdev, int size)
  73{
  74	/* Size must be a power of two, and between 512 and PAGE_SIZE */
  75	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
  76		return -EINVAL;
  77
  78	/* Size cannot be smaller than the size supported by the device */
  79	if (size < bdev_hardsect_size(bdev))
  80		return -EINVAL;
  81
  82	/* Don't change the size if it is same as current */
  83	if (bdev->bd_block_size != size) {
  84		sync_blockdev(bdev);
  85		bdev->bd_block_size = size;
  86		bdev->bd_inode->i_blkbits = blksize_bits(size);
  87		kill_bdev(bdev);
  88	}
  89	return 0;
  90}
  91
  92EXPORT_SYMBOL(set_blocksize);
  93
  94int sb_set_blocksize(struct super_block *sb, int size)
  95{
  96	if (set_blocksize(sb->s_bdev, size))
  97		return 0;
  98	/* If we get here, we know size is power of two
  99	 * and it's value is between 512 and PAGE_SIZE */
 100	sb->s_blocksize = size;
 101	sb->s_blocksize_bits = blksize_bits(size);
 102	return sb->s_blocksize;
 103}
 104
 105EXPORT_SYMBOL(sb_set_blocksize);
 106
 107int sb_min_blocksize(struct super_block *sb, int size)
 108{
 109	int minsize = bdev_hardsect_size(sb->s_bdev);
 110	if (size < minsize)
 111		size = minsize;
 112	return sb_set_blocksize(sb, size);
 113}
 114
 115EXPORT_SYMBOL(sb_min_blocksize);
 116
 117static int
 118blkdev_get_block(struct inode *inode, sector_t iblock,
 119		struct buffer_head *bh, int create)
 120{
 121	if (iblock >= max_block(I_BDEV(inode))) {
 122		if (create)
 123			return -EIO;
 124
 125		/*
 126		 * for reads, we're just trying to fill a partial page.
 127		 * return a hole, they will have to call get_block again
 128		 * before they can fill it, and they will get -EIO at that
 129		 * time
 130		 */
 131		return 0;
 132	}
 133	bh->b_bdev = I_BDEV(inode);
 134	bh->b_blocknr = iblock;
 135	set_buffer_mapped(bh);
 136	return 0;
 137}
 138
 139static int
 140blkdev_get_blocks(struct inode *inode, sector_t iblock,
 141		struct buffer_head *bh, int create)
 142{
 143	sector_t end_block = max_block(I_BDEV(inode));
 144	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
 145
 146	if ((iblock + max_blocks) > end_block) {
 147		max_blocks = end_block - iblock;
 148		if ((long)max_blocks <= 0) {
 149			if (create)
 150				return -EIO;	/* write fully beyond EOF */
 151			/*
 152			 * It is a read which is fully beyond EOF.  We return
 153			 * a !buffer_mapped buffer
 154			 */
 155			max_blocks = 0;
 156		}
 157	}
 158
 159	bh->b_bdev = I_BDEV(inode);
 160	bh->b_blocknr = iblock;
 161	bh->b_size = max_blocks << inode->i_blkbits;
 162	if (max_blocks)
 163		set_buffer_mapped(bh);
 164	return 0;
 165}
 166
 167static ssize_t
 168blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 169			loff_t offset, unsigned long nr_segs)
 170{
 171	struct file *file = iocb->ki_filp;
 172	struct inode *inode = file->f_mapping->host;
 173
 174	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
 175				iov, offset, nr_segs, blkdev_get_blocks, NULL);
 176}
 177
 178/*
 179 * Write out and wait upon all the dirty data associated with a block
 180 * device via its mapping.  Does not take the superblock lock.
 181 */
 182int sync_blockdev(struct block_device *bdev)
 183{
 184	int ret = 0;
 185
 186	if (bdev)
 187		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
 188	return ret;
 189}
 190EXPORT_SYMBOL(sync_blockdev);
 191
 192/*
 193 * Write out and wait upon all dirty data associated with this
 194 * device.   Filesystem data as well as the underlying block
 195 * device.  Takes the superblock lock.
 196 */
 197int fsync_bdev(struct block_device *bdev)
 198{
 199	struct super_block *sb = get_super(bdev);
 200	if (sb) {
 201		int res = fsync_super(sb);
 202		drop_super(sb);
 203		return res;
 204	}
 205	return sync_blockdev(bdev);
 206}
 207EXPORT_SYMBOL(fsync_bdev);
 208
 209/**
 210 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 211 * @bdev:	blockdevice to lock
 212 *
 213 * This takes the block device bd_mount_sem to make sure no new mounts
 214 * happen on bdev until thaw_bdev() is called.
 215 * If a superblock is found on this device, we take the s_umount semaphore
 216 * on it to make sure nobody unmounts until the snapshot creation is done.
 217 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 218 * unfreeze process can unfreeze the frozen filesystem actually when multiple
 219 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 220 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
 221 * actually.
 222 */
 223struct super_block *freeze_bdev(struct block_device *bdev)
 224{
 225	struct super_block *sb;
 226	int error = 0;
 227
 228	mutex_lock(&bdev->bd_fsfreeze_mutex);
 229	if (bdev->bd_fsfreeze_count > 0) {
 230		bdev->bd_fsfreeze_count++;
 231		sb = get_super(bdev);
 232		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 233		return sb;
 234	}
 235	bdev->bd_fsfreeze_count++;
 236
 237	down(&bdev->bd_mount_sem);
 238	sb = get_super(bdev);
 239	if (sb && !(sb->s_flags & MS_RDONLY)) {
 240		sb->s_frozen = SB_FREEZE_WRITE;
 241		smp_wmb();
 242
 243		__fsync_super(sb);
 244
 245		sb->s_frozen = SB_FREEZE_TRANS;
 246		smp_wmb();
 247
 248		sync_blockdev(sb->s_bdev);
 249
 250		if (sb->s_op->freeze_fs) {
 251			error = sb->s_op->freeze_fs(sb);
 252			if (error) {
 253				printk(KERN_ERR
 254					"VFS:Filesystem freeze failed\n");
 255				sb->s_frozen = SB_UNFROZEN;
 256				drop_super(sb);
 257				up(&bdev->bd_mount_sem);
 258				bdev->bd_fsfreeze_count--;
 259				mutex_unlock(&bdev->bd_fsfreeze_mutex);
 260				return ERR_PTR(error);
 261			}
 262		}
 263	}
 264
 265	sync_blockdev(bdev);
 266	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 267
 268	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
 269}
 270EXPORT_SYMBOL(freeze_bdev);
 271
/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 *
 * Returns -EINVAL if @bdev is not frozen (freeze count is zero), the error
 * from the filesystem's ->unfreeze_fs() method if that fails, and 0
 * otherwise.
 */
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return -EINVAL;
	}

	bdev->bd_fsfreeze_count--;
	if (bdev->bd_fsfreeze_count > 0) {
		/*
		 * Other freezers remain: only give back the superblock
		 * reference this caller got from freeze_bdev().
		 */
		if (sb)
			drop_super(sb);
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return 0;
	}

	/* Last thaw: actually unfreeze. */
	if (sb) {
		BUG_ON(sb->s_bdev != bdev);
		if (!(sb->s_flags & MS_RDONLY)) {
			if (sb->s_op->unfreeze_fs) {
				error = sb->s_op->unfreeze_fs(sb);
				if (error) {
					printk(KERN_ERR
						"VFS:Filesystem thaw failed\n");
					/*
					 * Stay frozen: restore the count and
					 * return with the superblock and
					 * bd_mount_sem still held.
					 */
					sb->s_frozen = SB_FREEZE_TRANS;
					bdev->bd_fsfreeze_count++;
					mutex_unlock(&bdev->bd_fsfreeze_mutex);
					return error;
				}
			}
			sb->s_frozen = SB_UNFROZEN;
			smp_wmb();
			wake_up(&sb->s_wait_unfrozen);
		}
		drop_super(sb);
	}

	/* Pairs with the down() in freeze_bdev(): mounts may proceed again. */
	up(&bdev->bd_mount_sem);
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return 0;
}
EXPORT_SYMBOL(thaw_bdev);
 323
 324static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 325{
 326	return block_write_full_page(page, blkdev_get_block, wbc);
 327}
 328
 329static int blkdev_readpage(struct file * file, struct page * page)
 330{
 331	return block_read_full_page(page, blkdev_get_block);
 332}
 333
 334static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 335			loff_t pos, unsigned len, unsigned flags,
 336			struct page **pagep, void **fsdata)
 337{
 338	*pagep = NULL;
 339	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 340				blkdev_get_block);
 341}
 342
 343static int blkdev_write_end(struct file *file, struct address_space *mapping,
 344			loff_t pos, unsigned len, unsigned copied,
 345			struct page *page, void *fsdata)
 346{
 347	int ret;
 348	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 349
 350	unlock_page(page);
 351	page_cache_release(page);
 352
 353	return ret;
 354}
 355
 356/*
 357 * private llseek:
 358 * for a block special file file->f_path.dentry->d_inode->i_size is zero
 359 * so we compute the size by hand (just as in block_read/write above)
 360 */
 361static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 362{
 363	struct inode *bd_inode = file->f_mapping->host;
 364	loff_t size;
 365	loff_t retval;
 366
 367	mutex_lock(&bd_inode->i_mutex);
 368	size = i_size_read(bd_inode);
 369
 370	switch (origin) {
 371		case 2:
 372			offset += size;
 373			break;
 374		case 1:
 375			offset += file->f_pos;
 376	}
 377	retval = -EINVAL;
 378	if (offset >= 0 && offset <= size) {
 379		if (offset != file->f_pos) {
 380			file->f_pos = offset;
 381		}
 382		retval = offset;
 383	}
 384	mutex_unlock(&bd_inode->i_mutex);
 385	return retval;
 386}
 387	
 388/*
 389 *	Filp is never NULL; the only case when ->fsync() is called with
 390 *	NULL first argument is nfsd_sync_dir() and that's not a directory.
 391 */
 392 
 393static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 394{
 395	return sync_blockdev(I_BDEV(filp->f_mapping->host));
 396}
 397
 398/*
 399 * pseudo-fs
 400 */
 401
/*
 * bdev_lock is taken in this file around: the all_bdevs list, each bdev's
 * bd_inodes list together with inode->i_bdev/i_mapping, and the
 * bd_holder/bd_holders claim state.
 */
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
/* Slab cache of struct bdev_inode, created in bdev_cache_init(). */
static struct kmem_cache * bdev_cachep __read_mostly;
 404
 405static struct inode *bdev_alloc_inode(struct super_block *sb)
 406{
 407	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 408	if (!ei)
 409		return NULL;
 410	return &ei->vfs_inode;
 411}
 412
 413static void bdev_destroy_inode(struct inode *inode)
 414{
 415	struct bdev_inode *bdi = BDEV_I(inode);
 416
 417	bdi->bdev.bd_inode_backing_dev_info = NULL;
 418	kmem_cache_free(bdev_cachep, bdi);
 419}
 420
 421static void init_once(void *foo)
 422{
 423	struct bdev_inode *ei = (struct bdev_inode *) foo;
 424	struct block_device *bdev = &ei->bdev;
 425
 426	memset(bdev, 0, sizeof(*bdev));
 427	mutex_init(&bdev->bd_mutex);
 428	sema_init(&bdev->bd_mount_sem, 1);
 429	INIT_LIST_HEAD(&bdev->bd_inodes);
 430	INIT_LIST_HEAD(&bdev->bd_list);
 431#ifdef CONFIG_SYSFS
 432	INIT_LIST_HEAD(&bdev->bd_holder_list);
 433#endif
 434	inode_init_once(&ei->vfs_inode);
 435	/* Initialize mutex for freeze. */
 436	mutex_init(&bdev->bd_fsfreeze_mutex);
 437}
 438
/*
 * Detach @inode from its block device: unlink it from the bdev's
 * bd_inodes list, clear i_bdev and point i_mapping back at the inode's
 * own i_data.  Both callers in this file hold bdev_lock.
 */
static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}
 445
 446static void bdev_clear_inode(struct inode *inode)
 447{
 448	struct block_device *bdev = &BDEV_I(inode)->bdev;
 449	struct list_head *p;
 450	spin_lock(&bdev_lock);
 451	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
 452		__bd_forget(list_entry(p, struct inode, i_devices));
 453	}
 454	list_del_init(&bdev->bd_list);
 455	spin_unlock(&bdev_lock);
 456}
 457
/* Superblock operations of the "bdev" pseudo-filesystem (see bd_get_sb()). */
static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,	/* inode lives inside a bdev_inode */
	.destroy_inode = bdev_destroy_inode,
	.drop_inode = generic_delete_inode,
	.clear_inode = bdev_clear_inode,	/* detach aliases, leave all_bdevs */
};
 465
 466static int bd_get_sb(struct file_system_type *fs_type,
 467	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 468{
 469	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
 470}
 471
/*
 * Filesystem type of the "bdev" pseudo-fs; registered and mounted in
 * bdev_cache_init().
 */
static struct file_system_type bd_type = {
	.name		= "bdev",
	.get_sb		= bd_get_sb,
	.kill_sb	= kill_anon_super,
};
 477
/* Superblock of the kernel-internal bdev mount; set in bdev_cache_init(). */
struct super_block *blockdev_superblock __read_mostly;
 479
 480void __init bdev_cache_init(void)
 481{
 482	int err;
 483	struct vfsmount *bd_mnt;
 484
 485	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 486			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 487				SLAB_MEM_SPREAD|SLAB_PANIC),
 488			init_once);
 489	err = register_filesystem(&bd_type);
 490	if (err)
 491		panic("Cannot register bdev pseudo-fs");
 492	bd_mnt = kern_mount(&bd_type);
 493	if (IS_ERR(bd_mnt))
 494		panic("Cannot create bdev pseudo-fs");
 495	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
 496}
 497
 498/*
 499 * Most likely _very_ bad one - but then it's hardly critical for small
 500 * /dev and can be fixed when somebody will need really large one.
 501 * Keep in mind that it will be fed through icache hash function too.
 502 */
 503static inline unsigned long hash(dev_t dev)
 504{
 505	return MAJOR(dev)+MINOR(dev);
 506}
 507
 508static int bdev_test(struct inode *inode, void *data)
 509{
 510	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
 511}
 512
 513static int bdev_set(struct inode *inode, void *data)
 514{
 515	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
 516	return 0;
 517}
 518
/* Every block_device set up by bdget(), linked via bd_list under bdev_lock. */
static LIST_HEAD(all_bdevs);
 520
 521struct block_device *bdget(dev_t dev)
 522{
 523	struct block_device *bdev;
 524	struct inode *inode;
 525
 526	inode = iget5_locked(blockdev_superblock, hash(dev),
 527			bdev_test, bdev_set, &dev);
 528
 529	if (!inode)
 530		return NULL;
 531
 532	bdev = &BDEV_I(inode)->bdev;
 533
 534	if (inode->i_state & I_NEW) {
 535		bdev->bd_contains = NULL;
 536		bdev->bd_inode = inode;
 537		bdev->bd_block_size = (1 << inode->i_blkbits);
 538		bdev->bd_part_count = 0;
 539		bdev->bd_invalidated = 0;
 540		inode->i_mode = S_IFBLK;
 541		inode->i_rdev = dev;
 542		inode->i_bdev = bdev;
 543		inode->i_data.a_ops = &def_blk_aops;
 544		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
 545		inode->i_data.backing_dev_info = &default_backing_dev_info;
 546		spin_lock(&bdev_lock);
 547		list_add(&bdev->bd_list, &all_bdevs);
 548		spin_unlock(&bdev_lock);
 549		unlock_new_inode(inode);
 550	}
 551	return bdev;
 552}
 553
 554EXPORT_SYMBOL(bdget);
 555
 556long nr_blockdev_pages(void)
 557{
 558	struct block_device *bdev;
 559	long ret = 0;
 560	spin_lock(&bdev_lock);
 561	list_for_each_entry(bdev, &all_bdevs, bd_list) {
 562		ret += bdev->bd_inode->i_mapping->nrpages;
 563	}
 564	spin_unlock(&bdev_lock);
 565	return ret;
 566}
 567
 568void bdput(struct block_device *bdev)
 569{
 570	iput(bdev->bd_inode);
 571}
 572
 573EXPORT_SYMBOL(bdput);
 574 
/*
 * Get a referenced block_device for the device node @inode.
 *
 * Fast path: the inode already caches its bdev in i_bdev; take one more
 * reference on the bdev inode and return it.  Slow path: look the device
 * up with bdget() and, unless another task attached one in the meantime,
 * attach it to @inode.
 *
 * Returns the block_device, or NULL if bdget() failed.
 */
static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;
	if (bdev) {
		atomic_inc(&bdev->bd_inode->i_count);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	/* bdev_lock is dropped across the lookup. */
	bdev = bdget(inode->i_rdev);
	if (bdev) {
		spin_lock(&bdev_lock);
		/* Re-check: i_bdev may have been set while the lock was dropped. */
		if (!inode->i_bdev) {
			/*
			 * We take an additional bd_inode->i_count for inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			atomic_inc(&bdev->bd_inode->i_count);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}
 607
/*
 * Call when you free inode.
 *
 * Detaches @inode from its block device, if it has one.  For inodes that
 * do not belong to the bdev pseudo-fs itself, a reference on the bdev
 * inode is dropped as well; the iput() is done after bdev_lock has been
 * released.
 */

void bd_forget(struct inode *inode)
{
	struct block_device *bdev = NULL;

	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		if (!sb_is_blkdev_sb(inode->i_sb))
			bdev = inode->i_bdev;
		__bd_forget(inode);
	}
	spin_unlock(&bdev_lock);

	if (bdev)
		iput(bdev->bd_inode);
}
 625
/*
 * Claim @bdev for @holder.
 *
 * @holder is an opaque token compared by pointer value.  The decision and
 * the update are both made under bdev_lock.
 *
 * Returns 0 if the claim was granted (including the case where @holder
 * already holds @bdev), or -EBUSY if @bdev, or the whole device containing
 * it, is held by someone else.  Each successful call must be paired with
 * bd_release().
 */
int bd_claim(struct block_device *bdev, void *holder)
{
	int res;
	spin_lock(&bdev_lock);

	/* first decide result */
	if (bdev->bd_holder == holder)
		res = 0;	 /* already a holder */
	else if (bdev->bd_holder != NULL)
		res = -EBUSY; 	 /* held by someone else */
	else if (bdev->bd_contains == bdev)
		res = 0;  	 /* is a whole device which isn't held */

	/* the address of bd_claim itself marks "claimed via a partition" */
	else if (bdev->bd_contains->bd_holder == bd_claim)
		res = 0; 	 /* is a partition of a device that is being partitioned */
	else if (bdev->bd_contains->bd_holder != NULL)
		res = -EBUSY;	 /* is a partition of a held device */
	else
		res = 0;	 /* is a partition of an un-held device */

	/* now impose change */
	if (res==0) {
		/* note that for a whole device bd_holders
		 * will be incremented twice, and bd_holder will
		 * be set to bd_claim before being set to holder
		 */
		bdev->bd_contains->bd_holders ++;
		bdev->bd_contains->bd_holder = bd_claim;
		bdev->bd_holders++;
		bdev->bd_holder = holder;
	}
	spin_unlock(&bdev_lock);
	return res;
}

EXPORT_SYMBOL(bd_claim);
 662
 663void bd_release(struct block_device *bdev)
 664{
 665	spin_lock(&bdev_lock);
 666	if (!--bdev->bd_contains->bd_holders)
 667		bdev->bd_contains->bd_holder = NULL;
 668	if (!--bdev->bd_holders)
 669		bdev->bd_holder = NULL;
 670	spin_unlock(&bdev_lock);
 671}
 672
 673EXPORT_SYMBOL(bd_release);
 674
 675#ifdef CONFIG_SYSFS
 676/*
 677 * Functions for bd_claim_by_kobject / bd_release_from_kobject
 678 *
 679 *     If a kobject is passed to bd_claim_by_kobject()
 680 *     and the kobject has a parent directory,
 681 *     following symlinks are created:
 682 *        o from the kobject to the claimed bdev
 683 *        o from "holders" directory of the bdev to the parent of the kobject
 684 *     bd_release_from_kobject() removes these symlinks.
 685 *
 686 *     Example:
 687 *        If /dev/dm-0 maps to /dev/sda, kobject corresponding to
 688 *        /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
 689 *           /sys/block/dm-0/slaves/sda --> /sys/block/sda
 690 *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
 691 */
 692
 693static int add_symlink(struct kobject *from, struct kobject *to)
 694{
 695	if (!from || !to)
 696		return 0;
 697	return sysfs_create_link(from, to, kobject_name(to));
 698}
 699
 700static void del_symlink(struct kobject *from, struct kobject *to)
 701{
 702	if (!from || !to)
 703		return;
 704	sysfs_remove_link(from, kobject_name(to));
 705}
 706
/*
 * 'struct bd_holder' contains pointers to kobjects symlinked by
 * bd_claim_by_kobject.
 * It's connected to bd_holder_list, which bd_claim_by_kobject() and
 * bd_release_from_kobject() modify with bdev->bd_mutex held.
 */
struct bd_holder {
	struct list_head list;	/* chain of holders of the bdev */
	int count;		/* references from the holder */
	struct kobject *sdir;	/* holder object, e.g. "/block/dm-0/slaves" */
	struct kobject *hdev;	/* e.g. "/block/dm-0" */
	struct kobject *hdir;	/* e.g. "/block/sda/holders" */
	struct kobject *sdev;	/* e.g. "/block/sda" */
};
 720
 721/*
 722 * Get references of related kobjects at once.
 723 * Returns 1 on success. 0 on failure.
 724 *
 725 * Should call bd_holder_release_dirs() after successful use.
 726 */
 727static int bd_holder_grab_dirs(struct block_device *bdev,
 728			struct bd_holder *bo)
 729{
 730	if (!bdev || !bo)
 731		return 0;
 732
 733	bo->sdir = kobject_get(bo->sdir);
 734	if (!bo->sdir)
 735		return 0;
 736
 737	bo->hdev = kobject_get(bo->sdir->parent);
 738	if (!bo->hdev)
 739		goto fail_put_sdir;
 740
 741	bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
 742	if (!bo->sdev)
 743		goto fail_put_hdev;
 744
 745	bo->hdir = kobject_get(bdev->bd_part->holder_dir);
 746	if (!bo->hdir)
 747		goto fail_put_sdev;
 748
 749	return 1;
 750
 751fail_put_sdev:
 752	kobject_put(bo->sdev);
 753fail_put_hdev:
 754	kobject_put(bo->hdev);
 755fail_put_sdir:
 756	kobject_put(bo->sdir);
 757
 758	return 0;
 759}
 760
 761/* Put references of related kobjects at once. */
 762static void bd_holder_release_dirs(struct bd_holder *bo)
 763{
 764	kobject_put(bo->hdir);
 765	kobject_put(bo->sdev);
 766	kobject_put(bo->hdev);
 767	kobject_put(bo->sdir);
 768}
 769
 770static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
 771{
 772	struct bd_holder *bo;
 773
 774	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
 775	if (!bo)
 776		return NULL;
 777
 778	bo->count = 1;
 779	bo->sdir = kobj;
 780
 781	return bo;
 782}
 783
/*
 * Free a bd_holder.  Only the structure itself is released; kobject
 * references are not touched here.  Callers in this file may pass NULL.
 */
static void free_bd_holder(struct bd_holder *bo)
{
	kfree(bo);
}
 788
 789/**
 790 * find_bd_holder - find matching struct bd_holder from the block device
 791 *
 792 * @bdev:	struct block device to be searched
 793 * @bo:		target struct bd_holder
 794 *
 795 * Returns matching entry with @bo in @bdev->bd_holder_list.
 796 * If found, increment the reference count and return the pointer.
 797 * If not found, returns NULL.
 798 */
 799static struct bd_holder *find_bd_holder(struct block_device *bdev,
 800					struct bd_holder *bo)
 801{
 802	struct bd_holder *tmp;
 803
 804	list_for_each_entry(tmp, &bdev->bd_holder_list, list)
 805		if (tmp->sdir == bo->sdir) {
 806			tmp->count++;
 807			return tmp;
 808		}
 809
 810	return NULL;
 811}
 812
 813/**
 814 * add_bd_holder - create sysfs symlinks for bd_claim() relationship
 815 *
 816 * @bdev:	block device to be bd_claimed
 817 * @bo:		preallocated and initialized by alloc_bd_holder()
 818 *
 819 * Add @bo to @bdev->bd_holder_list, create symlinks.
 820 *
 821 * Returns 0 if symlinks are created.
 822 * Returns -ve if something fails.
 823 */
 824static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
 825{
 826	int err;
 827
 828	if (!bo)
 829		return -EINVAL;
 830
 831	if (!bd_holder_grab_dirs(bdev, bo))
 832		return -EBUSY;
 833
 834	err = add_symlink(bo->sdir, bo->sdev);
 835	if (err)
 836		return err;
 837
 838	err = add_symlink(bo->hdir, bo->hdev);
 839	if (err) {
 840		del_symlink(bo->sdir, bo->sdev);
 841		return err;
 842	}
 843
 844	list_add_tail(&bo->list, &bdev->bd_holder_list);
 845	return 0;
 846}
 847
 848/**
 849 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
 850 *
 851 * @bdev:	block device to be bd_claimed
 852 * @kobj:	holder's kobject
 853 *
 854 * If there is matching entry with @kobj in @bdev->bd_holder_list
 855 * and no other bd_claim() from the same kobject,
 856 * remove the struct bd_holder from the list, delete symlinks for it.
 857 *
 858 * Returns a pointer to the struct bd_holder when it's removed from the list
 859 * and ready to be freed.
 860 * Returns NULL if matching claim isn't found or there is other bd_claim()
 861 * by the same kobject.
 862 */
 863static struct bd_holder *del_bd_holder(struct block_device *bdev,
 864					struct kobject *kobj)
 865{
 866	struct bd_holder *bo;
 867
 868	list_for_each_entry(bo, &bdev->bd_holder_list, list) {
 869		if (bo->sdir == kobj) {
 870			bo->count--;
 871			BUG_ON(bo->count < 0);
 872			if (!bo->count) {
 873				list_del(&bo->list);
 874				del_symlink(bo->sdir, bo->sdev);
 875				del_symlink(bo->hdir, bo->hdev);
 876				bd_holder_release_dirs(bo);
 877				return bo;
 878			}
 879			break;
 880		}
 881	}
 882
 883	return NULL;
 884}
 885
/**
 * bd_claim_by_kobject - bd_claim() with additional kobject signature
 *
 * @bdev:	block device to be claimed
 * @holder:	holder's signature
 * @kobj:	holder's kobject
 *
 * Do bd_claim() and if it succeeds, create sysfs symlinks between
 * the bdev and the holder's kobject.
 * Use bd_release_from_kobject() when releasing the claimed bdev.
 *
 * Returns 0 on success. (same as bd_claim())
 * Returns errno on failure.
 */
static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
				struct kobject *kobj)
{
	int err;
	struct bd_holder *bo, *found;

	if (!kobj)
		return -EINVAL;

	/* Allocate up front, before bd_mutex is taken. */
	bo = alloc_bd_holder(kobj);
	if (!bo)
		return -ENOMEM;

	mutex_lock(&bdev->bd_mutex);

	err = bd_claim(bdev, holder);
	if (err)
		goto fail;

	/*
	 * An entry for this kobject already exists: find_bd_holder() has
	 * bumped its count, so the new bo is not needed.  err is 0 here,
	 * so despite the label name this path returns success.
	 */
	found = find_bd_holder(bdev, bo);
	if (found)
		goto fail;

	err = add_bd_holder(bdev, bo);
	if (err)
		bd_release(bdev);
	else
		bo = NULL;	/* now owned by bd_holder_list; don't free below */
fail:
	mutex_unlock(&bdev->bd_mutex);
	free_bd_holder(bo);
	return err;
}
 933
 934/**
 935 * bd_release_from_kobject - bd_release() with additional kobject signature
 936 *
 937 * @bdev:	block device to be released
 938 * @kobj:	holder's kobject
 939 *
 940 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
 941 */
 942static void bd_release_from_kobject(struct block_device *bdev,
 943					struct kobject *kobj)
 944{
 945	if (!kobj)
 946		return;
 947
 948	mutex_lock(&bdev->bd_mutex);
 949	bd_release(bdev);
 950	free_bd_holder(del_bd_holder(bdev, kobj));
 951	mutex_unlock(&bdev->bd_mutex);
 952}
 953
 954/**
 955 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
 956 *
 957 * @bdev:	block device to be claimed
 958 * @holder:	holder's signature
 959 * @disk:	holder's gendisk
 960 *
 961 * Call bd_claim_by_kobject() with getting @disk->slave_dir.
 962 */
 963int bd_claim_by_disk(struct block_device *bdev, void *holder,
 964			struct gendisk *disk)
 965{
 966	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
 967}
 968EXPORT_SYMBOL_GPL(bd_claim_by_disk);
 969
 970/**
 971 * bd_release_from_disk - wrapper function for bd_release_from_kobject()
 972 *
 973 * @bdev:	block device to be claimed
 974 * @disk:	holder's gendisk
 975 *
 976 * Call bd_release_from_kobject() and put @disk->slave_dir.
 977 */
 978void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
 979{
 980	bd_release_from_kobject(bdev, disk->slave_dir);
 981	kobject_put(disk->slave_dir);
 982}
 983EXPORT_SYMBOL_GPL(bd_release_from_disk);
 984#endif
 985
 986/*
 987 * Tries to open block device by device number.  Use it ONLY if you
 988 * really do not have anything better - i.e. when you are behind a
 989 * truly sucky interface and all you are given is a device number.  _Never_
 990 * to be used for internal purposes.  If you ever need it - reconsider
 991 * your API.
 992 */
 993struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
 994{
 995	struct block_device *bdev = bdget(dev);
 996	int err = -ENOMEM;
 997	if (bdev)
 998		err = blkdev_get(bdev, mode);
 999	return err ? ERR_PTR(err) : bdev;
1000}
1001
1002EXPORT_SYMBOL(open_by_devnum);
1003
1004/**
1005 * flush_disk - invalidates all buffer-cache entries on a disk
1006 *
1007 * @bdev:      struct block device to be flushed
1008 *
1009 * Invalidates all buffer-cache entries on a disk. It should be called
1010 * when a disk has been changed -- either by a media change or online
1011 * resize.
1012 */
1013static void flush_disk(struct block_device *bdev)
1014{
1015	if (__invalidate_device(bdev)) {
1016		char name[BDEVNAME_SIZE] = "";
1017
1018		if (bdev->bd_disk)
1019			disk_name(bdev->bd_disk, 0, name);
1020		printk(KERN_WARNING "VFS: busy inodes on changed media or "
1021		       "resized disk %s\n", name);
1022	}
1023
1024	if (!bdev->bd_disk)
1025		return;
1026	if (disk_partitionable(bdev->bd_disk))
1027		bdev->bd_invalidated = 1;
1028}
1029
1030/**
1031 * check_disk_size_change - checks for disk size change and adjusts bdev size.
1032 * @disk: struct gendisk to check
1033 * @bdev: struct bdev to adjust.
1034 *
1035 * This routine checks to see if the bdev size does not match the disk size
1036 * and adjusts it if it differs.
1037 */
1038void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
1039{
1040	loff_t disk_size, bdev_size;
1041
1042	disk_size = (loff_t)get_capacity(disk) << 9;
1043	bdev_size = i_size_read(bdev->bd_inode);
1044	if (disk_size != bdev_size) {
1045		char name[BDEVNAME_SIZE];
1046
1047		disk_name(disk, 0, name);
1048		printk(KERN_INFO
1049		       "%s: detected capacity change from %lld to %lld\n",
1050		       name, bdev_size, disk_size);
1051		i_size_write(bdev->bd_inode, disk_size);
1052		flush_disk(bdev);
1053	}
1054}
1055EXPORT_SYMBOL(check_disk_size_change);
1056
1057/**
1058 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
1059 * @disk: struct gendisk to be revalidated
1060 *
1061 * This routine is a wrapper for lower-level driver's revalidate_disk
1062 * call-backs.  It is used to do common pre and post operations needed
1063 * for all revalidate_disk operations.
1064 */
1065int revalidate_disk(struct gendisk *disk)
1066{
1067	struct block_device *bdev;
1068	int ret = 0;
1069
1070	if (disk->fops->revalidate_disk)
1071		ret = disk->fops->revalidate_disk(disk);
1072
1073	bdev = bdget_disk(disk, 0);
1074	if (!bdev)
1075		return ret;
1076
1077	mutex_lock(&bdev->bd_mutex);
1078	check_disk_size_change(disk, bdev);
1079	mutex_unlock(&bdev->bd_mutex);
1080	bdput(bdev);
1081	return ret;
1082}
1083EXPORT_SYMBOL(revalidate_disk);
1084
1085/*
1086 * This routine checks whether a removable media has been changed,
1087 * and invalidates all buffer-cache-entries in that case. This
1088 * is a relatively slow routine, so we have to try to minimize using
1089 * it. Thus it is called only upon a 'mount' or 'open'. This
1090 * is the best way of combining speed and utility, I think.
1091 * People changing diskettes in the middle of an operation deserve
1092 * to lose :-)
1093 */
1094int check_disk_change(struct block_device *bdev)
1095{
1096	struct gendisk *disk = bdev->bd_disk;
1097	struct block_device_operations * bdops = disk->fops;
1098
1099	if (!bdops->media_changed)
1100		return 0;
1101	if (!bdops->media_changed(bdev->bd_disk))
1102		return 0;
1103
1104	flush_disk(bdev);
1105	if (bdops->revalidate_disk)
1106		bdops->revalidate_disk(bdev->bd_disk);
1107	return 1;
1108}
1109
1110EXPORT_SYMBOL(check_disk_change);
1111
1112void bd_set_size(struct block_device *bdev, loff_t size)
1113{
1114	unsigned bsize = bdev_hardsect_size(bdev);
1115
1116	bdev->bd_inode->i_size = size;
1117	while (bsize < PAGE_CACHE_SIZE) {
1118		if (size & bsize)
1119			break;
1120		bsize <<= 1;
1121	}
1122	bdev->bd_block_size = bsize;
1123	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
1124}
1125EXPORT_SYMBOL(bd_set_size);
1126
1127static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
1128
1129/*
1130 * bd_mutex locking:
1131 *
1132 *  mutex_lock(part->bd_mutex)
1133 *    mutex_lock_nested(whole->bd_mutex, 1)
1134 */
1135
1136static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1137{
1138	struct gendisk *disk;
1139	int ret;
1140	int partno;
1141	int perm = 0;
1142
1143	if (mode & FMODE_READ)
1144		perm |= MAY_READ;
1145	if (mode & FMODE_WRITE)
1146		perm |= MAY_WRITE;
1147	/*
1148	 * hooks: /n/, see "layering violations".
1149	 */
1150	ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1151	if (ret != 0) {
1152		bdput(bdev);
1153		return ret;
1154	}
1155
1156	lock_kernel();
1157 restart:
1158
1159	ret = -ENXIO;
1160	disk = get_gendisk(bdev->bd_dev, &partno);
1161	if (!disk)
1162		goto out_unlock_kernel;
1163
1164	mutex_lock_nested(&bdev->bd_mutex, for_part);
1165	if (!bdev->bd_openers) {
1166		bdev->bd_disk = disk;
1167		bdev->bd_contains = bdev;
1168		if (!partno) {
1169			struct backing_dev_info *bdi;
1170
1171			ret = -ENXIO;
1172			bdev->bd_part = disk_get_part(disk, partno);
1173			if (!bdev->bd_part)
1174				goto out_clear;
1175
1176			if (disk->fops->open) {
1177				ret = disk->fops->open(bdev, mode);
1178				if (ret == -ERESTARTSYS) {
1179					/* Lost a race with 'disk' being
1180					 * deleted, try again.
1181					 * See md.c
1182					 */
1183					disk_put_part(bdev->bd_part);
1184					bdev->bd_part = NULL;
1185					module_put(disk->fops->owner);
1186					put_disk(disk);
1187					bdev->bd_disk = NULL;
1188					mutex_unlock(&bdev->bd_mutex);
1189					goto restart;
1190				}
1191				if (ret)
1192					goto out_clear;
1193			}
1194			if (!bdev->bd_openers) {
1195				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1196				bdi = blk_get_backing_dev_info(bdev);
1197				if (bdi == NULL)
1198					bdi = &default_backing_dev_info;
1199				bdev->bd_inode->i_data.backing_dev_info = bdi;
1200			}
1201			if (bdev->bd_invalidated)
1202				rescan_partitions(disk, bdev);
1203		} else {
1204			struct block_device *whole;
1205			whole = bdget_disk(disk, 0);
1206			ret = -ENOMEM;
1207			if (!whole)
1208				goto out_clear;
1209			BUG_ON(for_part);
1210			ret = __blkdev_get(whole, mode, 1);
1211			if (ret)
1212				goto out_clear;
1213			bdev->bd_contains = whole;
1214			bdev->bd_inode->i_data.backing_dev_info =
1215			   whole->bd_inode->i_data.backing_dev_info;
1216			bdev->bd_part = disk_get_part(disk, partno);
1217			if (!(disk->flags & GENHD_FL_UP) ||
1218			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
1219				ret = -ENXIO;
1220				goto out_clear;
1221			}
1222			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1223		}
1224	} else {
1225		put_disk(disk);
1226		module_put(disk->fops->owner);
1227		disk = NULL;
1228		if (bdev->bd_contains == bdev) {
1229			if (bdev->bd_disk->fops->open) {
1230				ret = bdev->bd_disk->fops->open(bdev, mode);
1231				if (ret)
1232					goto out_unlock_bdev;
1233			}
1234			if (bdev->bd_invalidated)
1235				rescan_partitions(bdev->bd_disk, bdev);
1236		}
1237	}
1238	bdev->bd_openers++;
1239	if (for_part)
1240		bdev->bd_part_count++;
1241	mutex_unlock(&bdev->bd_mutex);
1242	unlock_kernel();
1243	return 0;
1244
1245 out_clear:
1246	disk_put_part(bdev->bd_part);
1247	bdev->bd_disk = NULL;
1248	bdev->bd_part = NULL;
1249	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1250	if (bdev != bdev->bd_contains)
1251		__blkdev_put(bdev->bd_contains, mode, 1);
1252	bdev->bd_contains = NULL;
1253 out_unlock_bdev:
1254	mutex_unlock(&bdev->bd_mutex);
1255 out_unlock_kernel:
1256	unlock_kernel();
1257
1258	if (disk)
1259		module_put(disk->fops->owner);
1260	put_disk(disk);
1261	bdput(bdev);
1262
1263	return ret;
1264}
1265
1266int blkdev_get(struct block_device *bdev, fmode_t mode)
1267{
1268	return __blkdev_get(bdev, mode, 0);
1269}
1270EXPORT_SYMBOL(blkdev_get);
1271
1272static int blkdev_open(struct inode * inode, struct file * filp)
1273{
1274	struct block_device *bdev;
1275	int res;
1276
1277	/*
1278	 * Preserve backwards compatibility and allow large file access
1279	 * even if userspace doesn't ask for it explicitly. Some mkfs
1280	 * binary needs it. We might want to drop this workaround
1281	 * during an unstable branch.
1282	 */
1283	filp->f_flags |= O_LARGEFILE;
1284
1285	if (filp->f_flags & O_NDELAY)
1286		filp->f_mode |= FMODE_NDELAY;
1287	if (filp->f_flags & O_EXCL)
1288		filp->f_mode |= FMODE_EXCL;
1289	if ((filp->f_flags & O_ACCMODE) == 3)
1290		filp->f_mode |= FMODE_WRITE_IOCTL;
1291
1292	bdev = bd_acquire(inode);
1293	if (bdev == NULL)
1294		return -ENOMEM;
1295
1296	filp->f_mapping = bdev->bd_inode->i_mapping;
1297
1298	res = blkdev_get(bdev, filp->f_mode);
1299	if (res)
1300		return res;
1301
1302	if (filp->f_mode & FMODE_EXCL) {
1303		res = bd_claim(bdev, filp);
1304		if (res)
1305			goto out_blkdev_put;
1306	}
1307
1308	return 0;
1309
1310 out_blkdev_put:
1311	blkdev_put(bdev, filp->f_mode);
1312	return res;
1313}
1314
1315static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1316{
1317	int ret = 0;
1318	struct gendisk *disk = bdev->bd_disk;
1319	struct block_device *victim = NULL;
1320
1321	mutex_lock_nested(&bdev->bd_mutex, for_part);
1322	lock_kernel();
1323	if (for_part)
1324		bdev->bd_part_count--;
1325
1326	if (!--bdev->bd_openers) {
1327		sync_blockdev(bdev);
1328		kill_bdev(bdev);
1329	}
1330	if (bdev->bd_contains == bdev) {
1331		if (disk->fops->release)
1332			ret = disk->fops->release(disk, mode);
1333	}
1334	if (!bdev->bd_openers) {
1335		struct module *owner = disk->fops->owner;
1336
1337		put_disk(disk);
1338		module_put(owner);
1339		disk_put_part(bdev->bd_part);
1340		bdev->bd_part = NULL;
1341		bdev->bd_disk = NULL;
1342		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1343		if (bdev != bdev->bd_contains)
1344			victim = bdev->bd_contains;
1345		bdev->bd_contains = NULL;
1346	}
1347	unlock_kernel();
1348	mutex_unlock(&bdev->bd_mutex);
1349	bdput(bdev);
1350	if (victim)
1351		__blkdev_put(victim, mode, 1);
1352	return ret;
1353}
1354
1355int blkdev_put(struct block_device *bdev, fmode_t mode)
1356{
1357	return __blkdev_put(bdev, mode, 0);
1358}
1359EXPORT_SYMBOL(blkdev_put);
1360
1361static int blkdev_close(struct inode * inode, struct file * filp)
1362{
1363	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1364	if (bdev->bd_holder == filp)
1365		bd_release(bdev);
1366	return blkdev_put(bdev, filp->f_mode);
1367}
1368
1369static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1370{
1371	struct block_device *bdev = I_BDEV(file->f_mapping->host);
1372	fmode_t mode = file->f_mode;
1373
1374	/*
1375	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
1376	 * to updated it before every ioctl.
1377	 */
1378	if (file->f_flags & O_NDELAY)
1379		mode |= FMODE_NDELAY;
1380	else
1381		mode &= ~FMODE_NDELAY;
1382
1383	return blkdev_ioctl(bdev, mode, cmd, arg);
1384}
1385
1386/*
1387 * Try to release a page associated with block device when the system
1388 * is under memory pressure.
1389 */
1390static int blkdev_releasepage(struct page *page, gfp_t wait)
1391{
1392	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1393
1394	if (super && super->s_op->bdev_try_to_free_page)
1395		return super->s_op->bdev_try_to_free_page(super, page, wait);
1396
1397	return try_to_free_buffers(page);
1398}
1399
1400static const struct address_space_operations def_blk_aops = {
1401	.readpage	= blkdev_readpage,
1402	.writepage	= blkdev_writepage,
1403	.sync_page	= block_sync_page,
1404	.write_begin	= blkdev_write_begin,
1405	.write_end	= blkdev_write_end,
1406	.writepages	= generic_writepages,
1407	.releasepage	= blkdev_releasepage,
1408	.direct_IO	= blkdev_direct_IO,
1409};
1410
1411const struct file_operations def_blk_fops = {
1412	.open		= blkdev_open,
1413	.release	= blkdev_close,
1414	.llseek		= block_llseek,
1415	.read		= do_sync_read,
1416	.write		= do_sync_write,
1417  	.aio_read	= generic_file_aio_read,
1418  	.aio_write	= generic_file_aio_write_nolock,
1419	.mmap		= generic_file_mmap,
1420	.fsync		= block_fsync,
1421	.unlocked_ioctl	= block_ioctl,
1422#ifdef CONFIG_COMPAT
1423	.compat_ioctl	= compat_blkdev_ioctl,
1424#endif
1425	.splice_read	= generic_file_splice_read,
1426	.splice_write	= generic_file_splice_write,
1427};
1428
1429int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
1430{
1431	int res;
1432	mm_segment_t old_fs = get_fs();
1433	set_fs(KERNEL_DS);
1434	res = blkdev_ioctl(bdev, 0, cmd, arg);
1435	set_fs(old_fs);
1436	return res;
1437}
1438
1439EXPORT_SYMBOL(ioctl_by_bdev);
1440
1441/**
1442 * lookup_bdev  - lookup a struct block_device by name
1443 * @pathname:	special file representing the block device
1444 *
1445 * Get a reference to the blockdevice at @pathname in the current
1446 * namespace if possible and return it.  Return ERR_PTR(error)
1447 * otherwise.
1448 */
1449struct block_device *lookup_bdev(const char *pathname)
1450{
1451	struct block_device *bdev;
1452	struct inode *inode;
1453	struct path path;
1454	int error;
1455
1456	if (!pathname || !*pathname)
1457		return ERR_PTR(-EINVAL);
1458
1459	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1460	if (error)
1461		return ERR_PTR(error);
1462
1463	inode = path.dentry->d_inode;
1464	error = -ENOTBLK;
1465	if (!S_ISBLK(inode->i_mode))
1466		goto fail;
1467	error = -EACCES;
1468	if (path.mnt->mnt_flags & MNT_NODEV)
1469		goto fail;
1470	error = -ENOMEM;
1471	bdev = bd_acquire(inode);
1472	if (!bdev)
1473		goto fail;
1474out:
1475	path_put(&path);
1476	return bdev;
1477fail:
1478	bdev = ERR_PTR(error);
1479	goto out;
1480}
1481EXPORT_SYMBOL(lookup_bdev);
1482
1483/**
1484 * open_bdev_exclusive  -  open a block device by name and set it up for use
1485 *
1486 * @path:	special file representing the block device
1487 * @mode:	FMODE_... combination to pass be used
1488 * @holder:	owner for exclusion
1489 *
1490 * Open the blockdevice described by the special file at @path, claim it
1491 * for the @holder.
1492 */
1493struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1494{
1495	struct block_device *bdev;
1496	int error = 0;
1497
1498	bdev = lookup_bdev(path);
1499	if (IS_ERR(bdev))
1500		return bdev;
1501
1502	error = blkdev_get(bdev, mode);
1503	if (error)
1504		return ERR_PTR(error);
1505	error = -EACCES;
1506	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1507		goto blkdev_put;
1508	error = bd_claim(bdev, holder);
1509	if (error)
1510		goto blkdev_put;
1511
1512	return bdev;
1513	
1514blkdev_put:
1515	blkdev_put(bdev, mode);
1516	return ERR_PTR(error);
1517}
1518
1519EXPORT_SYMBOL(open_bdev_exclusive);
1520
1521/**
1522 * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
1523 *
1524 * @bdev:	blockdevice to close
1525 * @mode:	mode, must match that used to open.
1526 *
1527 * This is the counterpart to open_bdev_exclusive().
1528 */
1529void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
1530{
1531	bd_release(bdev);
1532	blkdev_put(bdev, mode);
1533}
1534
1535EXPORT_SYMBOL(close_bdev_exclusive);
1536
1537int __invalidate_device(struct block_device *bdev)
1538{
1539	struct super_block *sb = get_super(bdev);
1540	int res = 0;
1541
1542	if (sb) {
1543		/*
1544		 * no need to lock the super, get_super holds the
1545		 * read mutex so the filesystem cannot go away
1546		 * under us (->put_super runs with the write lock
1547		 * hold).
1548		 */
1549		shrink_dcache_sb(sb);
1550		res = invalidate_inodes(sb);
1551		drop_super(sb);
1552	}
1553	invalidate_bdev(bdev);
1554	return res;
1555}
1556EXPORT_SYMBOL(__invalidate_device);