Merge branch 'block-dev'

Merge 'block-dev' branch.

I was going to just mark everything here for stable and leave it to the
3.8 merge window, but having decided on doing another -rc, I might as
well merge it now.

This removes the bd_block_size_semaphore semaphore that was added in
this release to fix a race condition between block size changes and
block IO, and replaces it with atomicity guarantees in fs/buffer.c
instead, along with simplifying fs/block_dev.c.

This removes more lines than it adds, makes the code generally simpler,
and avoids the latency/rt issues that the block size semaphore
introduced for mount.

I'm not happy with the timing, but it wouldn't be much better doing this
during the merge window and then having some delayed back-port of it
into stable.

* block-dev:
blkdev_max_block: make private to fs/buffer.c
direct-io: don't read inode->i_blkbits multiple times
blockdev: remove bd_block_size_semaphore again
fs/buffer.c: make block-size be per-page and protected by the page lock
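
The heart of the fs/buffer.c patch is a helper that samples
inode->i_blkbits exactly once, under the page lock, when a page first
gets buffers; every later computation takes the block size from the
page's own buffer heads instead of re-reading the inode. Quoting the
new helper from the diff below, with explanatory comments added:

	static struct buffer_head *create_page_buffers(struct page *page,
			struct inode *inode, unsigned int b_state)
	{
		BUG_ON(!PageLocked(page));

		/*
		 * i_blkbits is read only once.  Once the buffers exist,
		 * their size can only change under the page lock, so a
		 * racing set_blocksize() can no longer be observed
		 * half-way through an I/O against this page.
		 */
		if (!page_has_buffers(page))
			create_empty_buffers(page,
				1 << ACCESS_ONCE(inode->i_blkbits), b_state);
		return page_buffers(page);
	}

Callers such as __block_write_begin() and block_read_full_page() then
derive blocksize from head->b_size (and the shift via ilog2()) rather
than from inode->i_blkbits, which is what makes the block size
effectively per-page and race-free without any semaphore on the I/O
paths.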

5 files changed, 72 insertions(+), 196 deletions(-)

drivers/char/raw.c: +1 -1
···
 static const struct file_operations raw_fops = {
 	.read		= do_sync_read,
-	.aio_read	= blkdev_aio_read,
+	.aio_read	= generic_file_aio_read,
 	.write		= do_sync_write,
 	.aio_write	= blkdev_aio_write,
 	.fsync		= blkdev_fsync,

fs/block_dev.c: +5 -155
···
 	spin_unlock(&dst->wb.list_lock);
 }
 
-sector_t blkdev_max_block(struct block_device *bdev)
-{
-	sector_t retval = ~((sector_t)0);
-	loff_t sz = i_size_read(bdev->bd_inode);
-
-	if (sz) {
-		unsigned int size = block_size(bdev);
-		unsigned int sizebits = blksize_bits(size);
-		retval = (sz >> sizebits);
-	}
-	return retval;
-}
-
 /* Kill _all_ buffers and pagecache, dirty or not.. */
 void kill_bdev(struct block_device *bdev)
 {
···
 int set_blocksize(struct block_device *bdev, int size)
 {
-	struct address_space *mapping;
-
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
···
 	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
-	/* Prevent starting I/O or mapping the device */
-	percpu_down_write(&bdev->bd_block_size_semaphore);
-
-	/* Check that the block device is not memory mapped */
-	mapping = bdev->bd_inode->i_mapping;
-	mutex_lock(&mapping->i_mmap_mutex);
-	if (mapping_mapped(mapping)) {
-		mutex_unlock(&mapping->i_mmap_mutex);
-		percpu_up_write(&bdev->bd_block_size_semaphore);
-		return -EBUSY;
-	}
-	mutex_unlock(&mapping->i_mmap_mutex);
-
 	/* Don't change the size if it is same as current */
 	if (bdev->bd_block_size != size) {
 		sync_blockdev(bdev);
···
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
-
-	percpu_up_write(&bdev->bd_block_size_semaphore);
-
 	return 0;
 }
···
 blkdev_get_block(struct inode *inode, sector_t iblock,
 		struct buffer_head *bh, int create)
 {
-	if (iblock >= blkdev_max_block(I_BDEV(inode))) {
-		if (create)
-			return -EIO;
-
-		/*
-		 * for reads, we're just trying to fill a partial page.
-		 * return a hole, they will have to call get_block again
-		 * before they can fill it, and they will get -EIO at that
-		 * time
-		 */
-		return 0;
-	}
 	bh->b_bdev = I_BDEV(inode);
 	bh->b_blocknr = iblock;
 	set_buffer_mapped(bh);
-	return 0;
-}
-
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh, int create)
-{
-	sector_t end_block = blkdev_max_block(I_BDEV(inode));
-	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
-	if ((iblock + max_blocks) > end_block) {
-		max_blocks = end_block - iblock;
-		if ((long)max_blocks <= 0) {
-			if (create)
-				return -EIO;	/* write fully beyond EOF */
-			/*
-			 * It is a read which is fully beyond EOF.  We return
-			 * a !buffer_mapped buffer
-			 */
-			max_blocks = 0;
-		}
-	}
-
-	bh->b_bdev = I_BDEV(inode);
-	bh->b_blocknr = iblock;
-	bh->b_size = max_blocks << inode->i_blkbits;
-	if (max_blocks)
-		set_buffer_mapped(bh);
 	return 0;
 }
···
 	struct inode *inode = file->f_mapping->host;
 
 	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
-				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+				    nr_segs, blkdev_get_block, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
···
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-
-	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
-		kmem_cache_free(bdev_cachep, ei);
-		return NULL;
-	}
-
 	return &ei->vfs_inode;
 }
···
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
-
-	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
 
 	kmem_cache_free(bdev_cachep, bdi);
 }
···
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
-ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkdev_aio_read);
-
 /*
  * Write data to the block device.  Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
···
 		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	blk_start_plug(&plug);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
···
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
 	blk_finish_plug(&plug);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
-
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	int ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_mmap(file, vma);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
-				  struct pipe_inode_info *pipe, size_t len,
-				  unsigned int flags)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_read(file, ppos, pipe, len, flags);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
-				   struct file *file, loff_t *ppos, size_t len,
-				   unsigned int flags)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_write(pipe, file, ppos, len, flags);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
 
 /*
  * Try to release a page associated with block device when the system
···
 	.llseek		= block_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.aio_read	= blkdev_aio_read,
+	.aio_read	= generic_file_aio_read,
 	.aio_write	= blkdev_aio_write,
-	.mmap		= blkdev_mmap,
+	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.splice_read	= blkdev_splice_read,
-	.splice_write	= blkdev_splice_write,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)

fs/buffer.c: +61 -32
···
 	attach_page_buffers(page, head);
 }
 
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+	sector_t retval = ~((sector_t)0);
+	loff_t sz = i_size_read(bdev->bd_inode);
+
+	if (sz) {
+		unsigned int sizebits = blksize_bits(size);
+		retval = (sz >> sizebits);
+	}
+	return retval;
+}
+
 /*
  * Initialise the state of a blockdev page's buffers.
  */
···
 	struct buffer_head *head = page_buffers(page);
 	struct buffer_head *bh = head;
 	int uptodate = PageUptodate(page);
-	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode));
+	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 
 	do {
 		if (!buffer_mapped(bh)) {
···
 EXPORT_SYMBOL(unmap_underlying_metadata);
 
 /*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+	return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+	return page_buffers(page);
+}
+
+/*
  * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
···
 	sector_t block;
 	sector_t last_block;
 	struct buffer_head *bh, *head;
-	const unsigned blocksize = 1 << inode->i_blkbits;
+	unsigned int blocksize, bbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
 			WRITE_SYNC : WRITE);
 
-	BUG_ON(!PageLocked(page));
-
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, blocksize,
+	head = create_page_buffers(page, inode,
 			(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
 
 	/*
 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
···
 	 * handle that here by just cleaning them.
 	 */
 
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	head = page_buffers(page);
 	bh = head;
+	blocksize = bh->b_size;
+	bbits = block_size_bits(blocksize);
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	last_block = (i_size_read(inode) - 1) >> bbits;
 
 	/*
 	 * Get all the dirty buffers mapped to disk addresses and
···
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	BUG_ON(from > to);
 
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	bbits = inode->i_blkbits;
 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
 
 	for(bh = head, block_start = 0; bh != head || !block_start;
···
 	unsigned blocksize;
 	struct buffer_head *bh, *head;
 
-	blocksize = 1 << inode->i_blkbits;
+	bh = head = page_buffers(page);
+	blocksize = bh->b_size;
 
-	for(bh = head = page_buffers(page), block_start = 0;
-	    bh != head || !block_start;
-	    block_start=block_end, bh = bh->b_this_page) {
+	block_start = 0;
+	do {
 		block_end = block_start + blocksize;
 		if (block_end <= from || block_start >= to) {
 			if (!buffer_uptodate(bh))
···
 			mark_buffer_dirty(bh);
 		}
 		clear_buffer_new(bh);
-	}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
 
 	/*
 	 * If this is a partial write which happened to make all buffers
···
 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 					unsigned long from)
 {
-	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end, blocksize;
 	unsigned to;
 	struct buffer_head *bh, *head;
···
 	if (!page_has_buffers(page))
 		return 0;
 
-	blocksize = 1 << inode->i_blkbits;
+	head = page_buffers(page);
+	blocksize = head->b_size;
 	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
 	to = from + to;
 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
 		return 0;
 
-	head = page_buffers(page);
 	bh = head;
 	block_start = 0;
 	do {
···
 	struct inode *inode = page->mapping->host;
 	sector_t iblock, lblock;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	unsigned int blocksize;
+	unsigned int blocksize, bbits;
 	int nr, i;
 	int fully_mapped = 1;
 
-	BUG_ON(!PageLocked(page));
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
 	bh = head;
 	nr = 0;
 	i = 0;

fs/direct-io.c: +5 -3
···
 	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	int create;
+	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
···
 		fs_count = fs_endblk - fs_startblk + 1;
 
 		map_bh->b_state = 0;
-		map_bh->b_size = fs_count << dio->inode->i_blkbits;
+		map_bh->b_size = fs_count << i_blkbits;
 
 		/*
 		 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
···
 	int seg;
 	size_t size;
 	unsigned long addr;
-	unsigned blkbits = inode->i_blkbits;
+	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
+	unsigned blkbits = i_blkbits;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
···
 	dio->inode = inode;
 	dio->rw = rw;
 	sdio.blkbits = blkbits;
-	sdio.blkfactor = inode->i_blkbits - blkbits;
+	sdio.blkfactor = i_blkbits - blkbits;
 	sdio.block_in_file = offset >> blkbits;
 
 	sdio.get_block = get_block;
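
To see what the single ACCESS_ONCE() read above buys:
__blockdev_direct_IO() previously read inode->i_blkbits at three
separate points, so a set_blocksize() racing with the I/O could leave
sdio.blkfactor inconsistent with the blkbits the request was set up
with. A simplified sketch of the torn computation (values invented for
illustration; not the actual kernel code):

	/* before: two independent reads of i_blkbits */
	unsigned blkbits = inode->i_blkbits;		/* reads 12 (4k blocks) */
	/* ... set_blocksize() switches to 512-byte blocks: i_blkbits = 9 ... */
	sdio.blkfactor = inode->i_blkbits - blkbits;	/* 9 - 12 wraps: bogus */

	/* after: one sample, everything derived from it */
	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
	unsigned blkbits = i_blkbits;
	sdio.blkfactor = i_blkbits - blkbits;		/* self-consistent */

The I/O may still be sized against a momentarily stale block size, but
all the derived values agree with each other, which is all the
direct-io machinery requires.
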

include/linux/fs.h: +0 -5
···
 	int			bd_fsfreeze_count;
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
-	/* A semaphore that prevents I/O while block size is being changed */
-	struct percpu_rw_semaphore bd_block_size_semaphore;
 };
···
 extern struct block_device *bdget(dev_t);
 extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
-extern sector_t blkdev_max_block(struct block_device *bdev);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
 extern void invalidate_bdev(struct block_device *);
···
 		unsigned long *nr_segs, size_t *count, int access_flags);
 
 /* fs/block_dev.c */
-extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos);
 extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos);
 extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,