ext4: journal credit fix for the delayed allocation's writepages() function

The previous delalloc writepages implementation started a new transaction
outside of the loop that called get_block() to do the block allocation.
Since we didn't know exactly how many blocks would need to be allocated,
the estimate of the journal credits required was very conservative and
caused many issues.

With the reworked delayed allocation, a new transaction is created for
each get_block() call, so we no longer need to guess how many credits
multiple chunks of allocation will need. We start every transaction with
enough credits for inserting a single extent. When estimating the credits
for the indirect blocks needed to allocate a chunk of blocks, we need to
know the number of data blocks to allocate. We use the total number of
reserved delalloc data blocks; if that is too big, for non-extent files,
we limit the number of blocks to EXT4_MAX_TRANS_DATA.

Code cleanup from Aneesh.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by Mingming Cao and committed by Theodore Ts'o 525f4ed8 a1d6cc56

+58 -24
+4 -4
fs/ext4/extents.c
··· 1753 * When pass the actual path, the caller should calculate credits 1754 * under i_data_sem. 1755 */ 1756 - int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, 1757 struct ext4_ext_path *path) 1758 { 1759 if (path) { ··· 1772 * and other metadat blocks still need to be 1773 * accounted. 1774 */ 1775 - /* 1 one bitmap, 1 block group descriptor */ 1776 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 1777 } 1778 } 1779 1780 - return ext4_chunk_trans_blocks(inode, num); 1781 } 1782 1783 /* ··· 1791 * If the nrblocks are discontiguous, they could cause 1792 * the whole tree split more than once, but this is really rare. 1793 */ 1794 - int ext4_ext_index_trans_blocks(struct inode *inode, int num, int chunk) 1795 { 1796 int index; 1797 int depth = ext_depth(inode);
··· 1753 * When pass the actual path, the caller should calculate credits 1754 * under i_data_sem. 1755 */ 1756 + int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 1757 struct ext4_ext_path *path) 1758 { 1759 if (path) { ··· 1772 * and other metadat blocks still need to be 1773 * accounted. 1774 */ 1775 + /* 1 bitmap, 1 block group descriptor */ 1776 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 1777 } 1778 } 1779 1780 + return ext4_chunk_trans_blocks(inode, nrblocks); 1781 } 1782 1783 /* ··· 1791 * If the nrblocks are discontiguous, they could cause 1792 * the whole tree split more than once, but this is really rare. 1793 */ 1794 + int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 1795 { 1796 int index; 1797 int depth = ext_depth(inode);
+54 -20
fs/ext4/inode.c
··· 1848 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 1849 sector_t logical, struct buffer_head *bh) 1850 { 1851 - struct buffer_head *lbh = &mpd->lbh; 1852 sector_t next; 1853 1854 - next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); 1855 - 1856 /* 1857 * First block in the extent 1858 */ 1859 if (lbh->b_size == 0) { 1860 lbh->b_blocknr = logical; 1861 - lbh->b_size = bh->b_size; 1862 lbh->b_state = bh->b_state & BH_FLAGS; 1863 return; 1864 } 1865 1866 /* 1867 * Can we merge the block to our big extent? 1868 */ 1869 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 1870 - lbh->b_size += bh->b_size; 1871 return; 1872 } 1873 1874 /* 1875 * We couldn't merge the block to our extent, so we 1876 * need to flush current extent and start new one ··· 2255 } 2256 2257 /* 2258 - * For now just follow the DIO way to estimate the max credits 2259 - * needed to write out EXT4_MAX_WRITEBACK_PAGES. 2260 - * todo: need to calculate the max credits need for 2261 - * extent based files, currently the DIO credits is based on 2262 - * indirect-blocks mapping way. 2263 - * 2264 - * Probably should have a generic way to calculate credits 2265 - * for DIO, writepages, and truncate 2266 */ 2267 - #define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS 2268 - #define EXT4_MAX_WRITEBACK_CREDITS 25 2269 2270 static int ext4_da_writepages(struct address_space *mapping, 2271 struct writeback_control *wbc) ··· 2319 * by delalloc 2320 */ 2321 BUG_ON(ext4_should_journal_data(inode)); 2322 - needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); 2323 2324 /* start a new transaction*/ 2325 handle = ext4_journal_start(inode, needed_blocks); ··· 4497 * the modification of a single pages into a single transaction, 4498 * which may include multiple chunks of block allocations. 4499 * 4500 - * This could be called via ext4_write_begin() or later 4501 - * ext4_da_writepages() in delalyed allocation case. 
4502 * 4503 - * In both case it's possible that we could allocating multiple 4504 - * chunks of blocks. We need to consider the worse case, when 4505 * one new block per extent. 4506 */ 4507 int ext4_writepage_trans_blocks(struct inode *inode)
··· 1848 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 1849 sector_t logical, struct buffer_head *bh) 1850 { 1851 sector_t next; 1852 + size_t b_size = bh->b_size; 1853 + struct buffer_head *lbh = &mpd->lbh; 1854 + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; 1855 1856 + /* check if thereserved journal credits might overflow */ 1857 + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 1858 + if (nrblocks >= EXT4_MAX_TRANS_DATA) { 1859 + /* 1860 + * With non-extent format we are limited by the journal 1861 + * credit available. Total credit needed to insert 1862 + * nrblocks contiguous blocks is dependent on the 1863 + * nrblocks. So limit nrblocks. 1864 + */ 1865 + goto flush_it; 1866 + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 1867 + EXT4_MAX_TRANS_DATA) { 1868 + /* 1869 + * Adding the new buffer_head would make it cross the 1870 + * allowed limit for which we have journal credit 1871 + * reserved. So limit the new bh->b_size 1872 + */ 1873 + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 1874 + mpd->inode->i_blkbits; 1875 + /* we will do mpage_da_submit_io in the next loop */ 1876 + } 1877 + } 1878 /* 1879 * First block in the extent 1880 */ 1881 if (lbh->b_size == 0) { 1882 lbh->b_blocknr = logical; 1883 + lbh->b_size = b_size; 1884 lbh->b_state = bh->b_state & BH_FLAGS; 1885 return; 1886 } 1887 1888 + next = lbh->b_blocknr + nrblocks; 1889 /* 1890 * Can we merge the block to our big extent? 
1891 */ 1892 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 1893 + lbh->b_size += b_size; 1894 return; 1895 } 1896 1897 + flush_it: 1898 /* 1899 * We couldn't merge the block to our extent, so we 1900 * need to flush current extent and start new one ··· 2231 } 2232 2233 /* 2234 + * This is called via ext4_da_writepages() to 2235 + * calulate the total number of credits to reserve to fit 2236 + * a single extent allocation into a single transaction, 2237 + * ext4_da_writpeages() will loop calling this before 2238 + * the block allocation. 2239 */ 2240 + 2241 + static int ext4_da_writepages_trans_blocks(struct inode *inode) 2242 + { 2243 + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2244 + 2245 + /* 2246 + * With non-extent format the journal credit needed to 2247 + * insert nrblocks contiguous block is dependent on 2248 + * number of contiguous block. So we will limit 2249 + * number of contiguous block to a sane value 2250 + */ 2251 + if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2252 + (max_blocks > EXT4_MAX_TRANS_DATA)) 2253 + max_blocks = EXT4_MAX_TRANS_DATA; 2254 + 2255 + return ext4_chunk_trans_blocks(inode, max_blocks); 2256 + } 2257 2258 static int ext4_da_writepages(struct address_space *mapping, 2259 struct writeback_control *wbc) ··· 2283 * by delalloc 2284 */ 2285 BUG_ON(ext4_should_journal_data(inode)); 2286 + needed_blocks = ext4_da_writepages_trans_blocks(inode); 2287 2288 /* start a new transaction*/ 2289 handle = ext4_journal_start(inode, needed_blocks); ··· 4461 * the modification of a single pages into a single transaction, 4462 * which may include multiple chunks of block allocations. 4463 * 4464 + * This could be called via ext4_write_begin() 4465 * 4466 + * We need to consider the worse case, when 4467 * one new block per extent. 4468 */ 4469 int ext4_writepage_trans_blocks(struct inode *inode)