ext4: Rework the ext4_da_writepages() function

With the changes below we reserve only the credits needed to insert a
single extent resulting from one call to get_block. This makes sure we
don't take too many journal credits during writeout. We also no longer
limit the number of pages to write: we loop through the dirty pages
building the largest possible contiguous block request, then issue a
single get_block request. We may get fewer blocks than we requested;
if so, we end up not mapping some of the buffer_heads, which therefore
stay marked delayed. Later, in the writepage callback via
__mpage_writepage, we redirty those pages.
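
To illustrate the accumulate-then-allocate loop, here is a minimal
userspace sketch (not the kernel code; map_blocks() and its
five-block cap are invented stand-ins for a ->get_block() call that
returns a short allocation):

/*
 * Toy model of the writeout loop described above.  map_blocks() may
 * map fewer blocks than asked for; the unmapped tail stays "delayed"
 * and is picked up again on the next pass, mirroring how unmapped
 * buffer_heads keep their delay bit and the pages get redirtied.
 */
#include <stdio.h>

#define NBLOCKS 16

static int delayed[NBLOCKS];

/* stub allocator: maps at most 5 blocks per call (arbitrary cap) */
static int map_blocks(int start, int want)
{
        return want < 5 ? want : 5;
}

int main(void)
{
        int pass = 0, i;

        for (i = 0; i < NBLOCKS; i++)
                delayed[i] = 1;         /* everything starts delayed */

        for (;;) {
                int start = -1, len = 0, got;

                /* build the largest contiguous run of delayed blocks */
                for (i = 0; i < NBLOCKS; i++) {
                        if (delayed[i]) {
                                if (start < 0)
                                        start = i;
                                len++;
                        } else if (start >= 0)
                                break;
                }
                if (start < 0)
                        break;          /* nothing left to map */

                /* one allocation request for the whole run */
                got = map_blocks(start, len);
                for (i = start; i < start + got; i++)
                        delayed[i] = 0; /* mapped: clear the delay bit */

                printf("pass %d: asked for %d blocks at %d, got %d\n",
                       ++pass, len, start, got);
        }
        return 0;
}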

We should also not limit/throttle wbc->nr_to_write in the filesystem's
writepages callback. Doing so causes wrong behaviour in
generic_sync_sb_inodes, because wbc->nr_to_write can reach <= 0 even
though dirty pages remain.
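
A compilable sketch of the contract this implies (wb_ctl and
fs_writepages() are simplified, hypothetical stand-ins for
writeback_control and a filesystem's writepages method, not the real
code):

/*
 * The caller hands the filesystem a page budget in nr_to_write and
 * expects it only to be decremented by pages actually written.  If
 * ->writepages() clamps the value itself (as the removed
 * EXT4_MAX_WRITEBACK_PAGES throttle did), the caller can see
 * nr_to_write <= 0 while dirty pages remain and stop the writeback
 * scan too early.
 */
struct wb_ctl {
        long nr_to_write;       /* pages the fs may still write */
};

static void fs_writepages(struct wb_ctl *wbc, long dirty_pages)
{
        /* correct: consume budget only for pages actually written */
        long wrote = dirty_pages < wbc->nr_to_write ?
                        dirty_pages : wbc->nr_to_write;

        wbc->nr_to_write -= wrote;
        /* buggy variant: wbc->nr_to_write = some_smaller_cap; */
}

int main(void)
{
        struct wb_ctl wbc = { .nr_to_write = 1024 };

        fs_writepages(&wbc, 100);
        /*
         * Budget left (924) tells the caller this inode was fully
         * written; a clamped nr_to_write here would misreport that.
         */
        return wbc.nr_to_write <= 0;
}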

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Authored by Aneesh Kumar K.V, committed by Theodore Ts'o (a1d6cc56, f3bd1f3f)

+113 -88
fs/ext4/inode.c
···
 #include "acl.h"
 #include "ext4_extents.h"
 
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                               loff_t new_size)
 {
···
        unsigned long first_page, next_page;    /* extent of pages */
        get_block_t *get_block;
        struct writeback_control *wbc;
+       int io_done;
+       long pages_written;
 };
 
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
- * them with __mpage_writepage()
+ * them with the writepage() callback
  *
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
···
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
        struct address_space *mapping = mpd->inode->i_mapping;
-       struct mpage_data mpd_pp = {
-               .bio = NULL,
-               .last_block_in_bio = 0,
-               .get_block = mpd->get_block,
-               .use_writepage = 1,
-       };
        int ret = 0, err, nr_pages, i;
        unsigned long index, end;
        struct pagevec pvec;
 
        BUG_ON(mpd->next_page <= mpd->first_page);
-
        pagevec_init(&pvec, 0);
        index = mpd->first_page;
        end = mpd->next_page - 1;
···
                                break;
                        index++;
 
-                       err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
-
+                       err = mapping->a_ops->writepage(page, mpd->wbc);
+                       if (!err)
+                               mpd->pages_written++;
                        /*
                         * In error case, we have to continue because
                         * remaining pages are still locked
···
                }
                pagevec_release(&pvec);
        }
-       if (mpd_pp.bio)
-               mpage_bio_submit(WRITE, mpd_pp.bio);
-
        return ret;
 }
···
        int blocks = exbh->b_size >> inode->i_blkbits;
        sector_t pblock = exbh->b_blocknr, cur_logical;
        struct buffer_head *head, *bh;
-       unsigned long index, end;
+       pgoff_t index, end;
        struct pagevec pvec;
        int nr_pages, i;
···
  *
  * The function skips space we know is already mapped to disk blocks.
  *
- * The function ignores errors ->get_block() returns, thus real
- * error handling is postponed to __mpage_writepage()
  */
 static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
+       int err = 0;
        struct buffer_head *lbh = &mpd->lbh;
-       int err = 0, remain = lbh->b_size;
        sector_t next = lbh->b_blocknr;
        struct buffer_head new;
···
        if (buffer_mapped(lbh) && !buffer_delay(lbh))
                return;
 
-       while (remain) {
-               new.b_state = lbh->b_state;
-               new.b_blocknr = 0;
-               new.b_size = remain;
-               err = mpd->get_block(mpd->inode, next, &new, 1);
-               if (err) {
-                       /*
-                        * Rather than implement own error handling
-                        * here, we just leave remaining blocks
-                        * unallocated and try again with ->writepage()
-                        */
-                       break;
-               }
-               BUG_ON(new.b_size == 0);
-
-               if (buffer_new(&new))
-                       __unmap_underlying_blocks(mpd->inode, &new);
-
-               /*
-                * If blocks are delayed marked, we need to
-                * put actual blocknr and drop delayed bit
-                */
-               if (buffer_delay(lbh) || buffer_unwritten(lbh))
-                       mpage_put_bnr_to_bhs(mpd, next, &new);
-
-               /* go for the remaining blocks */
-               next += new.b_size >> mpd->inode->i_blkbits;
-               remain -= new.b_size;
-       }
+       new.b_state = lbh->b_state;
+       new.b_blocknr = 0;
+       new.b_size = lbh->b_size;
+
+       /*
+        * If we didn't accumulate anything
+        * to write, simply return.
+        */
+       if (!new.b_size)
+               return;
+       err = mpd->get_block(mpd->inode, next, &new, 1);
+       if (err)
+               return;
+       BUG_ON(new.b_size == 0);
+
+       if (buffer_new(&new))
+               __unmap_underlying_blocks(mpd->inode, &new);
+
+       /*
+        * If blocks are delayed marked, we need to
+        * put actual blocknr and drop delayed bit.
+        */
+       if (buffer_delay(lbh) || buffer_unwritten(lbh))
+               mpage_put_bnr_to_bhs(mpd, next, &new);
+
+       return;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
···
         * need to flush current extent and start new one
         */
        mpage_da_map_blocks(mpd);
-
-       /*
-        * Now start a new extent
-        */
-       lbh->b_size = bh->b_size;
-       lbh->b_state = bh->b_state & BH_FLAGS;
-       lbh->b_blocknr = logical;
+       mpage_da_submit_io(mpd);
+       mpd->io_done = 1;
+       return;
 }
 
 /*
···
        struct buffer_head *bh, *head, fake;
        sector_t logical;
 
+       if (mpd->io_done) {
+               /*
+                * Rest of the pages in the page_vec:
+                * redirty them and skip them.  We will
+                * try to write them again after
+                * starting a new transaction.
+                */
+               redirty_page_for_writepage(wbc, page);
+               unlock_page(page);
+               return MPAGE_DA_EXTENT_TAIL;
+       }
        /*
         * Can we merge this page to current extent?
         */
        if (mpd->next_page != page->index) {
                /*
                 * Nope, we can't. So, we map non-allocated blocks
-                * and start IO on them using __mpage_writepage()
+                * and start IO on them using writepage()
                 */
                if (mpd->next_page != mpd->first_page) {
                        mpage_da_map_blocks(mpd);
                        mpage_da_submit_io(mpd);
+                       /*
+                        * Skip the rest of the pages in the page_vec.
+                        */
+                       mpd->io_done = 1;
+                       redirty_page_for_writepage(wbc, page);
+                       unlock_page(page);
+                       return MPAGE_DA_EXTENT_TAIL;
                }
 
                /*
···
                set_buffer_dirty(bh);
                set_buffer_uptodate(bh);
                mpage_add_bh_to_extent(mpd, logical, bh);
+               if (mpd->io_done)
+                       return MPAGE_DA_EXTENT_TAIL;
        } else {
                /*
                 * Page with regular buffer heads, just add all dirty ones
···
                bh = head;
                do {
                        BUG_ON(buffer_locked(bh));
-                       if (buffer_dirty(bh))
+                       if (buffer_dirty(bh) &&
+                           (!buffer_mapped(bh) || buffer_delay(bh))) {
                                mpage_add_bh_to_extent(mpd, logical, bh);
+                               if (mpd->io_done)
+                                       return MPAGE_DA_EXTENT_TAIL;
+                       }
                        logical++;
                } while ((bh = bh->b_this_page) != head);
        }
···
  *
  * This is a library function, which implements the writepages()
  * address_space_operation.
- *
- * In order to avoid duplication of logic that deals with partial pages,
- * multiple bio per page, etc, we find non-allocated blocks, allocate
- * them with minimal calls to ->get_block() and re-use __mpage_writepage()
- *
- * It's important that we call __mpage_writepage() only once for each
- * involved page, otherwise we'd have to implement more complicated logic
- * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
- *
- * See comments to mpage_writepages()
  */
 static int mpage_da_writepages(struct address_space *mapping,
                               struct writeback_control *wbc,
                               get_block_t get_block)
 {
        struct mpage_da_data mpd;
+       long to_write;
        int ret;
 
        if (!get_block)
···
        mpd.first_page = 0;
        mpd.next_page = 0;
        mpd.get_block = get_block;
+       mpd.io_done = 0;
+       mpd.pages_written = 0;
+
+       to_write = wbc->nr_to_write;
 
        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
 
        /*
         * Handle last extent of pages
         */
-       if (mpd.next_page != mpd.first_page) {
+       if (!mpd.io_done && mpd.next_page != mpd.first_page) {
                mpage_da_map_blocks(&mpd);
                mpage_da_submit_io(&mpd);
        }
 
+       wbc->nr_to_write = to_write - mpd.pages_written;
        return ret;
 }
···
 #define EXT4_MAX_WRITEBACK_CREDITS 25
 
 static int ext4_da_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
 {
        struct inode *inode = mapping->host;
        handle_t *handle = NULL;
···
        int ret = 0;
        long to_write;
        loff_t range_start = 0;
+       long pages_skipped = 0;
 
        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
         * because that could violate lock ordering on umount
         */
-       if (!mapping->nrpages)
+       if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;
 
-       /*
-        * Estimate the worse case needed credits to write out
-        * EXT4_MAX_BUF_BLOCKS pages
-        */
-       needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
-
-       to_write = wbc->nr_to_write;
-       if (!wbc->range_cyclic) {
+       if (!wbc->range_cyclic)
                /*
                 * If range_cyclic is not set force range_cont
                 * and save the old writeback_index
                 */
                wbc->range_cont = 1;
-               range_start = wbc->range_start;
-       }
 
-       while (!ret && to_write) {
+       range_start = wbc->range_start;
+       pages_skipped = wbc->pages_skipped;
+
+restart_loop:
+       to_write = wbc->nr_to_write;
+       while (!ret && to_write > 0) {
+
+               /*
+                * We insert one extent at a time, so we need
+                * the credits for a single extent allocation.
+                * Journalled mode is currently not supported
+                * by delalloc.
+                */
+               BUG_ON(ext4_should_journal_data(inode));
+               needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
                /* start a new transaction */
                handle = ext4_journal_start(inode, needed_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
+                       printk(KERN_EMERG "%s: jbd2_start: "
+                              "%ld pages, ino %lu; err %d\n", __func__,
+                              wbc->nr_to_write, inode->i_ino, ret);
+                       dump_stack();
                        goto out_writepages;
                }
                if (ext4_should_order_data(inode)) {
                        /*
                         * With ordered mode we need to add
                         * the inode to the journal handle
                         * when we do block allocation.
                         */
                        ret = ext4_jbd2_file_inode(handle, inode);
···
                                ext4_journal_stop(handle);
                                goto out_writepages;
                        }
-
                }
-               /*
-                * set the max dirty pages could be write at a time
-                * to fit into the reserved transaction credits
-                */
-               if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
-                       wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
 
                to_write -= wbc->nr_to_write;
                ret = mpage_da_writepages(mapping, wbc,
                                          ext4_da_get_block_write);
                ext4_journal_stop(handle);
-               if (wbc->nr_to_write) {
+               if (ret == MPAGE_DA_EXTENT_TAIL) {
+                       /*
+                        * Got one extent; now try with the
+                        * rest of the pages.
+                        */
+                       to_write += wbc->nr_to_write;
+                       ret = 0;
+               } else if (wbc->nr_to_write) {
                        /*
                         * There is no more writeout needed
                         * or we requested for a noblocking writeout
···
                wbc->nr_to_write = to_write;
        }
 
+       if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
+               /* We skipped pages in this loop */
+               wbc->range_start = range_start;
+               wbc->nr_to_write = to_write +
+                       wbc->pages_skipped - pages_skipped;
+               wbc->pages_skipped = pages_skipped;
+               goto restart_loop;
+       }
+
 out_writepages:
        wbc->nr_to_write = to_write;
-       if (range_start)
-               wbc->range_start = range_start;
+       wbc->range_start = range_start;
        return ret;
 }