ext4: Rework the ext4_da_writepages() function

With the changes below, we reserve only the credits needed to insert a
single extent resulting from one call to get_block. This makes sure we
don't take too many journal credits during writeout. We also no longer
limit the number of pages to write. Instead, we loop through the dirty
pages building the largest possible contiguous block request, then
issue a single get_block request. We may get fewer blocks than we
requested. If so, we end up not mapping some of the buffer_heads,
which means those buffer_heads are still marked delayed. Later, in the
writepage callback via __mpage_writepage, we redirty those pages.
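
To make that flow concrete, here is a minimal userspace sketch
(illustration only, not the kernel code): dirty[], struct da_extent,
and alloc_blocks() are hypothetical stand-ins for the page-cache walk
and for get_block, which may map fewer blocks than asked for.

#include <stdio.h>

struct da_extent {
	unsigned long start;	/* first logical block of the extent */
	unsigned long len;	/* blocks accumulated so far */
};

/* Stand-in for get_block: may map fewer blocks than requested. */
static unsigned long alloc_blocks(unsigned long start, unsigned long want)
{
	unsigned long got = want > 8 ? 8 : want;	/* partial grant */
	printf("get_block(start=%lu, want=%lu) -> %lu\n", start, want, got);
	return got;
}

int main(void)
{
	/* logical blocks of dirty, delayed buffer_heads */
	unsigned long dirty[] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 30};
	size_t n = sizeof(dirty) / sizeof(dirty[0]), i = 0;

	while (i < n) {
		/* build the largest possible contiguous request */
		struct da_extent ext = { .start = dirty[i], .len = 1 };
		while (i + ext.len < n &&
		       dirty[i + ext.len] == ext.start + ext.len)
			ext.len++;

		/* one get_block call per extent; may come back short */
		unsigned long got = alloc_blocks(ext.start, ext.len);
		if (got < ext.len)
			printf("  %lu block(s) stay delayed; pages redirtied\n",
			       ext.len - got);
		i += got;	/* unmapped tail is retried as a new extent */
	}
	return 0;
}

Here the first ten-block extent is only partially granted, so its tail
is retried as a fresh extent on the next loop turn; that stands in for
the kernel's redirty-and-retry of still-delayed pages in a later pass.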

We should also not limit/throttle wbc->nr_to_write in the filesystem
writepages callback. Doing so causes wrong behaviour in
generic_sync_sb_inodes, since wbc->nr_to_write can end up <= 0.
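
For illustration, a small sketch of the nr_to_write contract being
described (writeback_ctl and fs_writepages() are invented stand-ins
for struct writeback_control and the ->writepages() callback): the
callback consumes the caller's quota only by the pages it actually
wrote, instead of clamping nr_to_write to a window of its own.

#include <stdio.h>

struct writeback_ctl {
	long nr_to_write;	/* pages the caller still wants written */
};

static void fs_writepages(struct writeback_ctl *wbc, long dirty_pages)
{
	long to_write = wbc->nr_to_write;
	long written = dirty_pages < to_write ? dirty_pages : to_write;

	/* ... write 'written' pages, one transaction per extent ... */

	/* report progress by decrementing only what was written */
	wbc->nr_to_write = to_write - written;
}

int main(void)
{
	struct writeback_ctl wbc = { .nr_to_write = 1024 };

	fs_writepages(&wbc, 200);
	printf("nr_to_write left: %ld\n", wbc.nr_to_write);	/* 824 */

	/*
	 * The caller (cf. generic_sync_sb_inodes) keeps looping while
	 * nr_to_write > 0; a callback that clamps it to a smaller
	 * window of its own can leave it <= 0 and break that loop.
	 */
	return 0;
}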

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Authored by Aneesh Kumar K.V and committed by Theodore Ts'o (a1d6cc56 f3bd1f3f)

fs/ext4/inode.c: +113 -88
···
 #include "acl.h"
 #include "ext4_extents.h"
 
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
···
 	unsigned long first_page, next_page;	/* extent of pages */
 	get_block_t *get_block;
 	struct writeback_control *wbc;
+	int io_done;
+	long pages_written;
 };
 
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
- * them with __mpage_writepage()
+ * them with writepage() call back
  *
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
···
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
 	struct address_space *mapping = mpd->inode->i_mapping;
-	struct mpage_data mpd_pp = {
-		.bio = NULL,
-		.last_block_in_bio = 0,
-		.get_block = mpd->get_block,
-		.use_writepage = 1,
-	};
 	int ret = 0, err, nr_pages, i;
 	unsigned long index, end;
 	struct pagevec pvec;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
-
 	pagevec_init(&pvec, 0);
 	index = mpd->first_page;
 	end = mpd->next_page - 1;
···
 				break;
 			index++;
 
-			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
-
+			err = mapping->a_ops->writepage(page, mpd->wbc);
+			if (!err)
+				mpd->pages_written++;
 			/*
 			 * In error case, we have to continue because
 			 * remaining pages are still locked
···
 		}
 		pagevec_release(&pvec);
 	}
-	if (mpd_pp.bio)
-		mpage_bio_submit(WRITE, mpd_pp.bio);
-
 	return ret;
 }
···
 	int blocks = exbh->b_size >> inode->i_blkbits;
 	sector_t pblock = exbh->b_blocknr, cur_logical;
 	struct buffer_head *head, *bh;
-	unsigned long index, end;
+	pgoff_t index, end;
 	struct pagevec pvec;
 	int nr_pages, i;
···
  *
  * The function skips space we know is already mapped to disk blocks.
  *
- * The function ignores errors ->get_block() returns, thus real
- * error handling is postponed to __mpage_writepage()
  */
 static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
+	int err = 0;
 	struct buffer_head *lbh = &mpd->lbh;
-	int err = 0, remain = lbh->b_size;
 	sector_t next = lbh->b_blocknr;
 	struct buffer_head new;
···
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
 		return;
 
-	while (remain) {
-		new.b_state = lbh->b_state;
-		new.b_blocknr = 0;
-		new.b_size = remain;
-		err = mpd->get_block(mpd->inode, next, &new, 1);
-		if (err) {
-			/*
-			 * Rather than implement own error handling
-			 * here, we just leave remaining blocks
-			 * unallocated and try again with ->writepage()
-			 */
-			break;
-		}
-		BUG_ON(new.b_size == 0);
+	new.b_state = lbh->b_state;
+	new.b_blocknr = 0;
+	new.b_size = lbh->b_size;
 
-		if (buffer_new(&new))
-			__unmap_underlying_blocks(mpd->inode, &new);
+	/*
+	 * If we didn't accumulate anything
+	 * to write simply return
+	 */
+	if (!new.b_size)
+		return;
+	err = mpd->get_block(mpd->inode, next, &new, 1);
+	if (err)
+		return;
+	BUG_ON(new.b_size == 0);
 
-		/*
-		 * If blocks are delayed marked, we need to
-		 * put actual blocknr and drop delayed bit
-		 */
-		if (buffer_delay(lbh) || buffer_unwritten(lbh))
-			mpage_put_bnr_to_bhs(mpd, next, &new);
+	if (buffer_new(&new))
+		__unmap_underlying_blocks(mpd->inode, &new);
 
-		/* go for the remaining blocks */
-		next += new.b_size >> mpd->inode->i_blkbits;
-		remain -= new.b_size;
-	}
+	/*
+	 * If blocks are delayed marked, we need to
+	 * put actual blocknr and drop delayed bit
+	 */
+	if (buffer_delay(lbh) || buffer_unwritten(lbh))
+		mpage_put_bnr_to_bhs(mpd, next, &new);
+
+	return;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
···
 	 * need to flush current extent and start new one
 	 */
 	mpage_da_map_blocks(mpd);
-
-	/*
-	 * Now start a new extent
-	 */
-	lbh->b_size = bh->b_size;
-	lbh->b_state = bh->b_state & BH_FLAGS;
-	lbh->b_blocknr = logical;
+	mpage_da_submit_io(mpd);
+	mpd->io_done = 1;
+	return;
 }
 
 /*
···
 	struct buffer_head *bh, *head, fake;
 	sector_t logical;
 
+	if (mpd->io_done) {
+		/*
+		 * Rest of the page in the page_vec
+		 * redirty then and skip then. We will
+		 * try to to write them again after
+		 * starting a new transaction
+		 */
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return MPAGE_DA_EXTENT_TAIL;
+	}
 	/*
 	 * Can we merge this page to current extent?
 	 */
 	if (mpd->next_page != page->index) {
 		/*
 		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them using __mpage_writepage()
+		 * and start IO on them using writepage()
 		 */
 		if (mpd->next_page != mpd->first_page) {
 			mpage_da_map_blocks(mpd);
 			mpage_da_submit_io(mpd);
+			/*
+			 * skip rest of the page in the page_vec
+			 */
+			mpd->io_done = 1;
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return MPAGE_DA_EXTENT_TAIL;
 		}
 
 		/*
···
 		set_buffer_dirty(bh);
 		set_buffer_uptodate(bh);
 		mpage_add_bh_to_extent(mpd, logical, bh);
+		if (mpd->io_done)
+			return MPAGE_DA_EXTENT_TAIL;
 	} else {
 		/*
 		 * Page with regular buffer heads, just add all dirty ones
···
 		bh = head;
 		do {
 			BUG_ON(buffer_locked(bh));
-			if (buffer_dirty(bh))
+			if (buffer_dirty(bh) &&
+				(!buffer_mapped(bh) || buffer_delay(bh))) {
 				mpage_add_bh_to_extent(mpd, logical, bh);
+				if (mpd->io_done)
+					return MPAGE_DA_EXTENT_TAIL;
+			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
···
  *
  * This is a library function, which implements the writepages()
  * address_space_operation.
- *
- * In order to avoid duplication of logic that deals with partial pages,
- * multiple bio per page, etc, we find non-allocated blocks, allocate
- * them with minimal calls to ->get_block() and re-use __mpage_writepage()
- *
- * It's important that we call __mpage_writepage() only once for each
- * involved page, otherwise we'd have to implement more complicated logic
- * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
- *
- * See comments to mpage_writepages()
  */
 static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
 			       get_block_t get_block)
 {
 	struct mpage_da_data mpd;
+	long to_write;
 	int ret;
 
 	if (!get_block)
···
 	mpd.first_page = 0;
 	mpd.next_page = 0;
 	mpd.get_block = get_block;
+	mpd.io_done = 0;
+	mpd.pages_written = 0;
+
+	to_write = wbc->nr_to_write;
 
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
 
 	/*
 	 * Handle last extent of pages
 	 */
-	if (mpd.next_page != mpd.first_page) {
+	if (!mpd.io_done && mpd.next_page != mpd.first_page) {
 		mpage_da_map_blocks(&mpd);
 		mpage_da_submit_io(&mpd);
 	}
 
+	wbc->nr_to_write = to_write - mpd.pages_written;
 	return ret;
 }
···
 #define EXT4_MAX_WRITEBACK_CREDITS 25
 
 static int ext4_da_writepages(struct address_space *mapping,
-			struct writeback_control *wbc)
+			      struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
 	handle_t *handle = NULL;
···
 	int ret = 0;
 	long to_write;
 	loff_t range_start = 0;
+	long pages_skipped = 0;
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
 	 * because that could violate lock ordering on umount
 	 */
-	if (!mapping->nrpages)
+	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
-	/*
-	 * Estimate the worse case needed credits to write out
-	 * EXT4_MAX_BUF_BLOCKS pages
-	 */
-	needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
-
-	to_write = wbc->nr_to_write;
-	if (!wbc->range_cyclic) {
+	if (!wbc->range_cyclic)
 		/*
 		 * If range_cyclic is not set force range_cont
 		 * and save the old writeback_index
 		 */
 		wbc->range_cont = 1;
-		range_start = wbc->range_start;
-	}
 
-	while (!ret && to_write) {
+	range_start = wbc->range_start;
+	pages_skipped = wbc->pages_skipped;
+
+restart_loop:
+	to_write = wbc->nr_to_write;
+	while (!ret && to_write > 0) {
+
+		/*
+		 * we insert one extent at a time. So we need
+		 * credit needed for single extent allocation.
+		 * journalled mode is currently not supported
+		 * by delalloc
+		 */
+		BUG_ON(ext4_should_journal_data(inode));
+		needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
 		/* start a new transaction*/
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
+			printk(KERN_EMERG "%s: jbd2_start: "
+			       "%ld pages, ino %lu; err %d\n", __func__,
+			       wbc->nr_to_write, inode->i_ino, ret);
+			dump_stack();
 			goto out_writepages;
 		}
 		if (ext4_should_order_data(inode)) {
 			/*
 			 * With ordered mode we need to add
-			 * the inode to the journal handle
+			 * the inode to the journal handl
 			 * when we do block allocation.
 			 */
 			ret = ext4_jbd2_file_inode(handle, inode);
···
 				ext4_journal_stop(handle);
 				goto out_writepages;
 			}
-
 		}
-		/*
-		 * set the max dirty pages could be write at a time
-		 * to fit into the reserved transaction credits
-		 */
-		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
-			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
 
 		to_write -= wbc->nr_to_write;
 		ret = mpage_da_writepages(mapping, wbc,
-				ext4_da_get_block_write);
+					  ext4_da_get_block_write);
 		ext4_journal_stop(handle);
-		if (wbc->nr_to_write) {
+		if (ret == MPAGE_DA_EXTENT_TAIL) {
+			/*
+			 * got one extent now try with
+			 * rest of the pages
+			 */
+			to_write += wbc->nr_to_write;
+			ret = 0;
+		} else if (wbc->nr_to_write) {
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
···
 		wbc->nr_to_write = to_write;
 	}
 
+	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
+		/* We skipped pages in this loop */
+		wbc->range_start = range_start;
+		wbc->nr_to_write = to_write +
+				wbc->pages_skipped - pages_skipped;
+		wbc->pages_skipped = pages_skipped;
+		goto restart_loop;
+	}
+
 out_writepages:
 	wbc->nr_to_write = to_write;
-	if (range_start)
-		wbc->range_start = range_start;
+	wbc->range_start = range_start;
 	return ret;
 }