Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge patch series "allow file systems to increase the minimum writeback chunk size v2"

Christoph Hellwig <hch@lst.de> says:

The relatively low minimal writeback size of 4MiB means that
written back inodes on rotational media are switched a lot. Besides
introducing additional seeks, this also can lead to extreme file
fragmentation on zoned devices when a lot of files are cached relative
to the available writeback bandwidth.

Add a superblock field that allows the file system to override the
default size, and set it to the zone size for zoned XFS.

* patches from https://patch.msgid.link/20251017034611.651385-1-hch@lst.de:
xfs: set s_min_writeback_pages for zoned file systems
writeback: allow the file system to override MIN_WRITEBACK_PAGES
writeback: cleanup writeback_chunk_size

Link: https://patch.msgid.link/20251017034611.651385-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>

+42 -19
+9 -17
fs/fs-writeback.c
··· 33 33 #include "internal.h" 34 34 35 35 /* 36 - * 4MB minimal write chunk size 37 - */ 38 - #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) 39 - 40 - /* 41 36 * Passed into wb_writeback(), essentially a subset of writeback_control 42 37 */ 43 38 struct wb_writeback_work { ··· 1884 1889 return ret; 1885 1890 } 1886 1891 1887 - static long writeback_chunk_size(struct bdi_writeback *wb, 1888 - struct wb_writeback_work *work) 1892 + static long writeback_chunk_size(struct super_block *sb, 1893 + struct bdi_writeback *wb, struct wb_writeback_work *work) 1889 1894 { 1890 1895 long pages; 1891 1896 ··· 1903 1908 * (maybe slowly) sync all tagged pages 1904 1909 */ 1905 1910 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) 1906 - pages = LONG_MAX; 1907 - else { 1908 - pages = min(wb->avg_write_bandwidth / 2, 1909 - global_wb_domain.dirty_limit / DIRTY_SCOPE); 1910 - pages = min(pages, work->nr_pages); 1911 - pages = round_down(pages + MIN_WRITEBACK_PAGES, 1912 - MIN_WRITEBACK_PAGES); 1913 - } 1911 + return LONG_MAX; 1914 1912 1915 - return pages; 1913 + pages = min(wb->avg_write_bandwidth / 2, 1914 + global_wb_domain.dirty_limit / DIRTY_SCOPE); 1915 + pages = min(pages, work->nr_pages); 1916 + return round_down(pages + sb->s_min_writeback_pages, 1917 + sb->s_min_writeback_pages); 1916 1918 } 1917 1919 1918 1920 /* ··· 2011 2019 inode->i_state |= I_SYNC; 2012 2020 wbc_attach_and_unlock_inode(&wbc, inode); 2013 2021 2014 - write_chunk = writeback_chunk_size(wb, work); 2022 + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); 2015 2023 wbc.nr_to_write = write_chunk; 2016 2024 wbc.pages_skipped = 0; 2017 2025
+1
fs/super.c
··· 389 389 goto fail; 390 390 if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) 391 391 goto fail; 392 + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; 392 393 return s; 393 394 394 395 fail:
+26 -2
fs/xfs/xfs_zone_alloc.c
··· 1215 1215 .mp = mp, 1216 1216 }; 1217 1217 struct xfs_buftarg *bt = mp->m_rtdev_targp; 1218 + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; 1218 1219 int error; 1219 1220 1220 1221 if (!bt) { ··· 1246 1245 return -ENOMEM; 1247 1246 1248 1247 xfs_info(mp, "%u zones of %u blocks (%u max open zones)", 1249 - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, 1250 - mp->m_max_open_zones); 1248 + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); 1251 1249 trace_xfs_zones_mount(mp); 1250 + 1251 + /* 1252 + * The writeback code switches between inodes regularly to provide 1253 + * fairness. The default lower bound is 4MiB, but for zoned file 1254 + * systems we want to increase that both to reduce seeks, but also more 1255 + * importantly so that workloads that writes files in a multiple of the 1256 + * zone size do not get fragmented and require garbage collection when 1257 + * they shouldn't. Increase is to the zone size capped by the max 1258 + * extent len. 1259 + * 1260 + * Note that because s_min_writeback_pages is a superblock field, this 1261 + * value also get applied to non-zoned files on the data device if 1262 + * there are any. On typical zoned setup all data is on the RT device 1263 + * because using the more efficient sequential write required zones 1264 + * is the reason for using the zone allocator, and either the RT device 1265 + * and the (meta)data device are on the same block device, or the 1266 + * (meta)data device is on a fast SSD while the data on the RT device 1267 + * is on a SMR HDD. In any combination of the above cases enforcing 1268 + * the higher min_writeback_pages for non-RT inodes is either a noop 1269 + * or beneficial. 1270 + */ 1271 + mp->m_super->s_min_writeback_pages = 1272 + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> 1273 + PAGE_SHIFT; 1252 1274 1253 1275 if (bdev_is_zoned(bt->bt_bdev)) { 1254 1276 error = blkdev_report_zones(bt->bt_bdev,
+1
include/linux/fs.h
··· 1583 1583 1584 1584 spinlock_t s_inode_wblist_lock; 1585 1585 struct list_head s_inodes_wb; /* writeback inodes */ 1586 + long s_min_writeback_pages; 1586 1587 } __randomize_layout; 1587 1588 1588 1589 static inline struct user_namespace *i_user_ns(const struct inode *inode)
+5
include/linux/writeback.h
··· 374 374 void sb_mark_inode_writeback(struct inode *inode); 375 375 void sb_clear_inode_writeback(struct inode *inode); 376 376 377 + /* 378 + * 4MB minimal write chunk size 379 + */ 380 + #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) 381 + 377 382 #endif /* WRITEBACK_H */