Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: introduce seq counter for the extent status entry

In iomap_write_iter(), the iomap buffered write framework does not hold
any locks between querying the inode extent mapping info and performing
page cache writes. As a result, the extent mapping can be changed due to
concurrent I/O in flight. Likewise, in iomap_writepage_map(), the
write-back process faces the same problem: concurrent changes can
invalidate the extent mapping before the I/O is submitted.

Therefore, both of these processes must recheck the mapping info after
acquiring the folio lock. To address this, similar to XFS, we propose
introducing an extent sequence number to serve as a validity cookie for
the extent. After commit 24b7a2331fcd ("ext4: clairfy the rules for
modifying extents"), we can ensure the extent information should always
be processed through the extent status tree, and the extent status tree
is always uptodate under i_rwsem or invalidate_lock or folio lock, so
it's safe to introduce this sequence number. The sequence number will be
increased whenever the extent status tree changes, preparing for the
buffered write iomap conversion.

Besides, this mechanism is also applicable to the moving extents case.
In move_extent_per_page(), it also needs to reacquire data_sem and check
the mapping info again under the folio lock.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-3-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Zhang Yi and committed by
Theodore Ts'o
dd064d51 a2e5a3ce

+39 -12
+2
fs/ext4/ext4.h
···
1138 1138 	ext4_lblk_t i_es_shrink_lblk;	/* Offset where we start searching for
1139 1139 					   extents to shrink. Protected by
1140 1140 					   i_es_lock */
     1141 +	u64 i_es_seq;			/* Change counter for extents.
     1142 +					   Protected by i_es_lock */
1141 1143 
1142 1144 	/* ialloc */
1143 1145 	ext4_group_t i_last_alloc_group;
+21 -4
fs/ext4/extents_status.c
···
 235  235 	return es->es_lblk + es->es_len - 1;
 236  236 }
 237  237 
      238 +static inline void ext4_es_inc_seq(struct inode *inode)
      239 +{
      240 +	struct ext4_inode_info *ei = EXT4_I(inode);
      241 +
      242 +	WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
      243 +}
      244 +
 238  245 /*
 239  246  * search through the tree for an delayed extent with a given offset. If
 240  247  * it can't be found, try to find next extent.
···
 913  906 	newes.es_lblk = lblk;
 914  907 	newes.es_len = len;
 915  908 	ext4_es_store_pblock_status(&newes, pblk, status);
 916      -	trace_ext4_es_insert_extent(inode, &newes);
 917  909 
 918  910 	ext4_es_insert_extent_check(inode, &newes);
 919  911 
···
 961  955 		}
 962  956 		pending = err3;
 963  957 	}
      958 +	/*
      959 +	 * TODO: For cache on-disk extents, there is no need to increment
      960 +	 * the sequence counter, this requires future optimization.
      961 +	 */
      962 +	ext4_es_inc_seq(inode);
 964  963 error:
 965  964 	write_unlock(&EXT4_I(inode)->i_es_lock);
 966  965 	/*
···
 992  981 	if (err1 || err2 || err3 < 0)
 993  982 		goto retry;
 994  983 
      984 +	trace_ext4_es_insert_extent(inode, &newes);
 995  985 	ext4_es_print_tree(inode);
 996  986 	return;
 997  987 }
···
1562 1550 	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1563 1551 		return;
1564 1552 
1565      -	trace_ext4_es_remove_extent(inode, lblk, len);
1566 1553 	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
1567 1554 		 lblk, len, inode->i_ino);
1568 1555 
···
1581 1570 	 */
1582 1571 	write_lock(&EXT4_I(inode)->i_es_lock);
1583 1572 	err = __es_remove_extent(inode, lblk, end, &reserved, es);
     1573 +	if (err)
     1574 +		goto error;
1584 1575 	/* Free preallocated extent if it didn't get used. */
1585 1576 	if (es) {
1586 1577 		if (!es->es_len)
1587 1578 			__es_free_extent(es);
1588 1579 		es = NULL;
1589 1580 	}
     1581 +	ext4_es_inc_seq(inode);
     1582 +error:
1590 1583 	write_unlock(&EXT4_I(inode)->i_es_lock);
1591 1584 	if (err)
1592 1585 		goto retry;
1593 1586 
     1587 +	trace_ext4_es_remove_extent(inode, lblk, len);
1594 1588 	ext4_es_print_tree(inode);
1595 1589 	ext4_da_release_space(inode, reserved);
1596 1590 }
···
2156 2140 	newes.es_lblk = lblk;
2157 2141 	newes.es_len = len;
2158 2142 	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
2159      -	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
2160      -					    end_allocated);
2161 2143 
2162 2144 	ext4_es_insert_extent_check(inode, &newes);
2163 2145 
···
2210 2196 			pr2 = NULL;
2211 2197 		}
2212 2198 	}
     2199 +	ext4_es_inc_seq(inode);
2213 2200 error:
2214 2201 	write_unlock(&EXT4_I(inode)->i_es_lock);
2215 2202 	if (err1 || err2 || err3 < 0)
2216 2203 		goto retry;
2217 2204 
     2205 +	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
     2206 +					    end_allocated);
2218 2207 	ext4_es_print_tree(inode);
2219 2208 	ext4_print_pending_tree(inode);
2220 2209 	return;
+1
fs/ext4/super.c
···
1406 1406 	ei->i_es_all_nr = 0;
1407 1407 	ei->i_es_shk_nr = 0;
1408 1408 	ei->i_es_shrink_lblk = 0;
     1409 +	ei->i_es_seq = 0;
1409 1410 	ei->i_reserved_data_blocks = 0;
1410 1411 	spin_lock_init(&(ei->i_block_reservation_lock));
1411 1412 	ext4_init_pending_tree(&ei->i_pending_tree);
+15 -8
include/trace/events/ext4.h
···
2210 2210 		__field(	ext4_lblk_t,	lblk		)
2211 2211 		__field(	ext4_lblk_t,	len		)
2212 2212 		__field(	ext4_fsblk_t,	pblk		)
2213      -		__field(	char, status	)
     2213 +		__field(	char,		status		)
     2214 +		__field(	u64,		seq		)
2214 2215 	),
2215 2216 
2216 2217 	TP_fast_assign(
···
2221 2220 		__entry->len = es->es_len;
2222 2221 		__entry->pblk = ext4_es_show_pblock(es);
2223 2222 		__entry->status = ext4_es_status(es);
     2223 +		__entry->seq = EXT4_I(inode)->i_es_seq;
2224 2224 	),
2225 2225 
2226      -	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
     2226 +	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %llu",
2227 2227 		  MAJOR(__entry->dev), MINOR(__entry->dev),
2228 2228 		  (unsigned long) __entry->ino,
2229 2229 		  __entry->lblk, __entry->len,
2230      -		  __entry->pblk, show_extent_status(__entry->status))
     2230 +		  __entry->pblk, show_extent_status(__entry->status),
     2231 +		  __entry->seq)
2231 2232 );
2232 2233 
2233 2234 DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
···
2254 2251 		__field(	ino_t,	ino			)
2255 2252 		__field(	loff_t,	lblk			)
2256 2253 		__field(	loff_t,	len			)
     2254 +		__field(	u64,	seq			)
2257 2255 	),
2258 2256 
2259 2257 	TP_fast_assign(
···
2262 2258 		__entry->ino	= inode->i_ino;
2263 2259 		__entry->lblk	= lblk;
2264 2260 		__entry->len	= len;
     2261 +		__entry->seq	= EXT4_I(inode)->i_es_seq;
2265 2262 	),
2266 2263 
2267      -	TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
     2264 +	TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %llu",
2268 2265 		  MAJOR(__entry->dev), MINOR(__entry->dev),
2269 2266 		  (unsigned long) __entry->ino,
2270      -		  __entry->lblk, __entry->len)
     2267 +		  __entry->lblk, __entry->len, __entry->seq)
2271 2268 );
2272 2269 
2273 2270 TRACE_EVENT(ext4_es_find_extent_range_enter,
···
2528 2523 		__field(	char,	status		)
2529 2524 		__field(	bool,	lclu_allocated	)
2530 2525 		__field(	bool,	end_allocated	)
     2526 +		__field(	u64,	seq		)
2531 2527 	),
2532 2528 
2533 2529 	TP_fast_assign(
···
2540 2534 		__entry->status = ext4_es_status(es);
2541 2535 		__entry->lclu_allocated = lclu_allocated;
2542 2536 		__entry->end_allocated = end_allocated;
     2537 +		__entry->seq = EXT4_I(inode)->i_es_seq;
2543 2538 	),
2544 2539 
2545      -	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
2546      -		  "allocated %d %d",
     2540 +	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu",
2547 2541 		  MAJOR(__entry->dev), MINOR(__entry->dev),
2548 2542 		  (unsigned long) __entry->ino,
2549 2543 		  __entry->lblk, __entry->len,
2550 2544 		  __entry->pblk, show_extent_status(__entry->status),
2551      -		  __entry->lclu_allocated, __entry->end_allocated)
     2545 +		  __entry->lclu_allocated, __entry->end_allocated,
     2546 +		  __entry->seq)
2552 2547 );
2553 2548 
2554 2549 /* fsmap traces */