Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: lookup block mapping in extent status tree

After tracking all extent status, we already have an extent cache in
memory. Every time we want to look up a block mapping, we can first
try to look it up in the extent status tree to avoid a potential disk I/O.

A new function called ext4_es_lookup_extent is defined to finish this
work. When we try to look up a block mapping, we always call
ext4_map_blocks and/or ext4_da_map_blocks. So in these functions we
first try to look up the block mapping in the extent status tree.

A new flag EXT4_GET_BLOCKS_NO_PUT_HOLE is used in ext4_da_map_blocks
in order not to put a hole into the extent status tree, because this hole
will be converted to a delayed extent in the tree immediately.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>

authored by

Zheng Liu and committed by
Theodore Ts'o
d100eef2 f7fec032

+192 -3
+2
fs/ext4/ext4.h
··· 579 579 #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 580 580 /* Do not take i_data_sem locking in ext4_map_blocks */ 581 581 #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 582 + /* Do not put hole in extent cache */ 583 + #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 582 584 583 585 /* 584 586 * Flags used by ext4_free_blocks
+8 -1
fs/ext4/extents.c
··· 2167 2167 block, 2168 2168 le32_to_cpu(ex->ee_block), 2169 2169 ext4_ext_get_actual_len(ex)); 2170 + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) 2171 + ext4_es_insert_extent(inode, lblock, len, ~0, 2172 + EXTENT_STATUS_HOLE); 2170 2173 } else if (block >= le32_to_cpu(ex->ee_block) 2171 2174 + ext4_ext_get_actual_len(ex)) { 2172 2175 ext4_lblk_t next; ··· 2183 2180 block); 2184 2181 BUG_ON(next == lblock); 2185 2182 len = next - lblock; 2183 + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) 2184 + ext4_es_insert_extent(inode, lblock, len, ~0, 2185 + EXTENT_STATUS_HOLE); 2186 2186 } else { 2187 2187 lblock = len = 0; 2188 2188 BUG(); ··· 4024 4018 * put just found gap into cache to speed up 4025 4019 * subsequent requests 4026 4020 */ 4027 - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 4021 + if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) 4022 + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 4028 4023 goto out2; 4029 4024 } 4030 4025
+60
fs/ext4/extents_status.c
··· 461 461 return err; 462 462 } 463 463 464 + /* 465 + * ext4_es_lookup_extent() looks up an extent in extent status tree. 466 + * 467 + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. 468 + * 469 + * Return: 1 on found, 0 on not 470 + */ 471 + int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 472 + struct extent_status *es) 473 + { 474 + struct ext4_es_tree *tree; 475 + struct extent_status *es1 = NULL; 476 + struct rb_node *node; 477 + int found = 0; 478 + 479 + trace_ext4_es_lookup_extent_enter(inode, lblk); 480 + es_debug("lookup extent in block %u\n", lblk); 481 + 482 + tree = &EXT4_I(inode)->i_es_tree; 483 + read_lock(&EXT4_I(inode)->i_es_lock); 484 + 485 + /* find extent in cache firstly */ 486 + es->es_lblk = es->es_len = es->es_pblk = 0; 487 + if (tree->cache_es) { 488 + es1 = tree->cache_es; 489 + if (in_range(lblk, es1->es_lblk, es1->es_len)) { 490 + es_debug("%u cached by [%u/%u)\n", 491 + lblk, es1->es_lblk, es1->es_len); 492 + found = 1; 493 + goto out; 494 + } 495 + } 496 + 497 + node = tree->root.rb_node; 498 + while (node) { 499 + es1 = rb_entry(node, struct extent_status, rb_node); 500 + if (lblk < es1->es_lblk) 501 + node = node->rb_left; 502 + else if (lblk > ext4_es_end(es1)) 503 + node = node->rb_right; 504 + else { 505 + found = 1; 506 + break; 507 + } 508 + } 509 + 510 + out: 511 + if (found) { 512 + BUG_ON(!es1); 513 + es->es_lblk = es1->es_lblk; 514 + es->es_len = es1->es_len; 515 + es->es_pblk = es1->es_pblk; 516 + } 517 + 518 + read_unlock(&EXT4_I(inode)->i_es_lock); 519 + 520 + trace_ext4_es_lookup_extent_exit(inode, es, found); 521 + return found; 522 + } 523 + 464 524 static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, 465 525 ext4_lblk_t end) 466 526 {
+2
fs/ext4/extents_status.h
··· 53 53 ext4_lblk_t len); 54 54 extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, 55 55 struct extent_status *es); 56 + extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 57 + struct extent_status *es); 56 58 57 59 static inline int ext4_es_is_written(struct extent_status *es) 58 60 {
+64 -2
fs/ext4/inode.c
··· 507 507 int ext4_map_blocks(handle_t *handle, struct inode *inode, 508 508 struct ext4_map_blocks *map, int flags) 509 509 { 510 + struct extent_status es; 510 511 int retval; 511 512 512 513 map->m_flags = 0; 513 514 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," 514 515 "logical block %lu\n", inode->i_ino, flags, map->m_len, 515 516 (unsigned long) map->m_lblk); 517 + 518 + /* Lookup extent status tree firstly */ 519 + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 520 + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 521 + map->m_pblk = ext4_es_pblock(&es) + 522 + map->m_lblk - es.es_lblk; 523 + map->m_flags |= ext4_es_is_written(&es) ? 524 + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; 525 + retval = es.es_len - (map->m_lblk - es.es_lblk); 526 + if (retval > map->m_len) 527 + retval = map->m_len; 528 + map->m_len = retval; 529 + } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { 530 + retval = 0; 531 + } else { 532 + BUG_ON(1); 533 + } 534 + goto found; 535 + } 536 + 516 537 /* 517 538 * Try to see if we can get the block without requesting a new 518 539 * file system block. 
··· 565 544 if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 566 545 up_read((&EXT4_I(inode)->i_data_sem)); 567 546 547 + found: 568 548 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 569 549 int ret = check_block_validity(inode, map); 570 550 if (ret != 0) ··· 1765 1743 struct ext4_map_blocks *map, 1766 1744 struct buffer_head *bh) 1767 1745 { 1746 + struct extent_status es; 1768 1747 int retval; 1769 1748 sector_t invalid_block = ~((sector_t) 0xffff); 1770 1749 ··· 1776 1753 ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," 1777 1754 "logical block %lu\n", inode->i_ino, map->m_len, 1778 1755 (unsigned long) map->m_lblk); 1756 + 1757 + /* Lookup extent status tree firstly */ 1758 + if (ext4_es_lookup_extent(inode, iblock, &es)) { 1759 + 1760 + if (ext4_es_is_hole(&es)) { 1761 + retval = 0; 1762 + down_read((&EXT4_I(inode)->i_data_sem)); 1763 + goto add_delayed; 1764 + } 1765 + 1766 + /* 1767 + * Delayed extent could be allocated by fallocate. 1768 + * So we need to check it. 1769 + */ 1770 + if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { 1771 + map_bh(bh, inode->i_sb, invalid_block); 1772 + set_buffer_new(bh); 1773 + set_buffer_delay(bh); 1774 + return 0; 1775 + } 1776 + 1777 + map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; 1778 + retval = es.es_len - (iblock - es.es_lblk); 1779 + if (retval > map->m_len) 1780 + retval = map->m_len; 1781 + map->m_len = retval; 1782 + if (ext4_es_is_written(&es)) 1783 + map->m_flags |= EXT4_MAP_MAPPED; 1784 + else if (ext4_es_is_unwritten(&es)) 1785 + map->m_flags |= EXT4_MAP_UNWRITTEN; 1786 + else 1787 + BUG_ON(1); 1788 + 1789 + return retval; 1790 + } 1791 + 1779 1792 /* 1780 1793 * Try to see if we can get the block without requesting a new 1781 1794 * file system block. 
··· 1830 1771 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 1831 1772 retval = 0; 1832 1773 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1833 - retval = ext4_ext_map_blocks(NULL, inode, map, 0); 1774 + retval = ext4_ext_map_blocks(NULL, inode, map, 1775 + EXT4_GET_BLOCKS_NO_PUT_HOLE); 1834 1776 else 1835 - retval = ext4_ind_map_blocks(NULL, inode, map, 0); 1777 + retval = ext4_ind_map_blocks(NULL, inode, map, 1778 + EXT4_GET_BLOCKS_NO_PUT_HOLE); 1836 1779 1780 + add_delayed: 1837 1781 if (retval == 0) { 1838 1782 int ret; 1839 1783 /*
+56
include/trace/events/ext4.h
··· 2199 2199 __entry->pblk, __entry->status) 2200 2200 ); 2201 2201 2202 + TRACE_EVENT(ext4_es_lookup_extent_enter, 2203 + TP_PROTO(struct inode *inode, ext4_lblk_t lblk), 2204 + 2205 + TP_ARGS(inode, lblk), 2206 + 2207 + TP_STRUCT__entry( 2208 + __field( dev_t, dev ) 2209 + __field( ino_t, ino ) 2210 + __field( ext4_lblk_t, lblk ) 2211 + ), 2212 + 2213 + TP_fast_assign( 2214 + __entry->dev = inode->i_sb->s_dev; 2215 + __entry->ino = inode->i_ino; 2216 + __entry->lblk = lblk; 2217 + ), 2218 + 2219 + TP_printk("dev %d,%d ino %lu lblk %u", 2220 + MAJOR(__entry->dev), MINOR(__entry->dev), 2221 + (unsigned long) __entry->ino, __entry->lblk) 2222 + ); 2223 + 2224 + TRACE_EVENT(ext4_es_lookup_extent_exit, 2225 + TP_PROTO(struct inode *inode, struct extent_status *es, 2226 + int found), 2227 + 2228 + TP_ARGS(inode, es, found), 2229 + 2230 + TP_STRUCT__entry( 2231 + __field( dev_t, dev ) 2232 + __field( ino_t, ino ) 2233 + __field( ext4_lblk_t, lblk ) 2234 + __field( ext4_lblk_t, len ) 2235 + __field( ext4_fsblk_t, pblk ) 2236 + __field( unsigned long long, status ) 2237 + __field( int, found ) 2238 + ), 2239 + 2240 + TP_fast_assign( 2241 + __entry->dev = inode->i_sb->s_dev; 2242 + __entry->ino = inode->i_ino; 2243 + __entry->lblk = es->es_lblk; 2244 + __entry->len = es->es_len; 2245 + __entry->pblk = ext4_es_pblock(es); 2246 + __entry->status = ext4_es_status(es); 2247 + __entry->found = found; 2248 + ), 2249 + 2250 + TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %llx", 2251 + MAJOR(__entry->dev), MINOR(__entry->dev), 2252 + (unsigned long) __entry->ino, __entry->found, 2253 + __entry->lblk, __entry->len, 2254 + __entry->found ? __entry->pblk : 0, 2255 + __entry->found ? __entry->status : 0) 2256 + ); 2257 + 2202 2258 #endif /* _TRACE_EXT4_H */ 2203 2259 2204 2260 /* This part must be outside protection */