Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: track all extent status in extent status tree

By recording the physical block and status, the extent status tree is
able to track the status of every extent. When we call the _map_blocks
functions to look up an extent or create a new written/unwritten/delayed
extent, this extent will be inserted into the extent status tree.

We don't load all extents from disk in alloc_inode() because it costs
too much memory, and if a file is opened and closed frequently it will
take too much time to load all of the extent information. So currently,
when we create or look up an extent, this extent will be inserted into
the extent status tree. Hence, the extent status tree may not
comprehensively contain all of the extents found in the file.

One condition we need to take care of is that an extent might have both
unwritten and delayed status simultaneously, because an extent that is
delayed allocated could also be allocated by fallocate. In this case we
need to keep the delayed status because later we need it to update the
delayed reservation space.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jan Kara <jack@suse.cz>

authored by

Zheng Liu and committed by
Theodore Ts'o
f7fec032 a25a4e1a

+69 -30
+3
fs/ext4/ext4.h
··· 2602 2602 struct ext4_ext_path *); 2603 2603 extern void ext4_ext_drop_refs(struct ext4_ext_path *); 2604 2604 extern int ext4_ext_check_inode(struct inode *inode); 2605 + extern int ext4_find_delalloc_range(struct inode *inode, 2606 + ext4_lblk_t lblk_start, 2607 + ext4_lblk_t lblk_end); 2605 2608 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); 2606 2609 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2607 2610 __u64 start, __u64 len);
+15 -5
fs/ext4/extents.c
··· 2076 2076 break; 2077 2077 } 2078 2078 2079 - /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ 2080 - if (next == next_del) { 2079 + /* 2080 + * This is possible iff next == next_del == EXT_MAX_BLOCKS. 2081 + * we need to check next == EXT_MAX_BLOCKS because it is 2082 + * possible that an extent is with unwritten and delayed 2083 + * status due to when an extent is delayed allocated and 2084 + * is allocated by fallocate status tree will track both of 2085 + * them in a extent. 2086 + * 2087 + * So we could return a unwritten and delayed extent, and 2088 + * its block is equal to 'next'. 2089 + */ 2090 + if (next == next_del && next == EXT_MAX_BLOCKS) { 2081 2091 flags |= FIEMAP_EXTENT_LAST; 2082 2092 if (unlikely(next_del != EXT_MAX_BLOCKS || 2083 2093 next != EXT_MAX_BLOCKS)) { ··· 3532 3522 * 3533 3523 * Return 1 if there is a delalloc block in the range, otherwise 0. 3534 3524 */ 3535 - static int ext4_find_delalloc_range(struct inode *inode, 3536 - ext4_lblk_t lblk_start, 3537 - ext4_lblk_t lblk_end) 3525 + int ext4_find_delalloc_range(struct inode *inode, 3526 + ext4_lblk_t lblk_start, 3527 + ext4_lblk_t lblk_end) 3538 3528 { 3539 3529 struct extent_status es; 3540 3530
+51 -25
fs/ext4/inode.c
··· 526 526 retval = ext4_ind_map_blocks(handle, inode, map, flags & 527 527 EXT4_GET_BLOCKS_KEEP_SIZE); 528 528 } 529 + if (retval > 0) { 530 + int ret; 531 + unsigned long long status; 532 + 533 + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 534 + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 535 + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && 536 + ext4_find_delalloc_range(inode, map->m_lblk, 537 + map->m_lblk + map->m_len - 1)) 538 + status |= EXTENT_STATUS_DELAYED; 539 + ret = ext4_es_insert_extent(inode, map->m_lblk, 540 + map->m_len, map->m_pblk, status); 541 + if (ret < 0) 542 + retval = ret; 543 + } 529 544 if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 530 545 up_read((&EXT4_I(inode)->i_data_sem)); 531 546 532 547 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 533 - int ret; 534 - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 535 - /* delayed alloc may be allocated by fallocate and 536 - * coverted to initialized by directIO. 537 - * we need to handle delayed extent here. 538 - */ 539 - down_write((&EXT4_I(inode)->i_data_sem)); 540 - goto delayed_mapped; 541 - } 542 - ret = check_block_validity(inode, map); 548 + int ret = check_block_validity(inode, map); 543 549 if (ret != 0) 544 550 return ret; 545 551 } ··· 614 608 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 615 609 ext4_da_update_reserve_space(inode, retval, 1); 616 610 } 617 - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 611 + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 618 612 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 619 613 620 - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 621 - int ret; 622 - delayed_mapped: 623 - /* delayed allocation blocks has been allocated */ 624 - ret = ext4_es_remove_extent(inode, map->m_lblk, 625 - map->m_len); 626 - if (ret < 0) 627 - retval = ret; 628 - } 614 + if (retval > 0) { 615 + int ret; 616 + unsigned long long status; 617 + 618 + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
619 + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 620 + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && 621 + ext4_find_delalloc_range(inode, map->m_lblk, 622 + map->m_lblk + map->m_len - 1)) 623 + status |= EXTENT_STATUS_DELAYED; 624 + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 625 + map->m_pblk, status); 626 + if (ret < 0) 627 + retval = ret; 629 628 } 630 629 631 630 up_write((&EXT4_I(inode)->i_data_sem)); ··· 1776 1765 retval = ext4_ind_map_blocks(NULL, inode, map, 0); 1777 1766 1778 1767 if (retval == 0) { 1768 + int ret; 1779 1769 /* 1780 1770 * XXX: __block_prepare_write() unmaps passed block, 1781 1771 * is it OK? ··· 1784 1772 /* If the block was allocated from previously allocated cluster, 1785 1773 * then we dont need to reserve it again. */ 1786 1774 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { 1787 - retval = ext4_da_reserve_space(inode, iblock); 1788 - if (retval) 1775 + ret = ext4_da_reserve_space(inode, iblock); 1776 + if (ret) { 1789 1777 /* not enough space to reserve */ 1778 + retval = ret; 1790 1779 goto out_unlock; 1780 + } 1791 1781 } 1792 1782 1793 - retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1794 - ~0, EXTENT_STATUS_DELAYED); 1795 - if (retval) 1783 + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1784 + ~0, EXTENT_STATUS_DELAYED); 1785 + if (ret) { 1786 + retval = ret; 1796 1787 goto out_unlock; 1788 + } 1797 1789 1798 1790 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served 1799 1791 * and it should not appear on the bh->b_state. ··· 1807 1791 map_bh(bh, inode->i_sb, invalid_block); 1808 1792 set_buffer_new(bh); 1809 1793 set_buffer_delay(bh); 1794 + } else if (retval > 0) { 1795 + int ret; 1796 + unsigned long long status; 1797 + 1798 + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
1799 + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 1800 + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1801 + map->m_pblk, status); 1802 + if (ret != 0) 1803 + retval = ret; 1810 1804 } 1811 1805 1812 1806 out_unlock: