Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 update from Ted Ts'o:
"There are two major features for this merge window. The first is
inline data, which allows small files or directories to be stored in
the in-inode extended attribute area. (This requires that the file
system use inodes which are at least 256 bytes; 128 byte
inodes do not have any room for in-inode xattrs.)

The second new feature is SEEK_HOLE/SEEK_DATA support. This is
enabled by the extent status tree patches, and this infrastructure
will be used to further optimize ext4 in the future.

Beyond that, we have the usual collection of code cleanups and bug
fixes."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (63 commits)
ext4: zero out inline data using memset() instead of empty_zero_page
ext4: ensure Inode flags consistency are checked at build time
ext4: Remove CONFIG_EXT4_FS_XATTR
ext4: remove unused variable from ext4_ext_in_cache()
ext4: remove redundant initialization in ext4_fill_super()
ext4: remove redundant code in ext4_alloc_inode()
ext4: use sync_inode_metadata() when syncing inode metadata
ext4: enable ext4 inline support
ext4: let fallocate handle inline data correctly
ext4: let ext4_truncate handle inline data correctly
ext4: evict inline data out if we need to strore xattr in inode
ext4: let fiemap work with inline data
ext4: let ext4_rename handle inline dir
ext4: let empty_dir handle inline dir
ext4: let ext4_delete_entry() handle inline data
ext4: make ext4_delete_entry generic
ext4: let ext4_find_entry handle inline data
ext4: create a new function search_dir
ext4: let ext4_readdir handle inline data
ext4: let add_dir_entry handle inline data properly
...

+4176 -1107
+3 -6
Documentation/filesystems/ext4.txt
··· 200 200 table readahead algorithm will pre-read into 201 201 the buffer cache. The default value is 32 blocks. 202 202 203 - nouser_xattr Disables Extended User Attributes. If you have extended 204 - attribute support enabled in the kernel configuration 205 - (CONFIG_EXT4_FS_XATTR), extended attribute support 206 - is enabled by default on mount. See the attr(5) manual 207 - page and http://acl.bestbits.at/ for more information 208 - about extended attributes. 203 + nouser_xattr Disables Extended User Attributes. See the 204 + attr(5) manual page and http://acl.bestbits.at/ 205 + for more information about extended attributes. 209 206 210 207 noacl This option disables POSIX Access Control List 211 208 support. If ACL support is enabled in the kernel
+2 -2
fs/Kconfig
··· 28 28 tristate 29 29 default y if EXT2_FS=y && EXT2_FS_XATTR 30 30 default y if EXT3_FS=y && EXT3_FS_XATTR 31 - default y if EXT4_FS=y && EXT4_FS_XATTR 32 - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR 31 + default y if EXT4_FS=y 32 + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS 33 33 34 34 source "fs/reiserfs/Kconfig" 35 35 source "fs/jfs/Kconfig"
-15
fs/ext4/Kconfig
··· 39 39 compiled kernel size by using one file system driver for 40 40 ext2, ext3, and ext4 file systems. 41 41 42 - config EXT4_FS_XATTR 43 - bool "Ext4 extended attributes" 44 - depends on EXT4_FS 45 - default y 46 - help 47 - Extended attributes are name:value pairs associated with inodes by 48 - the kernel or by users (see the attr(5) manual page, or visit 49 - <http://acl.bestbits.at/> for details). 50 - 51 - If unsure, say N. 52 - 53 - You need this for POSIX ACL support on ext4. 54 - 55 42 config EXT4_FS_POSIX_ACL 56 43 bool "Ext4 POSIX Access Control Lists" 57 - depends on EXT4_FS_XATTR 58 44 select FS_POSIX_ACL 59 45 help 60 46 POSIX Access Control Lists (ACLs) support permissions for users and ··· 53 67 54 68 config EXT4_FS_SECURITY 55 69 bool "Ext4 Security Labels" 56 - depends on EXT4_FS_XATTR 57 70 help 58 71 Security labels support alternative access control models 59 72 implemented by security modules like SELinux. This option
+2 -2
fs/ext4/Makefile
··· 7 7 ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 8 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 9 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 10 - mmp.o indirect.o 10 + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ 11 + xattr_trusted.o inline.o 11 12 12 - ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 13 13 ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 14 14 ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+4 -2
fs/ext4/acl.c
··· 423 423 424 424 retry: 425 425 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 426 - if (IS_ERR(handle)) 427 - return PTR_ERR(handle); 426 + if (IS_ERR(handle)) { 427 + error = PTR_ERR(handle); 428 + goto release_and_out; 429 + } 428 430 error = ext4_set_acl(handle, inode, type, acl); 429 431 ext4_journal_stop(handle); 430 432 if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+20 -21
fs/ext4/dir.c
··· 27 27 #include <linux/slab.h> 28 28 #include <linux/rbtree.h> 29 29 #include "ext4.h" 30 - 31 - static unsigned char ext4_filetype_table[] = { 32 - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 33 - }; 30 + #include "xattr.h" 34 31 35 32 static int ext4_dx_readdir(struct file *filp, 36 33 void *dirent, filldir_t filldir); 37 - 38 - static unsigned char get_dtype(struct super_block *sb, int filetype) 39 - { 40 - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || 41 - (filetype >= EXT4_FT_MAX)) 42 - return DT_UNKNOWN; 43 - 44 - return (ext4_filetype_table[filetype]); 45 - } 46 34 47 35 /** 48 36 * Check if the given dir-inode refers to an htree-indexed directory ··· 56 68 * Return 0 if the directory entry is OK, and 1 if there is a problem 57 69 * 58 70 * Note: this is the opposite of what ext2 and ext3 historically returned... 71 + * 72 + * bh passed here can be an inode block or a dir data block, depending 73 + * on the inode inline data flag. 
59 74 */ 60 75 int __ext4_check_dir_entry(const char *function, unsigned int line, 61 76 struct inode *dir, struct file *filp, 62 77 struct ext4_dir_entry_2 *de, 63 - struct buffer_head *bh, 78 + struct buffer_head *bh, char *buf, int size, 64 79 unsigned int offset) 65 80 { 66 81 const char *error_msg = NULL; ··· 76 85 error_msg = "rec_len % 4 != 0"; 77 86 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) 78 87 error_msg = "rec_len is too small for name_len"; 79 - else if (unlikely(((char *) de - bh->b_data) + rlen > 80 - dir->i_sb->s_blocksize)) 81 - error_msg = "directory entry across blocks"; 88 + else if (unlikely(((char *) de - buf) + rlen > size)) 89 + error_msg = "directory entry across range"; 82 90 else if (unlikely(le32_to_cpu(de->inode) > 83 91 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) 84 92 error_msg = "inode out of bounds"; ··· 88 98 ext4_error_file(filp, function, line, bh->b_blocknr, 89 99 "bad entry in directory: %s - offset=%u(%u), " 90 100 "inode=%u, rec_len=%d, name_len=%d", 91 - error_msg, (unsigned) (offset % bh->b_size), 101 + error_msg, (unsigned) (offset % size), 92 102 offset, le32_to_cpu(de->inode), 93 103 rlen, de->name_len); 94 104 else 95 105 ext4_error_inode(dir, function, line, bh->b_blocknr, 96 106 "bad entry in directory: %s - offset=%u(%u), " 97 107 "inode=%u, rec_len=%d, name_len=%d", 98 - error_msg, (unsigned) (offset % bh->b_size), 108 + error_msg, (unsigned) (offset % size), 99 109 offset, le32_to_cpu(de->inode), 100 110 rlen, de->name_len); 101 111 ··· 114 124 struct super_block *sb = inode->i_sb; 115 125 int ret = 0; 116 126 int dir_has_error = 0; 127 + 128 + if (ext4_has_inline_data(inode)) { 129 + int has_inline_data = 1; 130 + ret = ext4_read_inline_dir(filp, dirent, filldir, 131 + &has_inline_data); 132 + if (has_inline_data) 133 + return ret; 134 + } 117 135 118 136 if (is_dx_dir(inode)) { 119 137 err = ext4_dx_readdir(filp, dirent, filldir); ··· 219 221 while (!error && filp->f_pos < inode->i_size 
220 222 && offset < sb->s_blocksize) { 221 223 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 222 - if (ext4_check_dir_entry(inode, filp, de, 223 - bh, offset)) { 224 + if (ext4_check_dir_entry(inode, filp, de, bh, 225 + bh->b_data, bh->b_size, 226 + offset)) { 224 227 /* 225 228 * On error, skip the f_pos to the next block 226 229 */
+135 -28
fs/ext4/ext4.h
··· 57 57 #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) 58 58 #endif 59 59 60 + /* 61 + * Turn on EXT_DEBUG to get lots of info about extents operations. 62 + */ 63 + #define EXT_DEBUG__ 64 + #ifdef EXT_DEBUG 65 + #define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) 66 + #else 67 + #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) 68 + #endif 69 + 60 70 #define EXT4_ERROR_INODE(inode, fmt, a...) \ 61 71 ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) 62 72 ··· 402 392 #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 403 393 #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ 404 394 #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ 395 + #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ 405 396 #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 406 397 407 398 #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ ··· 459 448 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ 460 449 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ 461 450 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ 451 + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ 462 452 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ 463 453 }; 464 454 465 - #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) 466 - #define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ 467 - printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ 468 - EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } 469 - 470 455 /* 471 - * Since it's pretty easy to mix up bit numbers and hex values, and we 472 - * can't do a compile-time test for ENUM values, we use a run-time 473 - * test to make sure that EXT4_XXX_FL is consistent with respect to 474 - * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop 475 - * out so it won't cost any extra space in the compiled kernel image. 
476 - * But it's important that these values are the same, since we are 477 - * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL 478 - * must be consistent with the values of FS_XXX_FL defined in 479 - * include/linux/fs.h and the on-disk values found in ext2, ext3, and 480 - * ext4 filesystems, and of course the values defined in e2fsprogs. 456 + * Since it's pretty easy to mix up bit numbers and hex values, we use a 457 + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to 458 + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost 459 + * any extra space in the compiled kernel image, otherwise, the build will fail. 460 + * It's important that these values are the same, since we are using 461 + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent 462 + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk 463 + * values found in ext2, ext3 and ext4 filesystems, and of course the values 464 + * defined in e2fsprogs. 481 465 * 482 466 * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) 483 467 */ 468 + #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) 469 + #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) 470 + 484 471 static inline void ext4_check_flag_values(void) 485 472 { 486 473 CHECK_FLAG_VALUE(SECRM); ··· 503 494 CHECK_FLAG_VALUE(EXTENTS); 504 495 CHECK_FLAG_VALUE(EA_INODE); 505 496 CHECK_FLAG_VALUE(EOFBLOCKS); 497 + CHECK_FLAG_VALUE(INLINE_DATA); 506 498 CHECK_FLAG_VALUE(RESERVED); 507 499 } 508 500 ··· 821 811 __u32 ec_len; /* must be 32bit to return holes */ 822 812 }; 823 813 814 + #include "extents_status.h" 815 + 824 816 /* 825 817 * fourth extended file system inode data in memory 826 818 */ ··· 845 833 #endif 846 834 unsigned long i_flags; 847 835 848 - #ifdef CONFIG_EXT4_FS_XATTR 849 836 /* 850 837 * Extended attributes can be read independently of the main file 851 838 * data. Taking i_mutex even when reading would cause contention ··· 853 842 * EAs. 854 843 */ 855 844 struct rw_semaphore xattr_sem; 856 - #endif 857 845 858 846 struct list_head i_orphan; /* unlinked but open inodes */ 859 847 ··· 898 888 struct list_head i_prealloc_list; 899 889 spinlock_t i_prealloc_lock; 900 890 891 + /* extents status tree */ 892 + struct ext4_es_tree i_es_tree; 893 + rwlock_t i_es_lock; 894 + 901 895 /* ialloc */ 902 896 ext4_group_t i_last_alloc_group; 903 897 ··· 915 901 916 902 /* on-disk additional length */ 917 903 __u16 i_extra_isize; 904 + 905 + /* Indicate the inline data space. 
*/ 906 + u16 i_inline_off; 907 + u16 i_inline_size; 918 908 919 909 #ifdef CONFIG_QUOTA 920 910 /* quota space reservation, managed internally by quota code */ ··· 1378 1360 EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ 1379 1361 EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read 1380 1362 nolocking */ 1363 + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ 1381 1364 }; 1382 1365 1383 1366 #define EXT4_INODE_BIT_FNS(name, field, offset) \ ··· 1500 1481 #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ 1501 1482 #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ 1502 1483 #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ 1503 - #define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ 1484 + #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ 1504 1485 1505 1486 #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR 1506 1487 #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ ··· 1524 1505 EXT4_FEATURE_INCOMPAT_EXTENTS| \ 1525 1506 EXT4_FEATURE_INCOMPAT_64BIT| \ 1526 1507 EXT4_FEATURE_INCOMPAT_FLEX_BG| \ 1527 - EXT4_FEATURE_INCOMPAT_MMP) 1508 + EXT4_FEATURE_INCOMPAT_MMP | \ 1509 + EXT4_FEATURE_INCOMPAT_INLINE_DATA) 1528 1510 #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 1529 1511 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 1530 1512 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ ··· 1611 1591 __u8 det_reserved_ft; /* 0xDE, fake file type */ 1612 1592 __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ 1613 1593 }; 1594 + 1595 + #define EXT4_DIRENT_TAIL(block, blocksize) \ 1596 + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ 1597 + ((blocksize) - \ 1598 + sizeof(struct ext4_dir_entry_tail)))) 1614 1599 1615 1600 /* 1616 1601 * Ext4 directory file types. Only the low 3 bits are used. 
The ··· 1961 1936 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, 1962 1937 struct file *, 1963 1938 struct ext4_dir_entry_2 *, 1964 - struct buffer_head *, unsigned int); 1965 - #define ext4_check_dir_entry(dir, filp, de, bh, offset) \ 1939 + struct buffer_head *, char *, int, 1940 + unsigned int); 1941 + #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ 1966 1942 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ 1967 - (de), (bh), (offset))) 1943 + (de), (bh), (buf), (size), (offset))) 1968 1944 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1969 1945 __u32 minor_hash, 1970 1946 struct ext4_dir_entry_2 *dirent); 1971 1947 extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1948 + extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, 1949 + struct buffer_head *bh, 1950 + void *buf, int buf_size, 1951 + const char *name, int namelen, 1952 + struct ext4_dir_entry_2 **dest_de); 1953 + void ext4_insert_dentry(struct inode *inode, 1954 + struct ext4_dir_entry_2 *de, 1955 + int buf_size, 1956 + const char *name, int namelen); 1957 + static inline void ext4_update_dx_flag(struct inode *inode) 1958 + { 1959 + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 1960 + EXT4_FEATURE_COMPAT_DIR_INDEX)) 1961 + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); 1962 + } 1963 + static unsigned char ext4_filetype_table[] = { 1964 + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 1965 + }; 1966 + 1967 + static inline unsigned char get_dtype(struct super_block *sb, int filetype) 1968 + { 1969 + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || 1970 + (filetype >= EXT4_FT_MAX)) 1971 + return DT_UNKNOWN; 1972 + 1973 + return ext4_filetype_table[filetype]; 1974 + } 1972 1975 1973 1976 /* fsync.c */ 1974 1977 extern int ext4_sync_file(struct file *, loff_t, loff_t, int); ··· 2047 1994 ext4_lblk_t, int, int *); 2048 1995 struct buffer_head 
*ext4_bread(handle_t *, struct inode *, 2049 1996 ext4_lblk_t, int, int *); 1997 + int ext4_get_block_write(struct inode *inode, sector_t iblock, 1998 + struct buffer_head *bh_result, int create); 2050 1999 int ext4_get_block(struct inode *inode, sector_t iblock, 2051 2000 struct buffer_head *bh_result, int create); 2001 + int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2002 + struct buffer_head *bh, int create); 2003 + int ext4_walk_page_buffers(handle_t *handle, 2004 + struct buffer_head *head, 2005 + unsigned from, 2006 + unsigned to, 2007 + int *partial, 2008 + int (*fn)(handle_t *handle, 2009 + struct buffer_head *bh)); 2010 + int do_journal_get_write_access(handle_t *handle, 2011 + struct buffer_head *bh); 2012 + #define FALL_BACK_TO_NONDELALLOC 1 2013 + #define CONVERT_INLINE_DATA 2 2052 2014 2053 2015 extern struct inode *ext4_iget(struct super_block *, unsigned long); 2054 2016 extern int ext4_write_inode(struct inode *, struct writeback_control *); ··· 2118 2050 extern int ext4_orphan_del(handle_t *, struct inode *); 2119 2051 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 2120 2052 __u32 start_minor_hash, __u32 *next_hash); 2053 + extern int search_dir(struct buffer_head *bh, 2054 + char *search_buf, 2055 + int buf_size, 2056 + struct inode *dir, 2057 + const struct qstr *d_name, 2058 + unsigned int offset, 2059 + struct ext4_dir_entry_2 **res_dir); 2060 + extern int ext4_generic_delete_entry(handle_t *handle, 2061 + struct inode *dir, 2062 + struct ext4_dir_entry_2 *de_del, 2063 + struct buffer_head *bh, 2064 + void *entry_buf, 2065 + int buf_size, 2066 + int csum_size); 2121 2067 2122 2068 /* resize.c */ 2123 2069 extern int ext4_group_add(struct super_block *sb, ··· 2458 2376 extern const struct inode_operations ext4_dir_inode_operations; 2459 2377 extern const struct inode_operations ext4_special_inode_operations; 2460 2378 extern struct dentry *ext4_get_parent(struct dentry *child); 2379 + extern struct 
ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 2380 + struct ext4_dir_entry_2 *de, 2381 + int blocksize, int csum_size, 2382 + unsigned int parent_ino, int dotdot_real_len); 2383 + extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, 2384 + unsigned int blocksize); 2385 + extern int ext4_handle_dirty_dirent_node(handle_t *handle, 2386 + struct inode *inode, 2387 + struct buffer_head *bh); 2461 2388 2462 2389 /* symlink.c */ 2463 2390 extern const struct inode_operations ext4_symlink_inode_operations; ··· 2484 2393 struct inode *, __le32 *, unsigned int); 2485 2394 2486 2395 /* extents.c */ 2396 + struct ext4_ext_path; 2397 + struct ext4_extent; 2398 + 2487 2399 extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 2488 2400 extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 2489 2401 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, ··· 2504 2410 ssize_t len); 2505 2411 extern int ext4_map_blocks(handle_t *handle, struct inode *inode, 2506 2412 struct ext4_map_blocks *map, int flags); 2413 + extern int ext4_ext_calc_metadata_amount(struct inode *inode, 2414 + ext4_lblk_t lblocks); 2415 + extern int ext4_extent_tree_init(handle_t *, struct inode *); 2416 + extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 2417 + int num, 2418 + struct ext4_ext_path *path); 2419 + extern int ext4_can_extents_be_merged(struct inode *inode, 2420 + struct ext4_extent *ex1, 2421 + struct ext4_extent *ex2); 2422 + extern int ext4_ext_insert_extent(handle_t *, struct inode *, 2423 + struct ext4_ext_path *, 2424 + struct ext4_extent *, int); 2425 + extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 2426 + struct ext4_ext_path *); 2427 + extern void ext4_ext_drop_refs(struct ext4_ext_path *); 2428 + extern int ext4_ext_check_inode(struct inode *inode); 2429 + extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); 2507 2430 extern int 
ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2508 2431 __u64 start, __u64 len); 2432 + 2433 + 2509 2434 /* move_extent.c */ 2510 2435 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2511 2436 __u64 start_orig, __u64 start_donor, ··· 2558 2445 * never, ever appear in a buffer_head's state 2559 2446 * flag. See EXT4_MAP_FROM_CLUSTER to see where 2560 2447 * this is used. */ 2561 - BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This 2562 - * flag is set when ext4_map_blocks is called on a 2563 - * delayed allocated block to get its real mapping. */ 2564 2448 }; 2565 2449 2566 2450 BUFFER_FNS(Uninit, uninit) 2567 2451 TAS_BUFFER_FNS(Uninit, uninit) 2568 - BUFFER_FNS(Da_Mapped, da_mapped) 2569 2452 2570 2453 /* 2571 2454 * Add new method to test whether block and inode bitmaps are properly ··· 2611 2502 extern void ext4_resize_end(struct super_block *sb); 2612 2503 2613 2504 #endif /* __KERNEL__ */ 2614 - 2615 - #include "ext4_extents.h" 2616 2505 2617 2506 #endif /* _EXT4_H */
-40
fs/ext4/ext4_extents.h
··· 43 43 #define CHECK_BINSEARCH__ 44 44 45 45 /* 46 - * Turn on EXT_DEBUG to get lots of info about extents operations. 47 - */ 48 - #define EXT_DEBUG__ 49 - #ifdef EXT_DEBUG 50 - #define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) 51 - #else 52 - #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) 53 - #endif 54 - 55 - /* 56 46 * If EXT_STATS is defined then stats numbers are collected. 57 47 * These number will be displayed at umount time. 58 48 */ ··· 132 142 /* 133 143 * structure for external API 134 144 */ 135 - 136 - /* 137 - * to be called by ext4_ext_walk_space() 138 - * negative retcode - error 139 - * positive retcode - signal for ext4_ext_walk_space(), see below 140 - * callback must return valid extent (passed or newly created) 141 - */ 142 - typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, 143 - struct ext4_ext_cache *, 144 - struct ext4_extent *, void *); 145 - 146 - #define EXT_CONTINUE 0 147 - #define EXT_BREAK 1 148 - #define EXT_REPEAT 2 149 145 150 146 /* 151 147 * Maximum number of logical blocks in a file; ext4_extent's ee_block is ··· 276 300 0xffff); 277 301 } 278 302 279 - extern int ext4_ext_calc_metadata_amount(struct inode *inode, 280 - ext4_lblk_t lblocks); 281 - extern int ext4_extent_tree_init(handle_t *, struct inode *); 282 - extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, 283 - int num, 284 - struct ext4_ext_path *path); 285 - extern int ext4_can_extents_be_merged(struct inode *inode, 286 - struct ext4_extent *ex1, 287 - struct ext4_extent *ex2); 288 - extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); 289 - extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 290 - struct ext4_ext_path *); 291 - extern void ext4_ext_drop_refs(struct ext4_ext_path *); 292 - extern int ext4_ext_check_inode(struct inode *inode); 293 - extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, 
294 - int search_hint_reverse); 295 303 #endif /* _EXT4_EXTENTS */ 296 304
-7
fs/ext4/ext4_jbd2.h
··· 254 254 handle->h_sync = 1; 255 255 } 256 256 257 - static inline void ext4_handle_release_buffer(handle_t *handle, 258 - struct buffer_head *bh) 259 - { 260 - if (ext4_handle_valid(handle)) 261 - jbd2_journal_release_buffer(handle, bh); 262 - } 263 - 264 257 static inline int ext4_handle_is_aborted(handle_t *handle) 265 258 { 266 259 if (ext4_handle_valid(handle))
+148 -332
fs/ext4/extents.c
··· 41 41 #include <asm/uaccess.h> 42 42 #include <linux/fiemap.h> 43 43 #include "ext4_jbd2.h" 44 + #include "ext4_extents.h" 45 + #include "xattr.h" 44 46 45 47 #include <trace/events/ext4.h> 46 48 ··· 110 108 ext4_lblk_t split, 111 109 int split_flag, 112 110 int flags); 111 + 112 + static int ext4_find_delayed_extent(struct inode *inode, 113 + struct ext4_ext_cache *newex); 113 114 114 115 static int ext4_ext_truncate_extend_restart(handle_t *handle, 115 116 struct inode *inode, ··· 1964 1959 return err; 1965 1960 } 1966 1961 1967 - static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, 1968 - ext4_lblk_t num, ext_prepare_callback func, 1969 - void *cbdata) 1962 + static int ext4_fill_fiemap_extents(struct inode *inode, 1963 + ext4_lblk_t block, ext4_lblk_t num, 1964 + struct fiemap_extent_info *fieinfo) 1970 1965 { 1971 1966 struct ext4_ext_path *path = NULL; 1972 - struct ext4_ext_cache cbex; 1967 + struct ext4_ext_cache newex; 1973 1968 struct ext4_extent *ex; 1974 - ext4_lblk_t next, start = 0, end = 0; 1969 + ext4_lblk_t next, next_del, start = 0, end = 0; 1975 1970 ext4_lblk_t last = block + num; 1976 - int depth, exists, err = 0; 1977 - 1978 - BUG_ON(func == NULL); 1979 - BUG_ON(inode == NULL); 1971 + int exists, depth = 0, err = 0; 1972 + unsigned int flags = 0; 1973 + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 1980 1974 1981 1975 while (block < last && block != EXT_MAX_BLOCKS) { 1982 1976 num = last - block; 1983 1977 /* find extent for this block */ 1984 1978 down_read(&EXT4_I(inode)->i_data_sem); 1979 + 1980 + if (path && ext_depth(inode) != depth) { 1981 + /* depth was changed. 
we have to realloc path */ 1982 + kfree(path); 1983 + path = NULL; 1984 + } 1985 + 1985 1986 path = ext4_ext_find_extent(inode, block, path); 1986 - up_read(&EXT4_I(inode)->i_data_sem); 1987 1987 if (IS_ERR(path)) { 1988 + up_read(&EXT4_I(inode)->i_data_sem); 1988 1989 err = PTR_ERR(path); 1989 1990 path = NULL; 1990 1991 break; ··· 1998 1987 1999 1988 depth = ext_depth(inode); 2000 1989 if (unlikely(path[depth].p_hdr == NULL)) { 1990 + up_read(&EXT4_I(inode)->i_data_sem); 2001 1991 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2002 1992 err = -EIO; 2003 1993 break; 2004 1994 } 2005 1995 ex = path[depth].p_ext; 2006 1996 next = ext4_ext_next_allocated_block(path); 1997 + ext4_ext_drop_refs(path); 2007 1998 1999 + flags = 0; 2008 2000 exists = 0; 2009 2001 if (!ex) { 2010 2002 /* there is no extent yet, so try to allocate ··· 2044 2030 BUG_ON(end <= start); 2045 2031 2046 2032 if (!exists) { 2047 - cbex.ec_block = start; 2048 - cbex.ec_len = end - start; 2049 - cbex.ec_start = 0; 2033 + newex.ec_block = start; 2034 + newex.ec_len = end - start; 2035 + newex.ec_start = 0; 2050 2036 } else { 2051 - cbex.ec_block = le32_to_cpu(ex->ee_block); 2052 - cbex.ec_len = ext4_ext_get_actual_len(ex); 2053 - cbex.ec_start = ext4_ext_pblock(ex); 2037 + newex.ec_block = le32_to_cpu(ex->ee_block); 2038 + newex.ec_len = ext4_ext_get_actual_len(ex); 2039 + newex.ec_start = ext4_ext_pblock(ex); 2040 + if (ext4_ext_is_uninitialized(ex)) 2041 + flags |= FIEMAP_EXTENT_UNWRITTEN; 2054 2042 } 2055 2043 2056 - if (unlikely(cbex.ec_len == 0)) { 2057 - EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); 2044 + /* 2045 + * Find delayed extent and update newex accordingly. We call 2046 + * it even in !exists case to find out whether newex is the 2047 + * last existing extent or not. 
2048 + */ 2049 + next_del = ext4_find_delayed_extent(inode, &newex); 2050 + if (!exists && next_del) { 2051 + exists = 1; 2052 + flags |= FIEMAP_EXTENT_DELALLOC; 2053 + } 2054 + up_read(&EXT4_I(inode)->i_data_sem); 2055 + 2056 + if (unlikely(newex.ec_len == 0)) { 2057 + EXT4_ERROR_INODE(inode, "newex.ec_len == 0"); 2058 2058 err = -EIO; 2059 2059 break; 2060 2060 } 2061 - err = func(inode, next, &cbex, ex, cbdata); 2062 - ext4_ext_drop_refs(path); 2063 2061 2064 - if (err < 0) 2065 - break; 2066 - 2067 - if (err == EXT_REPEAT) 2068 - continue; 2069 - else if (err == EXT_BREAK) { 2070 - err = 0; 2071 - break; 2062 + /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ 2063 + if (next == next_del) { 2064 + flags |= FIEMAP_EXTENT_LAST; 2065 + if (unlikely(next_del != EXT_MAX_BLOCKS || 2066 + next != EXT_MAX_BLOCKS)) { 2067 + EXT4_ERROR_INODE(inode, 2068 + "next extent == %u, next " 2069 + "delalloc extent = %u", 2070 + next, next_del); 2071 + err = -EIO; 2072 + break; 2073 + } 2072 2074 } 2073 2075 2074 - if (ext_depth(inode) != depth) { 2075 - /* depth was changed. 
we have to realloc path */ 2076 - kfree(path); 2077 - path = NULL; 2076 + if (exists) { 2077 + err = fiemap_fill_next_extent(fieinfo, 2078 + (__u64)newex.ec_block << blksize_bits, 2079 + (__u64)newex.ec_start << blksize_bits, 2080 + (__u64)newex.ec_len << blksize_bits, 2081 + flags); 2082 + if (err < 0) 2083 + break; 2084 + if (err == 1) { 2085 + err = 0; 2086 + break; 2087 + } 2078 2088 } 2079 2089 2080 - block = cbex.ec_block + cbex.ec_len; 2090 + block = newex.ec_block + newex.ec_len; 2081 2091 } 2082 2092 2083 2093 if (path) { ··· 2194 2156 struct ext4_extent *ex) 2195 2157 { 2196 2158 struct ext4_ext_cache *cex; 2197 - struct ext4_sb_info *sbi; 2198 2159 int ret = 0; 2199 2160 2200 2161 /* ··· 2201 2164 */ 2202 2165 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2203 2166 cex = &EXT4_I(inode)->i_cached_extent; 2204 - sbi = EXT4_SB(inode->i_sb); 2205 2167 2206 2168 /* has cache valid data? */ 2207 2169 if (cex->ec_len == 0) ··· 2309 2273 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2310 2274 { 2311 2275 int index; 2312 - int depth = ext_depth(inode); 2276 + int depth; 2277 + 2278 + /* If we are converting the inline data, only one is needed here. */ 2279 + if (ext4_has_inline_data(inode)) 2280 + return 1; 2281 + 2282 + depth = ext_depth(inode); 2313 2283 2314 2284 if (chunk) 2315 2285 index = depth * 2; ··· 3503 3461 /** 3504 3462 * ext4_find_delalloc_range: find delayed allocated block in the given range. 3505 3463 * 3506 - * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns 3507 - * whether there are any buffers marked for delayed allocation. It returns '1' 3508 - * on the first delalloc'ed buffer head found. If no buffer head in the given 3509 - * range is marked for delalloc, it returns 0. 3510 - * lblk_start should always be <= lblk_end. 
3511 - * search_hint_reverse is to indicate that searching in reverse from lblk_end to 3512 - * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed 3513 - * block sooner). This is useful when blocks are truncated sequentially from 3514 - * lblk_start towards lblk_end. 3464 + * Return 1 if there is a delalloc block in the range, otherwise 0. 3515 3465 */ 3516 3466 static int ext4_find_delalloc_range(struct inode *inode, 3517 3467 ext4_lblk_t lblk_start, 3518 - ext4_lblk_t lblk_end, 3519 - int search_hint_reverse) 3468 + ext4_lblk_t lblk_end) 3520 3469 { 3521 - struct address_space *mapping = inode->i_mapping; 3522 - struct buffer_head *head, *bh = NULL; 3523 - struct page *page; 3524 - ext4_lblk_t i, pg_lblk; 3525 - pgoff_t index; 3470 + struct extent_status es; 3526 3471 3527 - if (!test_opt(inode->i_sb, DELALLOC)) 3528 - return 0; 3529 - 3530 - /* reverse search wont work if fs block size is less than page size */ 3531 - if (inode->i_blkbits < PAGE_CACHE_SHIFT) 3532 - search_hint_reverse = 0; 3533 - 3534 - if (search_hint_reverse) 3535 - i = lblk_end; 3472 + es.start = lblk_start; 3473 + ext4_es_find_extent(inode, &es); 3474 + if (es.len == 0) 3475 + return 0; /* there is no delay extent in this tree */ 3476 + else if (es.start <= lblk_start && lblk_start < es.start + es.len) 3477 + return 1; 3478 + else if (lblk_start <= es.start && es.start <= lblk_end) 3479 + return 1; 3536 3480 else 3537 - i = lblk_start; 3538 - 3539 - index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 3540 - 3541 - while ((i >= lblk_start) && (i <= lblk_end)) { 3542 - page = find_get_page(mapping, index); 3543 - if (!page) 3544 - goto nextpage; 3545 - 3546 - if (!page_has_buffers(page)) 3547 - goto nextpage; 3548 - 3549 - head = page_buffers(page); 3550 - if (!head) 3551 - goto nextpage; 3552 - 3553 - bh = head; 3554 - pg_lblk = index << (PAGE_CACHE_SHIFT - 3555 - inode->i_blkbits); 3556 - do { 3557 - if (unlikely(pg_lblk < lblk_start)) { 3558 - /* 3559 - * This is 
possible when fs block size is less 3560 - * than page size and our cluster starts/ends in 3561 - * middle of the page. So we need to skip the 3562 - * initial few blocks till we reach the 'lblk' 3563 - */ 3564 - pg_lblk++; 3565 - continue; 3566 - } 3567 - 3568 - /* Check if the buffer is delayed allocated and that it 3569 - * is not yet mapped. (when da-buffers are mapped during 3570 - * their writeout, their da_mapped bit is set.) 3571 - */ 3572 - if (buffer_delay(bh) && !buffer_da_mapped(bh)) { 3573 - page_cache_release(page); 3574 - trace_ext4_find_delalloc_range(inode, 3575 - lblk_start, lblk_end, 3576 - search_hint_reverse, 3577 - 1, i); 3578 - return 1; 3579 - } 3580 - if (search_hint_reverse) 3581 - i--; 3582 - else 3583 - i++; 3584 - } while ((i >= lblk_start) && (i <= lblk_end) && 3585 - ((bh = bh->b_this_page) != head)); 3586 - nextpage: 3587 - if (page) 3588 - page_cache_release(page); 3589 - /* 3590 - * Move to next page. 'i' will be the first lblk in the next 3591 - * page. 
3592 - */ 3593 - if (search_hint_reverse) 3594 - index--; 3595 - else 3596 - index++; 3597 - i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 3598 - } 3599 - 3600 - trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, 3601 - search_hint_reverse, 0, 0); 3602 - return 0; 3481 + return 0; 3603 3482 } 3604 3483 3605 - int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, 3606 - int search_hint_reverse) 3484 + int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) 3607 3485 { 3608 3486 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3609 3487 ext4_lblk_t lblk_start, lblk_end; 3610 3488 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); 3611 3489 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 3612 3490 3613 - return ext4_find_delalloc_range(inode, lblk_start, lblk_end, 3614 - search_hint_reverse); 3491 + return ext4_find_delalloc_range(inode, lblk_start, lblk_end); 3615 3492 } 3616 3493 3617 3494 /** ··· 3591 3630 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); 3592 3631 lblk_to = lblk_from + c_offset - 1; 3593 3632 3594 - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) 3633 + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3595 3634 allocated_clusters--; 3596 3635 } 3597 3636 ··· 3601 3640 lblk_from = lblk_start + num_blks; 3602 3641 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; 3603 3642 3604 - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) 3643 + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3605 3644 allocated_clusters--; 3606 3645 } 3607 3646 ··· 3624 3663 flags, allocated); 3625 3664 ext4_ext_show_leaf(inode, path); 3626 3665 3627 - trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, 3628 - newblock); 3666 + trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, 3667 + allocated, newblock); 3629 3668 3630 3669 /* get_block() before submit the IO, split the extent */ 3631 3670 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ··· 3872 3911 
struct ext4_extent newex, *ex, *ex2; 3873 3912 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3874 3913 ext4_fsblk_t newblock = 0; 3875 - int free_on_err = 0, err = 0, depth, ret; 3914 + int free_on_err = 0, err = 0, depth; 3876 3915 unsigned int allocated = 0, offset = 0; 3877 3916 unsigned int allocated_clusters = 0; 3878 3917 struct ext4_allocation_request ar; ··· 3888 3927 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3889 3928 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3890 3929 if ((sbi->s_cluster_ratio > 1) && 3891 - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3930 + ext4_find_delalloc_cluster(inode, map->m_lblk)) 3892 3931 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 3893 3932 3894 3933 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ··· 3968 4007 ee_len, ee_start); 3969 4008 goto out; 3970 4009 } 3971 - ret = ext4_ext_handle_uninitialized_extents( 4010 + allocated = ext4_ext_handle_uninitialized_extents( 3972 4011 handle, inode, map, path, flags, 3973 4012 allocated, newblock); 3974 - return ret; 4013 + goto out3; 3975 4014 } 3976 4015 } 3977 4016 3978 4017 if ((sbi->s_cluster_ratio > 1) && 3979 - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 4018 + ext4_find_delalloc_cluster(inode, map->m_lblk)) 3980 4019 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 3981 4020 3982 4021 /* ··· 4245 4284 kfree(path); 4246 4285 } 4247 4286 4248 - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 4249 - newblock, map->m_len, err ? err : allocated); 4287 + out3: 4288 + trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); 4250 4289 4251 4290 return err ? 
err : allocated; 4252 4291 } ··· 4305 4344 4306 4345 last_block = (inode->i_size + sb->s_blocksize - 1) 4307 4346 >> EXT4_BLOCK_SIZE_BITS(sb); 4347 + err = ext4_es_remove_extent(inode, last_block, 4348 + EXT_MAX_BLOCKS - last_block); 4308 4349 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4309 4350 4310 4351 /* In a multi-transaction truncate, we only make the final ··· 4396 4433 4397 4434 if (mode & FALLOC_FL_PUNCH_HOLE) 4398 4435 return ext4_punch_hole(file, offset, len); 4436 + 4437 + ret = ext4_convert_inline_data(inode); 4438 + if (ret) 4439 + return ret; 4399 4440 4400 4441 trace_ext4_fallocate_enter(inode, offset, len, mode); 4401 4442 map.m_lblk = offset >> blkbits; ··· 4539 4572 } 4540 4573 4541 4574 /* 4542 - * Callback function called for each extent to gather FIEMAP information. 4575 + * If newex is not existing extent (newex->ec_start equals zero) find 4576 + * delayed extent at start of newex and update newex accordingly and 4577 + * return start of the next delayed extent. 4578 + * 4579 + * If newex is existing extent (newex->ec_start is not equal zero) 4580 + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed 4581 + * extent found. Leave newex unmodified. 
4543 4582 */ 4544 - static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, 4545 - struct ext4_ext_cache *newex, struct ext4_extent *ex, 4546 - void *data) 4583 + static int ext4_find_delayed_extent(struct inode *inode, 4584 + struct ext4_ext_cache *newex) 4547 4585 { 4548 - __u64 logical; 4549 - __u64 physical; 4550 - __u64 length; 4551 - __u32 flags = 0; 4552 - int ret = 0; 4553 - struct fiemap_extent_info *fieinfo = data; 4554 - unsigned char blksize_bits; 4586 + struct extent_status es; 4587 + ext4_lblk_t next_del; 4555 4588 4556 - blksize_bits = inode->i_sb->s_blocksize_bits; 4557 - logical = (__u64)newex->ec_block << blksize_bits; 4589 + es.start = newex->ec_block; 4590 + next_del = ext4_es_find_extent(inode, &es); 4558 4591 4559 4592 if (newex->ec_start == 0) { 4560 4593 /* 4561 4594 * No extent in extent-tree contains block @newex->ec_start, 4562 4595 * then the block may stay in 1)a hole or 2)delayed-extent. 4563 - * 4564 - * Holes or delayed-extents are processed as follows. 4565 - * 1. lookup dirty pages with specified range in pagecache. 4566 - * If no page is got, then there is no delayed-extent and 4567 - * return with EXT_CONTINUE. 4568 - * 2. find the 1st mapped buffer, 4569 - * 3. check if the mapped buffer is both in the request range 4570 - * and a delayed buffer. If not, there is no delayed-extent, 4571 - * then return. 4572 - * 4. a delayed-extent is found, the extent will be collected. 4573 4596 */ 4574 - ext4_lblk_t end = 0; 4575 - pgoff_t last_offset; 4576 - pgoff_t offset; 4577 - pgoff_t index; 4578 - pgoff_t start_index = 0; 4579 - struct page **pages = NULL; 4580 - struct buffer_head *bh = NULL; 4581 - struct buffer_head *head = NULL; 4582 - unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); 4597 + if (es.len == 0) 4598 + /* A hole found. 
*/ 4599 + return 0; 4583 4600 4584 - pages = kmalloc(PAGE_SIZE, GFP_KERNEL); 4585 - if (pages == NULL) 4586 - return -ENOMEM; 4587 - 4588 - offset = logical >> PAGE_SHIFT; 4589 - repeat: 4590 - last_offset = offset; 4591 - head = NULL; 4592 - ret = find_get_pages_tag(inode->i_mapping, &offset, 4593 - PAGECACHE_TAG_DIRTY, nr_pages, pages); 4594 - 4595 - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { 4596 - /* First time, try to find a mapped buffer. */ 4597 - if (ret == 0) { 4598 - out: 4599 - for (index = 0; index < ret; index++) 4600 - page_cache_release(pages[index]); 4601 - /* just a hole. */ 4602 - kfree(pages); 4603 - return EXT_CONTINUE; 4604 - } 4605 - index = 0; 4606 - 4607 - next_page: 4608 - /* Try to find the 1st mapped buffer. */ 4609 - end = ((__u64)pages[index]->index << PAGE_SHIFT) >> 4610 - blksize_bits; 4611 - if (!page_has_buffers(pages[index])) 4612 - goto out; 4613 - head = page_buffers(pages[index]); 4614 - if (!head) 4615 - goto out; 4616 - 4617 - index++; 4618 - bh = head; 4619 - do { 4620 - if (end >= newex->ec_block + 4621 - newex->ec_len) 4622 - /* The buffer is out of 4623 - * the request range. 4624 - */ 4625 - goto out; 4626 - 4627 - if (buffer_mapped(bh) && 4628 - end >= newex->ec_block) { 4629 - start_index = index - 1; 4630 - /* get the 1st mapped buffer. */ 4631 - goto found_mapped_buffer; 4632 - } 4633 - 4634 - bh = bh->b_this_page; 4635 - end++; 4636 - } while (bh != head); 4637 - 4638 - /* No mapped buffer in the range found in this page, 4639 - * We need to look up next page. 4640 - */ 4641 - if (index >= ret) { 4642 - /* There is no page left, but we need to limit 4643 - * newex->ec_len. 4644 - */ 4645 - newex->ec_len = end - newex->ec_block; 4646 - goto out; 4647 - } 4648 - goto next_page; 4649 - } else { 4650 - /*Find contiguous delayed buffers. 
*/ 4651 - if (ret > 0 && pages[0]->index == last_offset) 4652 - head = page_buffers(pages[0]); 4653 - bh = head; 4654 - index = 1; 4655 - start_index = 0; 4601 + if (es.start > newex->ec_block) { 4602 + /* A hole found. */ 4603 + newex->ec_len = min(es.start - newex->ec_block, 4604 + newex->ec_len); 4605 + return 0; 4656 4606 } 4657 4607 4658 - found_mapped_buffer: 4659 - if (bh != NULL && buffer_delay(bh)) { 4660 - /* 1st or contiguous delayed buffer found. */ 4661 - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { 4662 - /* 4663 - * 1st delayed buffer found, record 4664 - * the start of extent. 4665 - */ 4666 - flags |= FIEMAP_EXTENT_DELALLOC; 4667 - newex->ec_block = end; 4668 - logical = (__u64)end << blksize_bits; 4669 - } 4670 - /* Find contiguous delayed buffers. */ 4671 - do { 4672 - if (!buffer_delay(bh)) 4673 - goto found_delayed_extent; 4674 - bh = bh->b_this_page; 4675 - end++; 4676 - } while (bh != head); 4677 - 4678 - for (; index < ret; index++) { 4679 - if (!page_has_buffers(pages[index])) { 4680 - bh = NULL; 4681 - break; 4682 - } 4683 - head = page_buffers(pages[index]); 4684 - if (!head) { 4685 - bh = NULL; 4686 - break; 4687 - } 4688 - 4689 - if (pages[index]->index != 4690 - pages[start_index]->index + index 4691 - - start_index) { 4692 - /* Blocks are not contiguous. */ 4693 - bh = NULL; 4694 - break; 4695 - } 4696 - bh = head; 4697 - do { 4698 - if (!buffer_delay(bh)) 4699 - /* Delayed-extent ends. */ 4700 - goto found_delayed_extent; 4701 - bh = bh->b_this_page; 4702 - end++; 4703 - } while (bh != head); 4704 - } 4705 - } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) 4706 - /* a hole found. */ 4707 - goto out; 4708 - 4709 - found_delayed_extent: 4710 - newex->ec_len = min(end - newex->ec_block, 4711 - (ext4_lblk_t)EXT_INIT_MAX_LEN); 4712 - if (ret == nr_pages && bh != NULL && 4713 - newex->ec_len < EXT_INIT_MAX_LEN && 4714 - buffer_delay(bh)) { 4715 - /* Have not collected an extent and continue. 
*/ 4716 - for (index = 0; index < ret; index++) 4717 - page_cache_release(pages[index]); 4718 - goto repeat; 4719 - } 4720 - 4721 - for (index = 0; index < ret; index++) 4722 - page_cache_release(pages[index]); 4723 - kfree(pages); 4608 + newex->ec_len = es.start + es.len - newex->ec_block; 4724 4609 } 4725 4610 4726 - physical = (__u64)newex->ec_start << blksize_bits; 4727 - length = (__u64)newex->ec_len << blksize_bits; 4728 - 4729 - if (ex && ext4_ext_is_uninitialized(ex)) 4730 - flags |= FIEMAP_EXTENT_UNWRITTEN; 4731 - 4732 - if (next == EXT_MAX_BLOCKS) 4733 - flags |= FIEMAP_EXTENT_LAST; 4734 - 4735 - ret = fiemap_fill_next_extent(fieinfo, logical, physical, 4736 - length, flags); 4737 - if (ret < 0) 4738 - return ret; 4739 - if (ret == 1) 4740 - return EXT_BREAK; 4741 - return EXT_CONTINUE; 4611 + return next_del; 4742 4612 } 4743 4613 /* fiemap flags we can handle specified here */ 4744 4614 #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) ··· 4775 4971 ext4_ext_invalidate_cache(inode); 4776 4972 ext4_discard_preallocations(inode); 4777 4973 4974 + err = ext4_es_remove_extent(inode, first_block, 4975 + stop_block - first_block); 4778 4976 err = ext4_ext_remove_space(inode, first_block, stop_block - 1); 4779 4977 4780 4978 ext4_ext_invalidate_cache(inode); ··· 4797 4991 mutex_unlock(&inode->i_mutex); 4798 4992 return err; 4799 4993 } 4994 + 4800 4995 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4801 4996 __u64 start, __u64 len) 4802 4997 { 4803 4998 ext4_lblk_t start_blk; 4804 4999 int error = 0; 5000 + 5001 + if (ext4_has_inline_data(inode)) { 5002 + int has_inline = 1; 5003 + 5004 + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); 5005 + 5006 + if (has_inline) 5007 + return error; 5008 + } 4805 5009 4806 5010 /* fallback to generic here if not in extents fmt */ 4807 5011 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) ··· 4834 5018 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 4835 5019 
4836 5020 /* 4837 - * Walk the extent tree gathering extent information. 4838 - * ext4_ext_fiemap_cb will push extents back to user. 5021 + * Walk the extent tree gathering extent information 5022 + * and pushing extents back to the user. 4839 5023 */ 4840 - error = ext4_ext_walk_space(inode, start_blk, len_blks, 4841 - ext4_ext_fiemap_cb, fieinfo); 5024 + error = ext4_fill_fiemap_extents(inode, start_blk, 5025 + len_blks, fieinfo); 4842 5026 } 4843 5027 4844 5028 return error;
+500
fs/ext4/extents_status.c
··· 1 + /* 2 + * fs/ext4/extents_status.c 3 + * 4 + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> 5 + * Modified by 6 + * Allison Henderson <achender@linux.vnet.ibm.com> 7 + * Hugh Dickins <hughd@google.com> 8 + * Zheng Liu <wenqing.lz@taobao.com> 9 + * 10 + * Ext4 extents status tree core functions. 11 + */ 12 + #include <linux/rbtree.h> 13 + #include "ext4.h" 14 + #include "extents_status.h" 15 + #include "ext4_extents.h" 16 + 17 + #include <trace/events/ext4.h> 18 + 19 + /* 20 + * According to previous discussion in Ext4 Developer Workshop, we 21 + * will introduce a new structure called io tree to track all extent 22 + * status in order to solve some problems that we have met 23 + * (e.g. Reservation space warning), and provide extent-level locking. 24 + * Delay extent tree is the first step to achieve this goal. It is 25 + * original built by Yongqiang Yang. At that time it is called delay 26 + * extent tree, whose goal is only track delay extent in memory to 27 + * simplify the implementation of fiemap and bigalloc, and introduce 28 + * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called 29 + * delay extent tree at the following comment. But for better 30 + * understand what it does, it has been rename to extent status tree. 31 + * 32 + * Currently the first step has been done. All delay extents are 33 + * tracked in the tree. It maintains the delay extent when a delay 34 + * allocation is issued, and the delay extent is written out or 35 + * invalidated. Therefore the implementation of fiemap and bigalloc 36 + * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. 37 + * 38 + * The following comment describes the implemenmtation of extent 39 + * status tree and future works. 40 + */ 41 + 42 + /* 43 + * extents status tree implementation for ext4. 44 + * 45 + * 46 + * ========================================================================== 47 + * Extents status encompass delayed extents and extent locks 48 + * 49 + * 1. 
Why delayed extent implementation ? 50 + * 51 + * Without delayed extent, ext4 identifies a delayed extent by looking 52 + * up page cache, this has several deficiencies - complicated, buggy, 53 + * and inefficient code. 54 + * 55 + * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need 56 + * to know if a block or a range of blocks are belonged to a delayed 57 + * extent. 58 + * 59 + * Let us have a look at how they do without delayed extents implementation. 60 + * -- FIEMAP 61 + * FIEMAP looks up page cache to identify delayed allocations from holes. 62 + * 63 + * -- SEEK_HOLE/DATA 64 + * SEEK_HOLE/DATA has the same problem as FIEMAP. 65 + * 66 + * -- bigalloc 67 + * bigalloc looks up page cache to figure out if a block is 68 + * already under delayed allocation or not to determine whether 69 + * quota reserving is needed for the cluster. 70 + * 71 + * -- punch hole 72 + * punch hole looks up page cache to identify a delayed extent. 73 + * 74 + * -- writeout 75 + * Writeout looks up whole page cache to see if a buffer is 76 + * mapped, If there are not very many delayed buffers, then it is 77 + * time comsuming. 78 + * 79 + * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, 80 + * bigalloc and writeout can figure out if a block or a range of 81 + * blocks is under delayed allocation(belonged to a delayed extent) or 82 + * not by searching the delayed extent tree. 83 + * 84 + * 85 + * ========================================================================== 86 + * 2. ext4 delayed extents impelmentation 87 + * 88 + * -- delayed extent 89 + * A delayed extent is a range of blocks which are contiguous 90 + * logically and under delayed allocation. Unlike extent in 91 + * ext4, delayed extent in ext4 is a in-memory struct, there is 92 + * no corresponding on-disk data. There is no limit on length of 93 + * delayed extent, so a delayed extent can contain as many blocks 94 + * as they are contiguous logically. 
95 + * 96 + * -- delayed extent tree 97 + * Every inode has a delayed extent tree and all under delayed 98 + * allocation blocks are added to the tree as delayed extents. 99 + * Delayed extents in the tree are ordered by logical block no. 100 + * 101 + * -- operations on a delayed extent tree 102 + * There are three operations on a delayed extent tree: find next 103 + * delayed extent, adding a space(a range of blocks) and removing 104 + * a space. 105 + * 106 + * -- race on a delayed extent tree 107 + * Delayed extent tree is protected inode->i_es_lock. 108 + * 109 + * 110 + * ========================================================================== 111 + * 3. performance analysis 112 + * -- overhead 113 + * 1. There is a cache extent for write access, so if writes are 114 + * not very random, adding space operaions are in O(1) time. 115 + * 116 + * -- gain 117 + * 2. Code is much simpler, more readable, more maintainable and 118 + * more efficient. 119 + * 120 + * 121 + * ========================================================================== 122 + * 4. 
TODO list 123 + * -- Track all extent status 124 + * 125 + * -- Improve get block process 126 + * 127 + * -- Extent-level locking 128 + */ 129 + 130 + static struct kmem_cache *ext4_es_cachep; 131 + 132 + int __init ext4_init_es(void) 133 + { 134 + ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); 135 + if (ext4_es_cachep == NULL) 136 + return -ENOMEM; 137 + return 0; 138 + } 139 + 140 + void ext4_exit_es(void) 141 + { 142 + if (ext4_es_cachep) 143 + kmem_cache_destroy(ext4_es_cachep); 144 + } 145 + 146 + void ext4_es_init_tree(struct ext4_es_tree *tree) 147 + { 148 + tree->root = RB_ROOT; 149 + tree->cache_es = NULL; 150 + } 151 + 152 + #ifdef ES_DEBUG__ 153 + static void ext4_es_print_tree(struct inode *inode) 154 + { 155 + struct ext4_es_tree *tree; 156 + struct rb_node *node; 157 + 158 + printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); 159 + tree = &EXT4_I(inode)->i_es_tree; 160 + node = rb_first(&tree->root); 161 + while (node) { 162 + struct extent_status *es; 163 + es = rb_entry(node, struct extent_status, rb_node); 164 + printk(KERN_DEBUG " [%u/%u)", es->start, es->len); 165 + node = rb_next(node); 166 + } 167 + printk(KERN_DEBUG "\n"); 168 + } 169 + #else 170 + #define ext4_es_print_tree(inode) 171 + #endif 172 + 173 + static inline ext4_lblk_t extent_status_end(struct extent_status *es) 174 + { 175 + BUG_ON(es->start + es->len < es->start); 176 + return es->start + es->len - 1; 177 + } 178 + 179 + /* 180 + * search through the tree for an delayed extent with a given offset. If 181 + * it can't be found, try to find next extent. 
182 + */ 183 + static struct extent_status *__es_tree_search(struct rb_root *root, 184 + ext4_lblk_t offset) 185 + { 186 + struct rb_node *node = root->rb_node; 187 + struct extent_status *es = NULL; 188 + 189 + while (node) { 190 + es = rb_entry(node, struct extent_status, rb_node); 191 + if (offset < es->start) 192 + node = node->rb_left; 193 + else if (offset > extent_status_end(es)) 194 + node = node->rb_right; 195 + else 196 + return es; 197 + } 198 + 199 + if (es && offset < es->start) 200 + return es; 201 + 202 + if (es && offset > extent_status_end(es)) { 203 + node = rb_next(&es->rb_node); 204 + return node ? rb_entry(node, struct extent_status, rb_node) : 205 + NULL; 206 + } 207 + 208 + return NULL; 209 + } 210 + 211 + /* 212 + * ext4_es_find_extent: find the 1st delayed extent covering @es->start 213 + * if it exists, otherwise, the next extent after @es->start. 214 + * 215 + * @inode: the inode which owns delayed extents 216 + * @es: delayed extent that we found 217 + * 218 + * Returns the first block of the next extent after es, otherwise 219 + * EXT_MAX_BLOCKS if no delay extent is found. 220 + * Delayed extent is returned via @es. 
221 + */ 222 + ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) 223 + { 224 + struct ext4_es_tree *tree = NULL; 225 + struct extent_status *es1 = NULL; 226 + struct rb_node *node; 227 + ext4_lblk_t ret = EXT_MAX_BLOCKS; 228 + 229 + trace_ext4_es_find_extent_enter(inode, es->start); 230 + 231 + read_lock(&EXT4_I(inode)->i_es_lock); 232 + tree = &EXT4_I(inode)->i_es_tree; 233 + 234 + /* find delay extent in cache firstly */ 235 + if (tree->cache_es) { 236 + es1 = tree->cache_es; 237 + if (in_range(es->start, es1->start, es1->len)) { 238 + es_debug("%u cached by [%u/%u)\n", 239 + es->start, es1->start, es1->len); 240 + goto out; 241 + } 242 + } 243 + 244 + es->len = 0; 245 + es1 = __es_tree_search(&tree->root, es->start); 246 + 247 + out: 248 + if (es1) { 249 + tree->cache_es = es1; 250 + es->start = es1->start; 251 + es->len = es1->len; 252 + node = rb_next(&es1->rb_node); 253 + if (node) { 254 + es1 = rb_entry(node, struct extent_status, rb_node); 255 + ret = es1->start; 256 + } 257 + } 258 + 259 + read_unlock(&EXT4_I(inode)->i_es_lock); 260 + 261 + trace_ext4_es_find_extent_exit(inode, es, ret); 262 + return ret; 263 + } 264 + 265 + static struct extent_status * 266 + ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) 267 + { 268 + struct extent_status *es; 269 + es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); 270 + if (es == NULL) 271 + return NULL; 272 + es->start = start; 273 + es->len = len; 274 + return es; 275 + } 276 + 277 + static void ext4_es_free_extent(struct extent_status *es) 278 + { 279 + kmem_cache_free(ext4_es_cachep, es); 280 + } 281 + 282 + static struct extent_status * 283 + ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) 284 + { 285 + struct extent_status *es1; 286 + struct rb_node *node; 287 + 288 + node = rb_prev(&es->rb_node); 289 + if (!node) 290 + return es; 291 + 292 + es1 = rb_entry(node, struct extent_status, rb_node); 293 + if (es->start == extent_status_end(es1) + 1) 
{ 294 + es1->len += es->len; 295 + rb_erase(&es->rb_node, &tree->root); 296 + ext4_es_free_extent(es); 297 + es = es1; 298 + } 299 + 300 + return es; 301 + } 302 + 303 + static struct extent_status * 304 + ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) 305 + { 306 + struct extent_status *es1; 307 + struct rb_node *node; 308 + 309 + node = rb_next(&es->rb_node); 310 + if (!node) 311 + return es; 312 + 313 + es1 = rb_entry(node, struct extent_status, rb_node); 314 + if (es1->start == extent_status_end(es) + 1) { 315 + es->len += es1->len; 316 + rb_erase(node, &tree->root); 317 + ext4_es_free_extent(es1); 318 + } 319 + 320 + return es; 321 + } 322 + 323 + static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, 324 + ext4_lblk_t len) 325 + { 326 + struct rb_node **p = &tree->root.rb_node; 327 + struct rb_node *parent = NULL; 328 + struct extent_status *es; 329 + ext4_lblk_t end = offset + len - 1; 330 + 331 + BUG_ON(end < offset); 332 + es = tree->cache_es; 333 + if (es && offset == (extent_status_end(es) + 1)) { 334 + es_debug("cached by [%u/%u)\n", es->start, es->len); 335 + es->len += len; 336 + es = ext4_es_try_to_merge_right(tree, es); 337 + goto out; 338 + } else if (es && es->start == end + 1) { 339 + es_debug("cached by [%u/%u)\n", es->start, es->len); 340 + es->start = offset; 341 + es->len += len; 342 + es = ext4_es_try_to_merge_left(tree, es); 343 + goto out; 344 + } else if (es && es->start <= offset && 345 + end <= extent_status_end(es)) { 346 + es_debug("cached by [%u/%u)\n", es->start, es->len); 347 + goto out; 348 + } 349 + 350 + while (*p) { 351 + parent = *p; 352 + es = rb_entry(parent, struct extent_status, rb_node); 353 + 354 + if (offset < es->start) { 355 + if (es->start == end + 1) { 356 + es->start = offset; 357 + es->len += len; 358 + es = ext4_es_try_to_merge_left(tree, es); 359 + goto out; 360 + } 361 + p = &(*p)->rb_left; 362 + } else if (offset > extent_status_end(es)) { 363 + if (offset 
== extent_status_end(es) + 1) { 364 + es->len += len; 365 + es = ext4_es_try_to_merge_right(tree, es); 366 + goto out; 367 + } 368 + p = &(*p)->rb_right; 369 + } else { 370 + if (extent_status_end(es) <= end) 371 + es->len = offset - es->start + len; 372 + goto out; 373 + } 374 + } 375 + 376 + es = ext4_es_alloc_extent(offset, len); 377 + if (!es) 378 + return -ENOMEM; 379 + rb_link_node(&es->rb_node, parent, p); 380 + rb_insert_color(&es->rb_node, &tree->root); 381 + 382 + out: 383 + tree->cache_es = es; 384 + return 0; 385 + } 386 + 387 + /* 388 + * ext4_es_insert_extent() adds a space to a delayed extent tree. 389 + * Caller holds inode->i_es_lock. 390 + * 391 + * ext4_es_insert_extent is called by ext4_da_write_begin and 392 + * ext4_es_remove_extent. 393 + * 394 + * Return 0 on success, error code on failure. 395 + */ 396 + int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, 397 + ext4_lblk_t len) 398 + { 399 + struct ext4_es_tree *tree; 400 + int err = 0; 401 + 402 + trace_ext4_es_insert_extent(inode, offset, len); 403 + es_debug("add [%u/%u) to extent status tree of inode %lu\n", 404 + offset, len, inode->i_ino); 405 + 406 + write_lock(&EXT4_I(inode)->i_es_lock); 407 + tree = &EXT4_I(inode)->i_es_tree; 408 + err = __es_insert_extent(tree, offset, len); 409 + write_unlock(&EXT4_I(inode)->i_es_lock); 410 + 411 + ext4_es_print_tree(inode); 412 + 413 + return err; 414 + } 415 + 416 + /* 417 + * ext4_es_remove_extent() removes a space from a delayed extent tree. 418 + * Caller holds inode->i_es_lock. 419 + * 420 + * Return 0 on success, error code on failure. 
421 + */ 422 + int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, 423 + ext4_lblk_t len) 424 + { 425 + struct rb_node *node; 426 + struct ext4_es_tree *tree; 427 + struct extent_status *es; 428 + struct extent_status orig_es; 429 + ext4_lblk_t len1, len2, end; 430 + int err = 0; 431 + 432 + trace_ext4_es_remove_extent(inode, offset, len); 433 + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", 434 + offset, len, inode->i_ino); 435 + 436 + end = offset + len - 1; 437 + BUG_ON(end < offset); 438 + write_lock(&EXT4_I(inode)->i_es_lock); 439 + tree = &EXT4_I(inode)->i_es_tree; 440 + es = __es_tree_search(&tree->root, offset); 441 + if (!es) 442 + goto out; 443 + if (es->start > end) 444 + goto out; 445 + 446 + /* Simply invalidate cache_es. */ 447 + tree->cache_es = NULL; 448 + 449 + orig_es.start = es->start; 450 + orig_es.len = es->len; 451 + len1 = offset > es->start ? offset - es->start : 0; 452 + len2 = extent_status_end(es) > end ? 453 + extent_status_end(es) - end : 0; 454 + if (len1 > 0) 455 + es->len = len1; 456 + if (len2 > 0) { 457 + if (len1 > 0) { 458 + err = __es_insert_extent(tree, end + 1, len2); 459 + if (err) { 460 + es->start = orig_es.start; 461 + es->len = orig_es.len; 462 + goto out; 463 + } 464 + } else { 465 + es->start = end + 1; 466 + es->len = len2; 467 + } 468 + goto out; 469 + } 470 + 471 + if (len1 > 0) { 472 + node = rb_next(&es->rb_node); 473 + if (node) 474 + es = rb_entry(node, struct extent_status, rb_node); 475 + else 476 + es = NULL; 477 + } 478 + 479 + while (es && extent_status_end(es) <= end) { 480 + node = rb_next(&es->rb_node); 481 + rb_erase(&es->rb_node, &tree->root); 482 + ext4_es_free_extent(es); 483 + if (!node) { 484 + es = NULL; 485 + break; 486 + } 487 + es = rb_entry(node, struct extent_status, rb_node); 488 + } 489 + 490 + if (es && es->start < end + 1) { 491 + len1 = extent_status_end(es) - end; 492 + es->start = end + 1; 493 + es->len = len1; 494 + } 495 + 496 + out: 497 + 
write_unlock(&EXT4_I(inode)->i_es_lock); 498 + ext4_es_print_tree(inode); 499 + return err; 500 + }
+45
fs/ext4/extents_status.h
··· 1 + /* 2 + * fs/ext4/extents_status.h 3 + * 4 + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> 5 + * Modified by 6 + * Allison Henderson <achender@linux.vnet.ibm.com> 7 + * Zheng Liu <wenqing.lz@taobao.com> 8 + * 9 + */ 10 + 11 + #ifndef _EXT4_EXTENTS_STATUS_H 12 + #define _EXT4_EXTENTS_STATUS_H 13 + 14 + /* 15 + * Turn on ES_DEBUG__ to get lots of info about extent status operations. 16 + */ 17 + #ifdef ES_DEBUG__ 18 + #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) 19 + #else 20 + #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) 21 + #endif 22 + 23 + struct extent_status { 24 + struct rb_node rb_node; 25 + ext4_lblk_t start; /* first block extent covers */ 26 + ext4_lblk_t len; /* length of extent in block */ 27 + }; 28 + 29 + struct ext4_es_tree { 30 + struct rb_root root; 31 + struct extent_status *cache_es; /* recently accessed extent */ 32 + }; 33 + 34 + extern int __init ext4_init_es(void); 35 + extern void ext4_exit_es(void); 36 + extern void ext4_es_init_tree(struct ext4_es_tree *tree); 37 + 38 + extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, 39 + ext4_lblk_t len); 40 + extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, 41 + ext4_lblk_t len); 42 + extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, 43 + struct extent_status *es); 44 + 45 + #endif /* _EXT4_EXTENTS_STATUS_H */
+332 -4
fs/ext4/file.c
··· 24 24 #include <linux/mount.h> 25 25 #include <linux/path.h> 26 26 #include <linux/quotaops.h> 27 + #include <linux/pagevec.h> 27 28 #include "ext4.h" 28 29 #include "ext4_jbd2.h" 29 30 #include "xattr.h" ··· 287 286 } 288 287 289 288 /* 289 + * Here we use ext4_map_blocks() to get a block mapping for a extent-based 290 + * file rather than ext4_ext_walk_space() because we can introduce 291 + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same 292 + * function. When extent status tree has been fully implemented, it will 293 + * track all extent status for a file and we can directly use it to 294 + * retrieve the offset for SEEK_DATA/SEEK_HOLE. 295 + */ 296 + 297 + /* 298 + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to 299 + * lookup page cache to check whether or not there has some data between 300 + * [startoff, endoff] because, if this range contains an unwritten extent, 301 + * we determine this extent as a data or a hole according to whether the 302 + * page cache has data or not. 
303 + */ 304 + static int ext4_find_unwritten_pgoff(struct inode *inode, 305 + int origin, 306 + struct ext4_map_blocks *map, 307 + loff_t *offset) 308 + { 309 + struct pagevec pvec; 310 + unsigned int blkbits; 311 + pgoff_t index; 312 + pgoff_t end; 313 + loff_t endoff; 314 + loff_t startoff; 315 + loff_t lastoff; 316 + int found = 0; 317 + 318 + blkbits = inode->i_sb->s_blocksize_bits; 319 + startoff = *offset; 320 + lastoff = startoff; 321 + endoff = (map->m_lblk + map->m_len) << blkbits; 322 + 323 + index = startoff >> PAGE_CACHE_SHIFT; 324 + end = endoff >> PAGE_CACHE_SHIFT; 325 + 326 + pagevec_init(&pvec, 0); 327 + do { 328 + int i, num; 329 + unsigned long nr_pages; 330 + 331 + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); 332 + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, 333 + (pgoff_t)num); 334 + if (nr_pages == 0) { 335 + if (origin == SEEK_DATA) 336 + break; 337 + 338 + BUG_ON(origin != SEEK_HOLE); 339 + /* 340 + * If this is the first time to go into the loop and 341 + * offset is not beyond the end offset, it will be a 342 + * hole at this offset 343 + */ 344 + if (lastoff == startoff || lastoff < endoff) 345 + found = 1; 346 + break; 347 + } 348 + 349 + /* 350 + * If this is the first time to go into the loop and 351 + * offset is smaller than the first page offset, it will be a 352 + * hole at this offset. 353 + */ 354 + if (lastoff == startoff && origin == SEEK_HOLE && 355 + lastoff < page_offset(pvec.pages[0])) { 356 + found = 1; 357 + break; 358 + } 359 + 360 + for (i = 0; i < nr_pages; i++) { 361 + struct page *page = pvec.pages[i]; 362 + struct buffer_head *bh, *head; 363 + 364 + /* 365 + * If the current offset is not beyond the end of given 366 + * range, it will be a hole. 
367 + */ 368 + if (lastoff < endoff && origin == SEEK_HOLE && 369 + page->index > end) { 370 + found = 1; 371 + *offset = lastoff; 372 + goto out; 373 + } 374 + 375 + lock_page(page); 376 + 377 + if (unlikely(page->mapping != inode->i_mapping)) { 378 + unlock_page(page); 379 + continue; 380 + } 381 + 382 + if (!page_has_buffers(page)) { 383 + unlock_page(page); 384 + continue; 385 + } 386 + 387 + if (page_has_buffers(page)) { 388 + lastoff = page_offset(page); 389 + bh = head = page_buffers(page); 390 + do { 391 + if (buffer_uptodate(bh) || 392 + buffer_unwritten(bh)) { 393 + if (origin == SEEK_DATA) 394 + found = 1; 395 + } else { 396 + if (origin == SEEK_HOLE) 397 + found = 1; 398 + } 399 + if (found) { 400 + *offset = max_t(loff_t, 401 + startoff, lastoff); 402 + unlock_page(page); 403 + goto out; 404 + } 405 + lastoff += bh->b_size; 406 + bh = bh->b_this_page; 407 + } while (bh != head); 408 + } 409 + 410 + lastoff = page_offset(page) + PAGE_SIZE; 411 + unlock_page(page); 412 + } 413 + 414 + /* 415 + * The no. of pages is less than our desired, that would be a 416 + * hole in there. 417 + */ 418 + if (nr_pages < num && origin == SEEK_HOLE) { 419 + found = 1; 420 + *offset = lastoff; 421 + break; 422 + } 423 + 424 + index = pvec.pages[i - 1]->index + 1; 425 + pagevec_release(&pvec); 426 + } while (index <= end); 427 + 428 + out: 429 + pagevec_release(&pvec); 430 + return found; 431 + } 432 + 433 + /* 434 + * ext4_seek_data() retrieves the offset for SEEK_DATA. 
435 + */ 436 + static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 437 + { 438 + struct inode *inode = file->f_mapping->host; 439 + struct ext4_map_blocks map; 440 + struct extent_status es; 441 + ext4_lblk_t start, last, end; 442 + loff_t dataoff, isize; 443 + int blkbits; 444 + int ret = 0; 445 + 446 + mutex_lock(&inode->i_mutex); 447 + 448 + isize = i_size_read(inode); 449 + if (offset >= isize) { 450 + mutex_unlock(&inode->i_mutex); 451 + return -ENXIO; 452 + } 453 + 454 + blkbits = inode->i_sb->s_blocksize_bits; 455 + start = offset >> blkbits; 456 + last = start; 457 + end = isize >> blkbits; 458 + dataoff = offset; 459 + 460 + do { 461 + map.m_lblk = last; 462 + map.m_len = end - last + 1; 463 + ret = ext4_map_blocks(NULL, inode, &map, 0); 464 + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 465 + if (last != start) 466 + dataoff = last << blkbits; 467 + break; 468 + } 469 + 470 + /* 471 + * If there is a delay extent at this offset, 472 + * it will be as a data. 473 + */ 474 + es.start = last; 475 + (void)ext4_es_find_extent(inode, &es); 476 + if (last >= es.start && 477 + last < es.start + es.len) { 478 + if (last != start) 479 + dataoff = last << blkbits; 480 + break; 481 + } 482 + 483 + /* 484 + * If there is a unwritten extent at this offset, 485 + * it will be as a data or a hole according to page 486 + * cache that has data or not. 
487 + */ 488 + if (map.m_flags & EXT4_MAP_UNWRITTEN) { 489 + int unwritten; 490 + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 491 + &map, &dataoff); 492 + if (unwritten) 493 + break; 494 + } 495 + 496 + last++; 497 + dataoff = last << blkbits; 498 + } while (last <= end); 499 + 500 + mutex_unlock(&inode->i_mutex); 501 + 502 + if (dataoff > isize) 503 + return -ENXIO; 504 + 505 + if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 506 + return -EINVAL; 507 + if (dataoff > maxsize) 508 + return -EINVAL; 509 + 510 + if (dataoff != file->f_pos) { 511 + file->f_pos = dataoff; 512 + file->f_version = 0; 513 + } 514 + 515 + return dataoff; 516 + } 517 + 518 + /* 519 + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 520 + */ 521 + static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 522 + { 523 + struct inode *inode = file->f_mapping->host; 524 + struct ext4_map_blocks map; 525 + struct extent_status es; 526 + ext4_lblk_t start, last, end; 527 + loff_t holeoff, isize; 528 + int blkbits; 529 + int ret = 0; 530 + 531 + mutex_lock(&inode->i_mutex); 532 + 533 + isize = i_size_read(inode); 534 + if (offset >= isize) { 535 + mutex_unlock(&inode->i_mutex); 536 + return -ENXIO; 537 + } 538 + 539 + blkbits = inode->i_sb->s_blocksize_bits; 540 + start = offset >> blkbits; 541 + last = start; 542 + end = isize >> blkbits; 543 + holeoff = offset; 544 + 545 + do { 546 + map.m_lblk = last; 547 + map.m_len = end - last + 1; 548 + ret = ext4_map_blocks(NULL, inode, &map, 0); 549 + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 550 + last += ret; 551 + holeoff = last << blkbits; 552 + continue; 553 + } 554 + 555 + /* 556 + * If there is a delay extent at this offset, 557 + * we will skip this extent. 
558 + */ 559 + es.start = last; 560 + (void)ext4_es_find_extent(inode, &es); 561 + if (last >= es.start && 562 + last < es.start + es.len) { 563 + last = es.start + es.len; 564 + holeoff = last << blkbits; 565 + continue; 566 + } 567 + 568 + /* 569 + * If there is a unwritten extent at this offset, 570 + * it will be as a data or a hole according to page 571 + * cache that has data or not. 572 + */ 573 + if (map.m_flags & EXT4_MAP_UNWRITTEN) { 574 + int unwritten; 575 + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 576 + &map, &holeoff); 577 + if (!unwritten) { 578 + last += ret; 579 + holeoff = last << blkbits; 580 + continue; 581 + } 582 + } 583 + 584 + /* find a hole */ 585 + break; 586 + } while (last <= end); 587 + 588 + mutex_unlock(&inode->i_mutex); 589 + 590 + if (holeoff > isize) 591 + holeoff = isize; 592 + 593 + if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 594 + return -EINVAL; 595 + if (holeoff > maxsize) 596 + return -EINVAL; 597 + 598 + if (holeoff != file->f_pos) { 599 + file->f_pos = holeoff; 600 + file->f_version = 0; 601 + } 602 + 603 + return holeoff; 604 + } 605 + 606 + /* 290 607 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values 291 608 * by calling generic_file_llseek_size() with the appropriate maxbytes 292 609 * value for each. 
··· 619 300 else 620 301 maxbytes = inode->i_sb->s_maxbytes; 621 302 622 - return generic_file_llseek_size(file, offset, origin, 623 - maxbytes, i_size_read(inode)); 303 + switch (origin) { 304 + case SEEK_SET: 305 + case SEEK_CUR: 306 + case SEEK_END: 307 + return generic_file_llseek_size(file, offset, origin, 308 + maxbytes, i_size_read(inode)); 309 + case SEEK_DATA: 310 + return ext4_seek_data(file, offset, maxbytes); 311 + case SEEK_HOLE: 312 + return ext4_seek_hole(file, offset, maxbytes); 313 + } 314 + 315 + return -EINVAL; 624 316 } 625 317 626 318 const struct file_operations ext4_file_operations = { ··· 656 326 const struct inode_operations ext4_file_inode_operations = { 657 327 .setattr = ext4_setattr, 658 328 .getattr = ext4_getattr, 659 - #ifdef CONFIG_EXT4_FS_XATTR 660 329 .setxattr = generic_setxattr, 661 330 .getxattr = generic_getxattr, 662 331 .listxattr = ext4_listxattr, 663 332 .removexattr = generic_removexattr, 664 - #endif 665 333 .get_acl = ext4_get_acl, 666 334 .fiemap = ext4_fiemap, 667 335 };
+1 -5
fs/ext4/fsync.c
··· 44 44 */ 45 45 static int ext4_sync_parent(struct inode *inode) 46 46 { 47 - struct writeback_control wbc; 48 47 struct dentry *dentry = NULL; 49 48 struct inode *next; 50 49 int ret = 0; ··· 65 66 ret = sync_mapping_buffers(inode->i_mapping); 66 67 if (ret) 67 68 break; 68 - memset(&wbc, 0, sizeof(wbc)); 69 - wbc.sync_mode = WB_SYNC_ALL; 70 - wbc.nr_to_write = 0; /* only write out the inode */ 71 - ret = sync_inode(inode, &wbc); 69 + ret = sync_inode_metadata(inode, 1); 72 70 if (ret) 73 71 break; 74 72 }
+5 -1
fs/ext4/ialloc.c
··· 762 762 763 763 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); 764 764 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); 765 - brelse(block_bitmap_bh); 766 765 767 766 /* recheck and clear flag under lock if we still need to */ 768 767 ext4_lock_group(sb, group); ··· 774 775 ext4_group_desc_csum_set(sb, group, gdp); 775 776 } 776 777 ext4_unlock_group(sb, group); 778 + brelse(block_bitmap_bh); 777 779 778 780 if (err) 779 781 goto fail; ··· 901 901 ext4_set_inode_state(inode, EXT4_STATE_NEW); 902 902 903 903 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 904 + 905 + ei->i_inline_off = 0; 906 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) 907 + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 904 908 905 909 ret = inode; 906 910 dquot_initialize(inode);
+3 -2
fs/ext4/indirect.c
··· 22 22 23 23 #include "ext4_jbd2.h" 24 24 #include "truncate.h" 25 + #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ 25 26 26 27 #include <trace/events/ext4.h> 27 28 ··· 756 755 partial--; 757 756 } 758 757 out: 759 - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 760 - map->m_pblk, map->m_len, err); 758 + trace_ext4_ind_map_blocks_exit(inode, map, err); 761 759 return err; 762 760 } 763 761 ··· 1412 1412 down_write(&ei->i_data_sem); 1413 1413 1414 1414 ext4_discard_preallocations(inode); 1415 + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); 1415 1416 1416 1417 /* 1417 1418 * The orphan list entry will now protect us from any crash which
+1884
fs/ext4/inline.c
··· 1 + /* 2 + * Copyright (c) 2012 Taobao. 3 + * Written by Tao Ma <boyu.mt@taobao.com> 4 + * 5 + * This program is free software; you can redistribute it and/or modify it 6 + * under the terms of version 2.1 of the GNU Lesser General Public License 7 + * as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope that it will be useful, 10 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 + * GNU General Public License for more details. 13 + */ 14 + #include "ext4_jbd2.h" 15 + #include "ext4.h" 16 + #include "xattr.h" 17 + #include "truncate.h" 18 + #include <linux/fiemap.h> 19 + 20 + #define EXT4_XATTR_SYSTEM_DATA "data" 21 + #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) 22 + #define EXT4_INLINE_DOTDOT_SIZE 4 23 + 24 + int ext4_get_inline_size(struct inode *inode) 25 + { 26 + if (EXT4_I(inode)->i_inline_off) 27 + return EXT4_I(inode)->i_inline_size; 28 + 29 + return 0; 30 + } 31 + 32 + static int get_max_inline_xattr_value_size(struct inode *inode, 33 + struct ext4_iloc *iloc) 34 + { 35 + struct ext4_xattr_ibody_header *header; 36 + struct ext4_xattr_entry *entry; 37 + struct ext4_inode *raw_inode; 38 + int free, min_offs; 39 + 40 + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - 41 + EXT4_GOOD_OLD_INODE_SIZE - 42 + EXT4_I(inode)->i_extra_isize - 43 + sizeof(struct ext4_xattr_ibody_header); 44 + 45 + /* 46 + * We need to subtract another sizeof(__u32) since an in-inode xattr 47 + * needs an empty 4 bytes to indicate the gap between the xattr entry 48 + * and the name/value pair. 49 + */ 50 + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) 51 + return EXT4_XATTR_SIZE(min_offs - 52 + EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - 53 + EXT4_XATTR_ROUND - sizeof(__u32)); 54 + 55 + raw_inode = ext4_raw_inode(iloc); 56 + header = IHDR(inode, raw_inode); 57 + entry = IFIRST(header); 58 + 59 + /* Compute min_offs. 
*/ 60 + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { 61 + if (!entry->e_value_block && entry->e_value_size) { 62 + size_t offs = le16_to_cpu(entry->e_value_offs); 63 + if (offs < min_offs) 64 + min_offs = offs; 65 + } 66 + } 67 + free = min_offs - 68 + ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); 69 + 70 + if (EXT4_I(inode)->i_inline_off) { 71 + entry = (struct ext4_xattr_entry *) 72 + ((void *)raw_inode + EXT4_I(inode)->i_inline_off); 73 + 74 + free += le32_to_cpu(entry->e_value_size); 75 + goto out; 76 + } 77 + 78 + free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); 79 + 80 + if (free > EXT4_XATTR_ROUND) 81 + free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); 82 + else 83 + free = 0; 84 + 85 + out: 86 + return free; 87 + } 88 + 89 + /* 90 + * Get the maximum size we now can store in an inode. 91 + * If we can't find the space for a xattr entry, don't use the space 92 + * of the extents since we have no space to indicate the inline data. 93 + */ 94 + int ext4_get_max_inline_size(struct inode *inode) 95 + { 96 + int error, max_inline_size; 97 + struct ext4_iloc iloc; 98 + 99 + if (EXT4_I(inode)->i_extra_isize == 0) 100 + return 0; 101 + 102 + error = ext4_get_inode_loc(inode, &iloc); 103 + if (error) { 104 + ext4_error_inode(inode, __func__, __LINE__, 0, 105 + "can't get inode location %lu", 106 + inode->i_ino); 107 + return 0; 108 + } 109 + 110 + down_read(&EXT4_I(inode)->xattr_sem); 111 + max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); 112 + up_read(&EXT4_I(inode)->xattr_sem); 113 + 114 + brelse(iloc.bh); 115 + 116 + if (!max_inline_size) 117 + return 0; 118 + 119 + return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; 120 + } 121 + 122 + int ext4_has_inline_data(struct inode *inode) 123 + { 124 + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && 125 + EXT4_I(inode)->i_inline_off; 126 + } 127 + 128 + /* 129 + * this function does not take xattr_sem, which is OK because it is 130 + * currently only used 
in a code path coming form ext4_iget, before 131 + * the new inode has been unlocked 132 + */ 133 + int ext4_find_inline_data_nolock(struct inode *inode) 134 + { 135 + struct ext4_xattr_ibody_find is = { 136 + .s = { .not_found = -ENODATA, }, 137 + }; 138 + struct ext4_xattr_info i = { 139 + .name_index = EXT4_XATTR_INDEX_SYSTEM, 140 + .name = EXT4_XATTR_SYSTEM_DATA, 141 + }; 142 + int error; 143 + 144 + if (EXT4_I(inode)->i_extra_isize == 0) 145 + return 0; 146 + 147 + error = ext4_get_inode_loc(inode, &is.iloc); 148 + if (error) 149 + return error; 150 + 151 + error = ext4_xattr_ibody_find(inode, &i, &is); 152 + if (error) 153 + goto out; 154 + 155 + if (!is.s.not_found) { 156 + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - 157 + (void *)ext4_raw_inode(&is.iloc)); 158 + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + 159 + le32_to_cpu(is.s.here->e_value_size); 160 + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 161 + } 162 + out: 163 + brelse(is.iloc.bh); 164 + return error; 165 + } 166 + 167 + static int ext4_read_inline_data(struct inode *inode, void *buffer, 168 + unsigned int len, 169 + struct ext4_iloc *iloc) 170 + { 171 + struct ext4_xattr_entry *entry; 172 + struct ext4_xattr_ibody_header *header; 173 + int cp_len = 0; 174 + struct ext4_inode *raw_inode; 175 + 176 + if (!len) 177 + return 0; 178 + 179 + BUG_ON(len > EXT4_I(inode)->i_inline_size); 180 + 181 + cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? 
182 + len : EXT4_MIN_INLINE_DATA_SIZE; 183 + 184 + raw_inode = ext4_raw_inode(iloc); 185 + memcpy(buffer, (void *)(raw_inode->i_block), cp_len); 186 + 187 + len -= cp_len; 188 + buffer += cp_len; 189 + 190 + if (!len) 191 + goto out; 192 + 193 + header = IHDR(inode, raw_inode); 194 + entry = (struct ext4_xattr_entry *)((void *)raw_inode + 195 + EXT4_I(inode)->i_inline_off); 196 + len = min_t(unsigned int, len, 197 + (unsigned int)le32_to_cpu(entry->e_value_size)); 198 + 199 + memcpy(buffer, 200 + (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); 201 + cp_len += len; 202 + 203 + out: 204 + return cp_len; 205 + } 206 + 207 + /* 208 + * write the buffer to the inline inode. 209 + * If 'create' is set, we don't need to do the extra copy in the xattr 210 + * value since it is already handled by ext4_xattr_ibody_inline_set. 211 + * That saves us one memcpy. 212 + */ 213 + void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, 214 + void *buffer, loff_t pos, unsigned int len) 215 + { 216 + struct ext4_xattr_entry *entry; 217 + struct ext4_xattr_ibody_header *header; 218 + struct ext4_inode *raw_inode; 219 + int cp_len = 0; 220 + 221 + BUG_ON(!EXT4_I(inode)->i_inline_off); 222 + BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); 223 + 224 + raw_inode = ext4_raw_inode(iloc); 225 + buffer += pos; 226 + 227 + if (pos < EXT4_MIN_INLINE_DATA_SIZE) { 228 + cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? 
229 + EXT4_MIN_INLINE_DATA_SIZE - pos : len; 230 + memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); 231 + 232 + len -= cp_len; 233 + buffer += cp_len; 234 + pos += cp_len; 235 + } 236 + 237 + if (!len) 238 + return; 239 + 240 + pos -= EXT4_MIN_INLINE_DATA_SIZE; 241 + header = IHDR(inode, raw_inode); 242 + entry = (struct ext4_xattr_entry *)((void *)raw_inode + 243 + EXT4_I(inode)->i_inline_off); 244 + 245 + memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, 246 + buffer, len); 247 + } 248 + 249 + static int ext4_create_inline_data(handle_t *handle, 250 + struct inode *inode, unsigned len) 251 + { 252 + int error; 253 + void *value = NULL; 254 + struct ext4_xattr_ibody_find is = { 255 + .s = { .not_found = -ENODATA, }, 256 + }; 257 + struct ext4_xattr_info i = { 258 + .name_index = EXT4_XATTR_INDEX_SYSTEM, 259 + .name = EXT4_XATTR_SYSTEM_DATA, 260 + }; 261 + 262 + error = ext4_get_inode_loc(inode, &is.iloc); 263 + if (error) 264 + return error; 265 + 266 + error = ext4_journal_get_write_access(handle, is.iloc.bh); 267 + if (error) 268 + goto out; 269 + 270 + if (len > EXT4_MIN_INLINE_DATA_SIZE) { 271 + value = EXT4_ZERO_XATTR_VALUE; 272 + len -= EXT4_MIN_INLINE_DATA_SIZE; 273 + } else { 274 + value = ""; 275 + len = 0; 276 + } 277 + 278 + /* Insert the the xttr entry. 
*/ 279 + i.value = value; 280 + i.value_len = len; 281 + 282 + error = ext4_xattr_ibody_find(inode, &i, &is); 283 + if (error) 284 + goto out; 285 + 286 + BUG_ON(!is.s.not_found); 287 + 288 + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); 289 + if (error) { 290 + if (error == -ENOSPC) 291 + ext4_clear_inode_state(inode, 292 + EXT4_STATE_MAY_INLINE_DATA); 293 + goto out; 294 + } 295 + 296 + memset((void *)ext4_raw_inode(&is.iloc)->i_block, 297 + 0, EXT4_MIN_INLINE_DATA_SIZE); 298 + 299 + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - 300 + (void *)ext4_raw_inode(&is.iloc)); 301 + EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; 302 + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); 303 + ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); 304 + get_bh(is.iloc.bh); 305 + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 306 + 307 + out: 308 + brelse(is.iloc.bh); 309 + return error; 310 + } 311 + 312 + static int ext4_update_inline_data(handle_t *handle, struct inode *inode, 313 + unsigned int len) 314 + { 315 + int error; 316 + void *value = NULL; 317 + struct ext4_xattr_ibody_find is = { 318 + .s = { .not_found = -ENODATA, }, 319 + }; 320 + struct ext4_xattr_info i = { 321 + .name_index = EXT4_XATTR_INDEX_SYSTEM, 322 + .name = EXT4_XATTR_SYSTEM_DATA, 323 + }; 324 + 325 + /* If the old space is ok, write the data directly. 
*/ 326 + if (len <= EXT4_I(inode)->i_inline_size) 327 + return 0; 328 + 329 + error = ext4_get_inode_loc(inode, &is.iloc); 330 + if (error) 331 + return error; 332 + 333 + error = ext4_xattr_ibody_find(inode, &i, &is); 334 + if (error) 335 + goto out; 336 + 337 + BUG_ON(is.s.not_found); 338 + 339 + len -= EXT4_MIN_INLINE_DATA_SIZE; 340 + value = kzalloc(len, GFP_NOFS); 341 + if (!value) 342 + goto out; 343 + 344 + error = ext4_xattr_ibody_get(inode, i.name_index, i.name, 345 + value, len); 346 + if (error == -ENODATA) 347 + goto out; 348 + 349 + error = ext4_journal_get_write_access(handle, is.iloc.bh); 350 + if (error) 351 + goto out; 352 + 353 + /* Update the xttr entry. */ 354 + i.value = value; 355 + i.value_len = len; 356 + 357 + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); 358 + if (error) 359 + goto out; 360 + 361 + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - 362 + (void *)ext4_raw_inode(&is.iloc)); 363 + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + 364 + le32_to_cpu(is.s.here->e_value_size); 365 + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 366 + get_bh(is.iloc.bh); 367 + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 368 + 369 + out: 370 + kfree(value); 371 + brelse(is.iloc.bh); 372 + return error; 373 + } 374 + 375 + int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, 376 + unsigned int len) 377 + { 378 + int ret, size; 379 + struct ext4_inode_info *ei = EXT4_I(inode); 380 + 381 + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) 382 + return -ENOSPC; 383 + 384 + size = ext4_get_max_inline_size(inode); 385 + if (size < len) 386 + return -ENOSPC; 387 + 388 + down_write(&EXT4_I(inode)->xattr_sem); 389 + 390 + if (ei->i_inline_off) 391 + ret = ext4_update_inline_data(handle, inode, len); 392 + else 393 + ret = ext4_create_inline_data(handle, inode, len); 394 + 395 + up_write(&EXT4_I(inode)->xattr_sem); 396 + 397 + return ret; 398 + } 399 + 400 + static int 
ext4_destroy_inline_data_nolock(handle_t *handle, 401 + struct inode *inode) 402 + { 403 + struct ext4_inode_info *ei = EXT4_I(inode); 404 + struct ext4_xattr_ibody_find is = { 405 + .s = { .not_found = 0, }, 406 + }; 407 + struct ext4_xattr_info i = { 408 + .name_index = EXT4_XATTR_INDEX_SYSTEM, 409 + .name = EXT4_XATTR_SYSTEM_DATA, 410 + .value = NULL, 411 + .value_len = 0, 412 + }; 413 + int error; 414 + 415 + if (!ei->i_inline_off) 416 + return 0; 417 + 418 + error = ext4_get_inode_loc(inode, &is.iloc); 419 + if (error) 420 + return error; 421 + 422 + error = ext4_xattr_ibody_find(inode, &i, &is); 423 + if (error) 424 + goto out; 425 + 426 + error = ext4_journal_get_write_access(handle, is.iloc.bh); 427 + if (error) 428 + goto out; 429 + 430 + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); 431 + if (error) 432 + goto out; 433 + 434 + memset((void *)ext4_raw_inode(&is.iloc)->i_block, 435 + 0, EXT4_MIN_INLINE_DATA_SIZE); 436 + 437 + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 438 + EXT4_FEATURE_INCOMPAT_EXTENTS)) { 439 + if (S_ISDIR(inode->i_mode) || 440 + S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { 441 + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); 442 + ext4_ext_tree_init(handle, inode); 443 + } 444 + } 445 + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); 446 + 447 + get_bh(is.iloc.bh); 448 + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 449 + 450 + EXT4_I(inode)->i_inline_off = 0; 451 + EXT4_I(inode)->i_inline_size = 0; 452 + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 453 + out: 454 + brelse(is.iloc.bh); 455 + if (error == -ENODATA) 456 + error = 0; 457 + return error; 458 + } 459 + 460 + static int ext4_read_inline_page(struct inode *inode, struct page *page) 461 + { 462 + void *kaddr; 463 + int ret = 0; 464 + size_t len; 465 + struct ext4_iloc iloc; 466 + 467 + BUG_ON(!PageLocked(page)); 468 + BUG_ON(!ext4_has_inline_data(inode)); 469 + BUG_ON(page->index); 470 + 471 + if (!EXT4_I(inode)->i_inline_off) 
{ 472 + ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", 473 + inode->i_ino); 474 + goto out; 475 + } 476 + 477 + ret = ext4_get_inode_loc(inode, &iloc); 478 + if (ret) 479 + goto out; 480 + 481 + len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); 482 + kaddr = kmap_atomic(page); 483 + ret = ext4_read_inline_data(inode, kaddr, len, &iloc); 484 + flush_dcache_page(page); 485 + kunmap_atomic(kaddr); 486 + zero_user_segment(page, len, PAGE_CACHE_SIZE); 487 + SetPageUptodate(page); 488 + brelse(iloc.bh); 489 + 490 + out: 491 + return ret; 492 + } 493 + 494 + int ext4_readpage_inline(struct inode *inode, struct page *page) 495 + { 496 + int ret = 0; 497 + 498 + down_read(&EXT4_I(inode)->xattr_sem); 499 + if (!ext4_has_inline_data(inode)) { 500 + up_read(&EXT4_I(inode)->xattr_sem); 501 + return -EAGAIN; 502 + } 503 + 504 + /* 505 + * Current inline data can only exist in the 1st page, 506 + * So for all the other pages, just set them uptodate. 507 + */ 508 + if (!page->index) 509 + ret = ext4_read_inline_page(inode, page); 510 + else if (!PageUptodate(page)) { 511 + zero_user_segment(page, 0, PAGE_CACHE_SIZE); 512 + SetPageUptodate(page); 513 + } 514 + 515 + up_read(&EXT4_I(inode)->xattr_sem); 516 + 517 + unlock_page(page); 518 + return ret >= 0 ? 0 : ret; 519 + } 520 + 521 + static int ext4_convert_inline_data_to_extent(struct address_space *mapping, 522 + struct inode *inode, 523 + unsigned flags) 524 + { 525 + int ret, needed_blocks; 526 + handle_t *handle = NULL; 527 + int retries = 0, sem_held = 0; 528 + struct page *page = NULL; 529 + unsigned from, to; 530 + struct ext4_iloc iloc; 531 + 532 + if (!ext4_has_inline_data(inode)) { 533 + /* 534 + * clear the flag so that no new write 535 + * will trap here again. 
536 + */ 537 + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 538 + return 0; 539 + } 540 + 541 + needed_blocks = ext4_writepage_trans_blocks(inode); 542 + 543 + ret = ext4_get_inode_loc(inode, &iloc); 544 + if (ret) 545 + return ret; 546 + 547 + retry: 548 + handle = ext4_journal_start(inode, needed_blocks); 549 + if (IS_ERR(handle)) { 550 + ret = PTR_ERR(handle); 551 + handle = NULL; 552 + goto out; 553 + } 554 + 555 + /* We cannot recurse into the filesystem as the transaction is already 556 + * started */ 557 + flags |= AOP_FLAG_NOFS; 558 + 559 + page = grab_cache_page_write_begin(mapping, 0, flags); 560 + if (!page) { 561 + ret = -ENOMEM; 562 + goto out; 563 + } 564 + 565 + down_write(&EXT4_I(inode)->xattr_sem); 566 + sem_held = 1; 567 + /* If some one has already done this for us, just exit. */ 568 + if (!ext4_has_inline_data(inode)) { 569 + ret = 0; 570 + goto out; 571 + } 572 + 573 + from = 0; 574 + to = ext4_get_inline_size(inode); 575 + if (!PageUptodate(page)) { 576 + ret = ext4_read_inline_page(inode, page); 577 + if (ret < 0) 578 + goto out; 579 + } 580 + 581 + ret = ext4_destroy_inline_data_nolock(handle, inode); 582 + if (ret) 583 + goto out; 584 + 585 + if (ext4_should_dioread_nolock(inode)) 586 + ret = __block_write_begin(page, from, to, ext4_get_block_write); 587 + else 588 + ret = __block_write_begin(page, from, to, ext4_get_block); 589 + 590 + if (!ret && ext4_should_journal_data(inode)) { 591 + ret = ext4_walk_page_buffers(handle, page_buffers(page), 592 + from, to, NULL, 593 + do_journal_get_write_access); 594 + } 595 + 596 + if (ret) { 597 + unlock_page(page); 598 + page_cache_release(page); 599 + ext4_orphan_add(handle, inode); 600 + up_write(&EXT4_I(inode)->xattr_sem); 601 + sem_held = 0; 602 + ext4_journal_stop(handle); 603 + handle = NULL; 604 + ext4_truncate_failed_write(inode); 605 + /* 606 + * If truncate failed early the inode might 607 + * still be on the orphan list; we need to 608 + * make sure the inode is removed from 
the 609 + * orphan list in that case. 610 + */ 611 + if (inode->i_nlink) 612 + ext4_orphan_del(NULL, inode); 613 + } 614 + 615 + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 616 + goto retry; 617 + 618 + block_commit_write(page, from, to); 619 + out: 620 + if (page) { 621 + unlock_page(page); 622 + page_cache_release(page); 623 + } 624 + if (sem_held) 625 + up_write(&EXT4_I(inode)->xattr_sem); 626 + if (handle) 627 + ext4_journal_stop(handle); 628 + brelse(iloc.bh); 629 + return ret; 630 + } 631 + 632 + /* 633 + * Try to write data in the inode. 634 + * If the inode has inline data, check whether the new write can be 635 + * in the inode also. If not, create the page the handle, move the data 636 + * to the page make it update and let the later codes create extent for it. 637 + */ 638 + int ext4_try_to_write_inline_data(struct address_space *mapping, 639 + struct inode *inode, 640 + loff_t pos, unsigned len, 641 + unsigned flags, 642 + struct page **pagep) 643 + { 644 + int ret; 645 + handle_t *handle; 646 + struct page *page; 647 + struct ext4_iloc iloc; 648 + 649 + if (pos + len > ext4_get_max_inline_size(inode)) 650 + goto convert; 651 + 652 + ret = ext4_get_inode_loc(inode, &iloc); 653 + if (ret) 654 + return ret; 655 + 656 + /* 657 + * The possible write could happen in the inode, 658 + * so try to reserve the space in inode first. 659 + */ 660 + handle = ext4_journal_start(inode, 1); 661 + if (IS_ERR(handle)) { 662 + ret = PTR_ERR(handle); 663 + handle = NULL; 664 + goto out; 665 + } 666 + 667 + ret = ext4_prepare_inline_data(handle, inode, pos + len); 668 + if (ret && ret != -ENOSPC) 669 + goto out; 670 + 671 + /* We don't have space in inline inode, so convert it to extent. 
*/ 672 + if (ret == -ENOSPC) { 673 + ext4_journal_stop(handle); 674 + brelse(iloc.bh); 675 + goto convert; 676 + } 677 + 678 + flags |= AOP_FLAG_NOFS; 679 + 680 + page = grab_cache_page_write_begin(mapping, 0, flags); 681 + if (!page) { 682 + ret = -ENOMEM; 683 + goto out; 684 + } 685 + 686 + *pagep = page; 687 + down_read(&EXT4_I(inode)->xattr_sem); 688 + if (!ext4_has_inline_data(inode)) { 689 + ret = 0; 690 + unlock_page(page); 691 + page_cache_release(page); 692 + goto out_up_read; 693 + } 694 + 695 + if (!PageUptodate(page)) { 696 + ret = ext4_read_inline_page(inode, page); 697 + if (ret < 0) 698 + goto out_up_read; 699 + } 700 + 701 + ret = 1; 702 + handle = NULL; 703 + out_up_read: 704 + up_read(&EXT4_I(inode)->xattr_sem); 705 + out: 706 + if (handle) 707 + ext4_journal_stop(handle); 708 + brelse(iloc.bh); 709 + return ret; 710 + convert: 711 + return ext4_convert_inline_data_to_extent(mapping, 712 + inode, flags); 713 + } 714 + 715 + int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, 716 + unsigned copied, struct page *page) 717 + { 718 + int ret; 719 + void *kaddr; 720 + struct ext4_iloc iloc; 721 + 722 + if (unlikely(copied < len)) { 723 + if (!PageUptodate(page)) { 724 + copied = 0; 725 + goto out; 726 + } 727 + } 728 + 729 + ret = ext4_get_inode_loc(inode, &iloc); 730 + if (ret) { 731 + ext4_std_error(inode->i_sb, ret); 732 + copied = 0; 733 + goto out; 734 + } 735 + 736 + down_write(&EXT4_I(inode)->xattr_sem); 737 + BUG_ON(!ext4_has_inline_data(inode)); 738 + 739 + kaddr = kmap_atomic(page); 740 + ext4_write_inline_data(inode, &iloc, kaddr, pos, len); 741 + kunmap_atomic(kaddr); 742 + SetPageUptodate(page); 743 + /* clear page dirty so that writepages wouldn't work for us. 
*/ 744 + ClearPageDirty(page); 745 + 746 + up_write(&EXT4_I(inode)->xattr_sem); 747 + brelse(iloc.bh); 748 + out: 749 + return copied; 750 + } 751 + 752 + struct buffer_head * 753 + ext4_journalled_write_inline_data(struct inode *inode, 754 + unsigned len, 755 + struct page *page) 756 + { 757 + int ret; 758 + void *kaddr; 759 + struct ext4_iloc iloc; 760 + 761 + ret = ext4_get_inode_loc(inode, &iloc); 762 + if (ret) { 763 + ext4_std_error(inode->i_sb, ret); 764 + return NULL; 765 + } 766 + 767 + down_write(&EXT4_I(inode)->xattr_sem); 768 + kaddr = kmap_atomic(page); 769 + ext4_write_inline_data(inode, &iloc, kaddr, 0, len); 770 + kunmap_atomic(kaddr); 771 + up_write(&EXT4_I(inode)->xattr_sem); 772 + 773 + return iloc.bh; 774 + } 775 + 776 + /* 777 + * Try to make the page cache and handle ready for the inline data case. 778 + * We can call this function in 2 cases: 779 + * 1. The inode is created and the first write exceeds inline size. We can 780 + * clear the inode state safely. 781 + * 2. The inode has inline data, then we need to read the data, make it 782 + * update and dirty so that ext4_da_writepages can handle it. We don't 783 + * need to start the journal since the file's metatdata isn't changed now. 
784 + */ 785 + static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, 786 + struct inode *inode, 787 + unsigned flags, 788 + void **fsdata) 789 + { 790 + int ret = 0, inline_size; 791 + struct page *page; 792 + 793 + page = grab_cache_page_write_begin(mapping, 0, flags); 794 + if (!page) 795 + return -ENOMEM; 796 + 797 + down_read(&EXT4_I(inode)->xattr_sem); 798 + if (!ext4_has_inline_data(inode)) { 799 + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 800 + goto out; 801 + } 802 + 803 + inline_size = ext4_get_inline_size(inode); 804 + 805 + if (!PageUptodate(page)) { 806 + ret = ext4_read_inline_page(inode, page); 807 + if (ret < 0) 808 + goto out; 809 + } 810 + 811 + ret = __block_write_begin(page, 0, inline_size, 812 + ext4_da_get_block_prep); 813 + if (ret) { 814 + ext4_truncate_failed_write(inode); 815 + goto out; 816 + } 817 + 818 + SetPageDirty(page); 819 + SetPageUptodate(page); 820 + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 821 + *fsdata = (void *)CONVERT_INLINE_DATA; 822 + 823 + out: 824 + up_read(&EXT4_I(inode)->xattr_sem); 825 + if (page) { 826 + unlock_page(page); 827 + page_cache_release(page); 828 + } 829 + return ret; 830 + } 831 + 832 + /* 833 + * Prepare the write for the inline data. 834 + * If the the data can be written into the inode, we just read 835 + * the page and make it uptodate, and start the journal. 836 + * Otherwise read the page, makes it dirty so that it can be 837 + * handle in writepages(the i_disksize update is left to the 838 + * normal ext4_da_write_end). 
839 + */ 840 + int ext4_da_write_inline_data_begin(struct address_space *mapping, 841 + struct inode *inode, 842 + loff_t pos, unsigned len, 843 + unsigned flags, 844 + struct page **pagep, 845 + void **fsdata) 846 + { 847 + int ret, inline_size; 848 + handle_t *handle; 849 + struct page *page; 850 + struct ext4_iloc iloc; 851 + 852 + ret = ext4_get_inode_loc(inode, &iloc); 853 + if (ret) 854 + return ret; 855 + 856 + handle = ext4_journal_start(inode, 1); 857 + if (IS_ERR(handle)) { 858 + ret = PTR_ERR(handle); 859 + handle = NULL; 860 + goto out; 861 + } 862 + 863 + inline_size = ext4_get_max_inline_size(inode); 864 + 865 + ret = -ENOSPC; 866 + if (inline_size >= pos + len) { 867 + ret = ext4_prepare_inline_data(handle, inode, pos + len); 868 + if (ret && ret != -ENOSPC) 869 + goto out; 870 + } 871 + 872 + if (ret == -ENOSPC) { 873 + ret = ext4_da_convert_inline_data_to_extent(mapping, 874 + inode, 875 + flags, 876 + fsdata); 877 + goto out; 878 + } 879 + 880 + /* 881 + * We cannot recurse into the filesystem as the transaction 882 + * is already started. 
883 + */ 884 + flags |= AOP_FLAG_NOFS; 885 + 886 + page = grab_cache_page_write_begin(mapping, 0, flags); 887 + if (!page) { 888 + ret = -ENOMEM; 889 + goto out; 890 + } 891 + 892 + down_read(&EXT4_I(inode)->xattr_sem); 893 + if (!ext4_has_inline_data(inode)) { 894 + ret = 0; 895 + goto out_release_page; 896 + } 897 + 898 + if (!PageUptodate(page)) { 899 + ret = ext4_read_inline_page(inode, page); 900 + if (ret < 0) 901 + goto out_release_page; 902 + } 903 + 904 + up_read(&EXT4_I(inode)->xattr_sem); 905 + *pagep = page; 906 + handle = NULL; 907 + brelse(iloc.bh); 908 + return 1; 909 + out_release_page: 910 + up_read(&EXT4_I(inode)->xattr_sem); 911 + unlock_page(page); 912 + page_cache_release(page); 913 + out: 914 + if (handle) 915 + ext4_journal_stop(handle); 916 + brelse(iloc.bh); 917 + return ret; 918 + } 919 + 920 + int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, 921 + unsigned len, unsigned copied, 922 + struct page *page) 923 + { 924 + int i_size_changed = 0; 925 + 926 + copied = ext4_write_inline_data_end(inode, pos, len, copied, page); 927 + 928 + /* 929 + * No need to use i_size_read() here, the i_size 930 + * cannot change under us because we hold i_mutex. 931 + * 932 + * But it's important to update i_size while still holding page lock: 933 + * page writeout could otherwise come in and zero beyond i_size. 934 + */ 935 + if (pos+copied > inode->i_size) { 936 + i_size_write(inode, pos+copied); 937 + i_size_changed = 1; 938 + } 939 + unlock_page(page); 940 + page_cache_release(page); 941 + 942 + /* 943 + * Don't mark the inode dirty under page lock. First, it unnecessarily 944 + * makes the holding time of page lock longer. Second, it forces lock 945 + * ordering of page lock and transaction start for journaling 946 + * filesystems. 
947 + */ 948 + if (i_size_changed) 949 + mark_inode_dirty(inode); 950 + 951 + return copied; 952 + } 953 + 954 + #ifdef INLINE_DIR_DEBUG 955 + void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, 956 + void *inline_start, int inline_size) 957 + { 958 + int offset; 959 + unsigned short de_len; 960 + struct ext4_dir_entry_2 *de = inline_start; 961 + void *dlimit = inline_start + inline_size; 962 + 963 + trace_printk("inode %lu\n", dir->i_ino); 964 + offset = 0; 965 + while ((void *)de < dlimit) { 966 + de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); 967 + trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", 968 + offset, de_len, de->name_len, de->name, 969 + de->name_len, le32_to_cpu(de->inode)); 970 + if (ext4_check_dir_entry(dir, NULL, de, bh, 971 + inline_start, inline_size, offset)) 972 + BUG(); 973 + 974 + offset += de_len; 975 + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); 976 + } 977 + } 978 + #else 979 + #define ext4_show_inline_dir(dir, bh, inline_start, inline_size) 980 + #endif 981 + 982 + /* 983 + * Add a new entry into a inline dir. 984 + * It will return -ENOSPC if no space is available, and -EIO 985 + * and -EEXIST if directory entry already exists. 
986 + */ 987 + static int ext4_add_dirent_to_inline(handle_t *handle, 988 + struct dentry *dentry, 989 + struct inode *inode, 990 + struct ext4_iloc *iloc, 991 + void *inline_start, int inline_size) 992 + { 993 + struct inode *dir = dentry->d_parent->d_inode; 994 + const char *name = dentry->d_name.name; 995 + int namelen = dentry->d_name.len; 996 + unsigned short reclen; 997 + int err; 998 + struct ext4_dir_entry_2 *de; 999 + 1000 + reclen = EXT4_DIR_REC_LEN(namelen); 1001 + err = ext4_find_dest_de(dir, inode, iloc->bh, 1002 + inline_start, inline_size, 1003 + name, namelen, &de); 1004 + if (err) 1005 + return err; 1006 + 1007 + err = ext4_journal_get_write_access(handle, iloc->bh); 1008 + if (err) 1009 + return err; 1010 + ext4_insert_dentry(inode, de, inline_size, name, namelen); 1011 + 1012 + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); 1013 + 1014 + /* 1015 + * XXX shouldn't update any times until successful 1016 + * completion of syscall, but too many callers depend 1017 + * on this. 1018 + * 1019 + * XXX similarly, too many callers depend on 1020 + * ext4_new_inode() setting the times, but error 1021 + * recovery deletes the inode, so the worst that can 1022 + * happen is that the times are slightly out of date 1023 + * and/or different from the directory change time. 
1024 + */ 1025 + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); 1026 + ext4_update_dx_flag(dir); 1027 + dir->i_version++; 1028 + ext4_mark_inode_dirty(handle, dir); 1029 + return 1; 1030 + } 1031 + 1032 + static void *ext4_get_inline_xattr_pos(struct inode *inode, 1033 + struct ext4_iloc *iloc) 1034 + { 1035 + struct ext4_xattr_entry *entry; 1036 + struct ext4_xattr_ibody_header *header; 1037 + 1038 + BUG_ON(!EXT4_I(inode)->i_inline_off); 1039 + 1040 + header = IHDR(inode, ext4_raw_inode(iloc)); 1041 + entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + 1042 + EXT4_I(inode)->i_inline_off); 1043 + 1044 + return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); 1045 + } 1046 + 1047 + /* Set the final de to cover the whole block. */ 1048 + static void ext4_update_final_de(void *de_buf, int old_size, int new_size) 1049 + { 1050 + struct ext4_dir_entry_2 *de, *prev_de; 1051 + void *limit; 1052 + int de_len; 1053 + 1054 + de = (struct ext4_dir_entry_2 *)de_buf; 1055 + if (old_size) { 1056 + limit = de_buf + old_size; 1057 + do { 1058 + prev_de = de; 1059 + de_len = ext4_rec_len_from_disk(de->rec_len, old_size); 1060 + de_buf += de_len; 1061 + de = (struct ext4_dir_entry_2 *)de_buf; 1062 + } while (de_buf < limit); 1063 + 1064 + prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - 1065 + old_size, new_size); 1066 + } else { 1067 + /* this is just created, so create an empty entry. 
*/ 1068 + de->inode = 0; 1069 + de->rec_len = ext4_rec_len_to_disk(new_size, new_size); 1070 + } 1071 + } 1072 + 1073 + static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, 1074 + struct ext4_iloc *iloc) 1075 + { 1076 + int ret; 1077 + int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; 1078 + int new_size = get_max_inline_xattr_value_size(dir, iloc); 1079 + 1080 + if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) 1081 + return -ENOSPC; 1082 + 1083 + ret = ext4_update_inline_data(handle, dir, 1084 + new_size + EXT4_MIN_INLINE_DATA_SIZE); 1085 + if (ret) 1086 + return ret; 1087 + 1088 + ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, 1089 + EXT4_I(dir)->i_inline_size - 1090 + EXT4_MIN_INLINE_DATA_SIZE); 1091 + dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; 1092 + return 0; 1093 + } 1094 + 1095 + static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, 1096 + struct ext4_iloc *iloc, 1097 + void *buf, int inline_size) 1098 + { 1099 + ext4_create_inline_data(handle, inode, inline_size); 1100 + ext4_write_inline_data(inode, iloc, buf, 0, inline_size); 1101 + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1102 + } 1103 + 1104 + static int ext4_finish_convert_inline_dir(handle_t *handle, 1105 + struct inode *inode, 1106 + struct buffer_head *dir_block, 1107 + void *buf, 1108 + int inline_size) 1109 + { 1110 + int err, csum_size = 0, header_size = 0; 1111 + struct ext4_dir_entry_2 *de; 1112 + struct ext4_dir_entry_tail *t; 1113 + void *target = dir_block->b_data; 1114 + 1115 + /* 1116 + * First create "." and ".." and then copy the dir information 1117 + * back to the block. 
1118 + */ 1119 + de = (struct ext4_dir_entry_2 *)target; 1120 + de = ext4_init_dot_dotdot(inode, de, 1121 + inode->i_sb->s_blocksize, csum_size, 1122 + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); 1123 + header_size = (void *)de - target; 1124 + 1125 + memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, 1126 + inline_size - EXT4_INLINE_DOTDOT_SIZE); 1127 + 1128 + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1129 + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1130 + csum_size = sizeof(struct ext4_dir_entry_tail); 1131 + 1132 + inode->i_size = inode->i_sb->s_blocksize; 1133 + i_size_write(inode, inode->i_sb->s_blocksize); 1134 + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1135 + ext4_update_final_de(dir_block->b_data, 1136 + inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, 1137 + inode->i_sb->s_blocksize - csum_size); 1138 + 1139 + if (csum_size) { 1140 + t = EXT4_DIRENT_TAIL(dir_block->b_data, 1141 + inode->i_sb->s_blocksize); 1142 + initialize_dirent_tail(t, inode->i_sb->s_blocksize); 1143 + } 1144 + set_buffer_uptodate(dir_block); 1145 + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); 1146 + if (err) 1147 + goto out; 1148 + set_buffer_verified(dir_block); 1149 + out: 1150 + return err; 1151 + } 1152 + 1153 + static int ext4_convert_inline_data_nolock(handle_t *handle, 1154 + struct inode *inode, 1155 + struct ext4_iloc *iloc) 1156 + { 1157 + int error; 1158 + void *buf = NULL; 1159 + struct buffer_head *data_bh = NULL; 1160 + struct ext4_map_blocks map; 1161 + int inline_size; 1162 + 1163 + inline_size = ext4_get_inline_size(inode); 1164 + buf = kmalloc(inline_size, GFP_NOFS); 1165 + if (!buf) { 1166 + error = -ENOMEM; 1167 + goto out; 1168 + } 1169 + 1170 + error = ext4_read_inline_data(inode, buf, inline_size, iloc); 1171 + if (error < 0) 1172 + goto out; 1173 + 1174 + error = ext4_destroy_inline_data_nolock(handle, inode); 1175 + if (error) 1176 + goto out; 1177 + 1178 + map.m_lblk = 0; 1179 + map.m_len = 1; 1180 + map.m_flags = 
0; 1181 + error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); 1182 + if (error < 0) 1183 + goto out_restore; 1184 + if (!(map.m_flags & EXT4_MAP_MAPPED)) { 1185 + error = -EIO; 1186 + goto out_restore; 1187 + } 1188 + 1189 + data_bh = sb_getblk(inode->i_sb, map.m_pblk); 1190 + if (!data_bh) { 1191 + error = -EIO; 1192 + goto out_restore; 1193 + } 1194 + 1195 + lock_buffer(data_bh); 1196 + error = ext4_journal_get_create_access(handle, data_bh); 1197 + if (error) { 1198 + unlock_buffer(data_bh); 1199 + error = -EIO; 1200 + goto out_restore; 1201 + } 1202 + memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); 1203 + 1204 + if (!S_ISDIR(inode->i_mode)) { 1205 + memcpy(data_bh->b_data, buf, inline_size); 1206 + set_buffer_uptodate(data_bh); 1207 + error = ext4_handle_dirty_metadata(handle, 1208 + inode, data_bh); 1209 + } else { 1210 + error = ext4_finish_convert_inline_dir(handle, inode, data_bh, 1211 + buf, inline_size); 1212 + } 1213 + 1214 + unlock_buffer(data_bh); 1215 + out_restore: 1216 + if (error) 1217 + ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); 1218 + 1219 + out: 1220 + brelse(data_bh); 1221 + kfree(buf); 1222 + return error; 1223 + } 1224 + 1225 + /* 1226 + * Try to add the new entry to the inline data. 1227 + * If succeeds, return 0. If not, extended the inline dir and copied data to 1228 + * the new created block. 
1229 + */ 1230 + int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, 1231 + struct inode *inode) 1232 + { 1233 + int ret, inline_size; 1234 + void *inline_start; 1235 + struct ext4_iloc iloc; 1236 + struct inode *dir = dentry->d_parent->d_inode; 1237 + 1238 + ret = ext4_get_inode_loc(dir, &iloc); 1239 + if (ret) 1240 + return ret; 1241 + 1242 + down_write(&EXT4_I(dir)->xattr_sem); 1243 + if (!ext4_has_inline_data(dir)) 1244 + goto out; 1245 + 1246 + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + 1247 + EXT4_INLINE_DOTDOT_SIZE; 1248 + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; 1249 + 1250 + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, 1251 + inline_start, inline_size); 1252 + if (ret != -ENOSPC) 1253 + goto out; 1254 + 1255 + /* check whether it can be inserted to inline xattr space. */ 1256 + inline_size = EXT4_I(dir)->i_inline_size - 1257 + EXT4_MIN_INLINE_DATA_SIZE; 1258 + if (!inline_size) { 1259 + /* Try to use the xattr space.*/ 1260 + ret = ext4_update_inline_dir(handle, dir, &iloc); 1261 + if (ret && ret != -ENOSPC) 1262 + goto out; 1263 + 1264 + inline_size = EXT4_I(dir)->i_inline_size - 1265 + EXT4_MIN_INLINE_DATA_SIZE; 1266 + } 1267 + 1268 + if (inline_size) { 1269 + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); 1270 + 1271 + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, 1272 + inline_start, inline_size); 1273 + 1274 + if (ret != -ENOSPC) 1275 + goto out; 1276 + } 1277 + 1278 + /* 1279 + * The inline space is filled up, so create a new block for it. 1280 + * As the extent tree will be created, we have to save the inline 1281 + * dir first. 
1282 + */ 1283 + ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); 1284 + 1285 + out: 1286 + ext4_mark_inode_dirty(handle, dir); 1287 + up_write(&EXT4_I(dir)->xattr_sem); 1288 + brelse(iloc.bh); 1289 + return ret; 1290 + } 1291 + 1292 + int ext4_read_inline_dir(struct file *filp, 1293 + void *dirent, filldir_t filldir, 1294 + int *has_inline_data) 1295 + { 1296 + int error = 0; 1297 + unsigned int offset, parent_ino; 1298 + int i, stored; 1299 + struct ext4_dir_entry_2 *de; 1300 + struct super_block *sb; 1301 + struct inode *inode = filp->f_path.dentry->d_inode; 1302 + int ret, inline_size = 0; 1303 + struct ext4_iloc iloc; 1304 + void *dir_buf = NULL; 1305 + 1306 + ret = ext4_get_inode_loc(inode, &iloc); 1307 + if (ret) 1308 + return ret; 1309 + 1310 + down_read(&EXT4_I(inode)->xattr_sem); 1311 + if (!ext4_has_inline_data(inode)) { 1312 + up_read(&EXT4_I(inode)->xattr_sem); 1313 + *has_inline_data = 0; 1314 + goto out; 1315 + } 1316 + 1317 + inline_size = ext4_get_inline_size(inode); 1318 + dir_buf = kmalloc(inline_size, GFP_NOFS); 1319 + if (!dir_buf) { 1320 + ret = -ENOMEM; 1321 + up_read(&EXT4_I(inode)->xattr_sem); 1322 + goto out; 1323 + } 1324 + 1325 + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); 1326 + up_read(&EXT4_I(inode)->xattr_sem); 1327 + if (ret < 0) 1328 + goto out; 1329 + 1330 + sb = inode->i_sb; 1331 + stored = 0; 1332 + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); 1333 + 1334 + while (!error && !stored && filp->f_pos < inode->i_size) { 1335 + revalidate: 1336 + /* 1337 + * If the version has changed since the last call to 1338 + * readdir(2), then we might be pointing to an invalid 1339 + * dirent right now. Scan from the start of the inline 1340 + * dir to make sure. 1341 + */ 1342 + if (filp->f_version != inode->i_version) { 1343 + for (i = 0; 1344 + i < inode->i_size && i < offset;) { 1345 + if (!i) { 1346 + /* skip "." and ".." if needed. 
*/ 1347 + i += EXT4_INLINE_DOTDOT_SIZE; 1348 + continue; 1349 + } 1350 + de = (struct ext4_dir_entry_2 *) 1351 + (dir_buf + i); 1352 + /* It's too expensive to do a full 1353 + * dirent test each time round this 1354 + * loop, but we do have to test at 1355 + * least that it is non-zero. A 1356 + * failure will be detected in the 1357 + * dirent test below. */ 1358 + if (ext4_rec_len_from_disk(de->rec_len, 1359 + inline_size) < EXT4_DIR_REC_LEN(1)) 1360 + break; 1361 + i += ext4_rec_len_from_disk(de->rec_len, 1362 + inline_size); 1363 + } 1364 + offset = i; 1365 + filp->f_pos = offset; 1366 + filp->f_version = inode->i_version; 1367 + } 1368 + 1369 + while (!error && filp->f_pos < inode->i_size) { 1370 + if (filp->f_pos == 0) { 1371 + error = filldir(dirent, ".", 1, 0, inode->i_ino, 1372 + DT_DIR); 1373 + if (error) 1374 + break; 1375 + stored++; 1376 + 1377 + error = filldir(dirent, "..", 2, 0, parent_ino, 1378 + DT_DIR); 1379 + if (error) 1380 + break; 1381 + stored++; 1382 + 1383 + filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; 1384 + continue; 1385 + } 1386 + 1387 + de = (struct ext4_dir_entry_2 *)(dir_buf + offset); 1388 + if (ext4_check_dir_entry(inode, filp, de, 1389 + iloc.bh, dir_buf, 1390 + inline_size, offset)) { 1391 + ret = stored; 1392 + goto out; 1393 + } 1394 + offset += ext4_rec_len_from_disk(de->rec_len, 1395 + inline_size); 1396 + if (le32_to_cpu(de->inode)) { 1397 + /* We might block in the next section 1398 + * if the data destination is 1399 + * currently swapped out. So, use a 1400 + * version stamp to detect whether or 1401 + * not the directory has been modified 1402 + * during the copy operation. 
1403 + */ 1404 + u64 version = filp->f_version; 1405 + 1406 + error = filldir(dirent, de->name, 1407 + de->name_len, 1408 + filp->f_pos, 1409 + le32_to_cpu(de->inode), 1410 + get_dtype(sb, de->file_type)); 1411 + if (error) 1412 + break; 1413 + if (version != filp->f_version) 1414 + goto revalidate; 1415 + stored++; 1416 + } 1417 + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 1418 + inline_size); 1419 + } 1420 + offset = 0; 1421 + } 1422 + out: 1423 + kfree(dir_buf); 1424 + brelse(iloc.bh); 1425 + return ret; 1426 + } 1427 + 1428 + struct buffer_head *ext4_get_first_inline_block(struct inode *inode, 1429 + struct ext4_dir_entry_2 **parent_de, 1430 + int *retval) 1431 + { 1432 + struct ext4_iloc iloc; 1433 + 1434 + *retval = ext4_get_inode_loc(inode, &iloc); 1435 + if (*retval) 1436 + return NULL; 1437 + 1438 + *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; 1439 + 1440 + return iloc.bh; 1441 + } 1442 + 1443 + /* 1444 + * Try to create the inline data for the new dir. 1445 + * If it succeeds, return 0, otherwise return the error. 1446 + * In case of ENOSPC, the caller should create the normal disk layout dir. 1447 + */ 1448 + int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, 1449 + struct inode *inode) 1450 + { 1451 + int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; 1452 + struct ext4_iloc iloc; 1453 + struct ext4_dir_entry_2 *de; 1454 + 1455 + ret = ext4_get_inode_loc(inode, &iloc); 1456 + if (ret) 1457 + return ret; 1458 + 1459 + ret = ext4_prepare_inline_data(handle, inode, inline_size); 1460 + if (ret) 1461 + goto out; 1462 + 1463 + /* 1464 + * For inline dir, we only save the inode information for the ".." 1465 + * and create a fake dentry to cover the left space. 
1466 + */ 1467 + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; 1468 + de->inode = cpu_to_le32(parent->i_ino); 1469 + de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); 1470 + de->inode = 0; 1471 + de->rec_len = ext4_rec_len_to_disk( 1472 + inline_size - EXT4_INLINE_DOTDOT_SIZE, 1473 + inline_size); 1474 + set_nlink(inode, 2); 1475 + inode->i_size = EXT4_I(inode)->i_disksize = inline_size; 1476 + out: 1477 + brelse(iloc.bh); 1478 + return ret; 1479 + } 1480 + 1481 + struct buffer_head *ext4_find_inline_entry(struct inode *dir, 1482 + const struct qstr *d_name, 1483 + struct ext4_dir_entry_2 **res_dir, 1484 + int *has_inline_data) 1485 + { 1486 + int ret; 1487 + struct ext4_iloc iloc; 1488 + void *inline_start; 1489 + int inline_size; 1490 + 1491 + if (ext4_get_inode_loc(dir, &iloc)) 1492 + return NULL; 1493 + 1494 + down_read(&EXT4_I(dir)->xattr_sem); 1495 + if (!ext4_has_inline_data(dir)) { 1496 + *has_inline_data = 0; 1497 + goto out; 1498 + } 1499 + 1500 + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + 1501 + EXT4_INLINE_DOTDOT_SIZE; 1502 + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; 1503 + ret = search_dir(iloc.bh, inline_start, inline_size, 1504 + dir, d_name, 0, res_dir); 1505 + if (ret == 1) 1506 + goto out_find; 1507 + if (ret < 0) 1508 + goto out; 1509 + 1510 + if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) 1511 + goto out; 1512 + 1513 + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); 1514 + inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; 1515 + 1516 + ret = search_dir(iloc.bh, inline_start, inline_size, 1517 + dir, d_name, 0, res_dir); 1518 + if (ret == 1) 1519 + goto out_find; 1520 + 1521 + out: 1522 + brelse(iloc.bh); 1523 + iloc.bh = NULL; 1524 + out_find: 1525 + up_read(&EXT4_I(dir)->xattr_sem); 1526 + return iloc.bh; 1527 + } 1528 + 1529 + int ext4_delete_inline_entry(handle_t *handle, 1530 + struct inode *dir, 1531 + struct ext4_dir_entry_2 
*de_del, 1532 + struct buffer_head *bh, 1533 + int *has_inline_data) 1534 + { 1535 + int err, inline_size; 1536 + struct ext4_iloc iloc; 1537 + void *inline_start; 1538 + 1539 + err = ext4_get_inode_loc(dir, &iloc); 1540 + if (err) 1541 + return err; 1542 + 1543 + down_write(&EXT4_I(dir)->xattr_sem); 1544 + if (!ext4_has_inline_data(dir)) { 1545 + *has_inline_data = 0; 1546 + goto out; 1547 + } 1548 + 1549 + if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < 1550 + EXT4_MIN_INLINE_DATA_SIZE) { 1551 + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + 1552 + EXT4_INLINE_DOTDOT_SIZE; 1553 + inline_size = EXT4_MIN_INLINE_DATA_SIZE - 1554 + EXT4_INLINE_DOTDOT_SIZE; 1555 + } else { 1556 + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); 1557 + inline_size = ext4_get_inline_size(dir) - 1558 + EXT4_MIN_INLINE_DATA_SIZE; 1559 + } 1560 + 1561 + err = ext4_journal_get_write_access(handle, bh); 1562 + if (err) 1563 + goto out; 1564 + 1565 + err = ext4_generic_delete_entry(handle, dir, de_del, bh, 1566 + inline_start, inline_size, 0); 1567 + if (err) 1568 + goto out; 1569 + 1570 + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1571 + err = ext4_mark_inode_dirty(handle, dir); 1572 + if (unlikely(err)) 1573 + goto out; 1574 + 1575 + ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); 1576 + out: 1577 + up_write(&EXT4_I(dir)->xattr_sem); 1578 + brelse(iloc.bh); 1579 + if (err != -ENOENT) 1580 + ext4_std_error(dir->i_sb, err); 1581 + return err; 1582 + } 1583 + 1584 + /* 1585 + * Get the inline dentry at offset. 
1586 + */ 1587 + static inline struct ext4_dir_entry_2 * 1588 + ext4_get_inline_entry(struct inode *inode, 1589 + struct ext4_iloc *iloc, 1590 + unsigned int offset, 1591 + void **inline_start, 1592 + int *inline_size) 1593 + { 1594 + void *inline_pos; 1595 + 1596 + BUG_ON(offset > ext4_get_inline_size(inode)); 1597 + 1598 + if (offset < EXT4_MIN_INLINE_DATA_SIZE) { 1599 + inline_pos = (void *)ext4_raw_inode(iloc)->i_block; 1600 + *inline_size = EXT4_MIN_INLINE_DATA_SIZE; 1601 + } else { 1602 + inline_pos = ext4_get_inline_xattr_pos(inode, iloc); 1603 + offset -= EXT4_MIN_INLINE_DATA_SIZE; 1604 + *inline_size = ext4_get_inline_size(inode) - 1605 + EXT4_MIN_INLINE_DATA_SIZE; 1606 + } 1607 + 1608 + if (inline_start) 1609 + *inline_start = inline_pos; 1610 + return (struct ext4_dir_entry_2 *)(inline_pos + offset); 1611 + } 1612 + 1613 + int empty_inline_dir(struct inode *dir, int *has_inline_data) 1614 + { 1615 + int err, inline_size; 1616 + struct ext4_iloc iloc; 1617 + void *inline_pos; 1618 + unsigned int offset; 1619 + struct ext4_dir_entry_2 *de; 1620 + int ret = 1; 1621 + 1622 + err = ext4_get_inode_loc(dir, &iloc); 1623 + if (err) { 1624 + EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", 1625 + err, dir->i_ino); 1626 + return 1; 1627 + } 1628 + 1629 + down_read(&EXT4_I(dir)->xattr_sem); 1630 + if (!ext4_has_inline_data(dir)) { 1631 + *has_inline_data = 0; 1632 + goto out; 1633 + } 1634 + 1635 + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; 1636 + if (!le32_to_cpu(de->inode)) { 1637 + ext4_warning(dir->i_sb, 1638 + "bad inline directory (dir #%lu) - no `..'", 1639 + dir->i_ino); 1640 + ret = 1; 1641 + goto out; 1642 + } 1643 + 1644 + offset = EXT4_INLINE_DOTDOT_SIZE; 1645 + while (offset < dir->i_size) { 1646 + de = ext4_get_inline_entry(dir, &iloc, offset, 1647 + &inline_pos, &inline_size); 1648 + if (ext4_check_dir_entry(dir, NULL, de, 1649 + iloc.bh, inline_pos, 1650 + inline_size, offset)) { 1651 + ext4_warning(dir->i_sb, 1652 + 
"bad inline directory (dir #%lu) - " 1653 + "inode %u, rec_len %u, name_len %d" 1654 + "inline size %d\n", 1655 + dir->i_ino, le32_to_cpu(de->inode), 1656 + le16_to_cpu(de->rec_len), de->name_len, 1657 + inline_size); 1658 + ret = 1; 1659 + goto out; 1660 + } 1661 + if (le32_to_cpu(de->inode)) { 1662 + ret = 0; 1663 + goto out; 1664 + } 1665 + offset += ext4_rec_len_from_disk(de->rec_len, inline_size); 1666 + } 1667 + 1668 + out: 1669 + up_read(&EXT4_I(dir)->xattr_sem); 1670 + brelse(iloc.bh); 1671 + return ret; 1672 + } 1673 + 1674 + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) 1675 + { 1676 + int ret; 1677 + 1678 + down_write(&EXT4_I(inode)->xattr_sem); 1679 + ret = ext4_destroy_inline_data_nolock(handle, inode); 1680 + up_write(&EXT4_I(inode)->xattr_sem); 1681 + 1682 + return ret; 1683 + } 1684 + 1685 + int ext4_inline_data_fiemap(struct inode *inode, 1686 + struct fiemap_extent_info *fieinfo, 1687 + int *has_inline) 1688 + { 1689 + __u64 physical = 0; 1690 + __u64 length; 1691 + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; 1692 + int error = 0; 1693 + struct ext4_iloc iloc; 1694 + 1695 + down_read(&EXT4_I(inode)->xattr_sem); 1696 + if (!ext4_has_inline_data(inode)) { 1697 + *has_inline = 0; 1698 + goto out; 1699 + } 1700 + 1701 + error = ext4_get_inode_loc(inode, &iloc); 1702 + if (error) 1703 + goto out; 1704 + 1705 + physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; 1706 + physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; 1707 + physical += offsetof(struct ext4_inode, i_block); 1708 + length = i_size_read(inode); 1709 + 1710 + if (physical) 1711 + error = fiemap_fill_next_extent(fieinfo, 0, physical, 1712 + length, flags); 1713 + brelse(iloc.bh); 1714 + out: 1715 + up_read(&EXT4_I(inode)->xattr_sem); 1716 + return (error < 0 ? 
error : 0); 1717 + } 1718 + 1719 + /* 1720 + * Called during xattr set, and if we can sparse space 'needed', 1721 + * just create the extent tree evict the data to the outer block. 1722 + * 1723 + * We use jbd2 instead of page cache to move data to the 1st block 1724 + * so that the whole transaction can be committed as a whole and 1725 + * the data isn't lost because of the delayed page cache write. 1726 + */ 1727 + int ext4_try_to_evict_inline_data(handle_t *handle, 1728 + struct inode *inode, 1729 + int needed) 1730 + { 1731 + int error; 1732 + struct ext4_xattr_entry *entry; 1733 + struct ext4_xattr_ibody_header *header; 1734 + struct ext4_inode *raw_inode; 1735 + struct ext4_iloc iloc; 1736 + 1737 + error = ext4_get_inode_loc(inode, &iloc); 1738 + if (error) 1739 + return error; 1740 + 1741 + raw_inode = ext4_raw_inode(&iloc); 1742 + header = IHDR(inode, raw_inode); 1743 + entry = (struct ext4_xattr_entry *)((void *)raw_inode + 1744 + EXT4_I(inode)->i_inline_off); 1745 + if (EXT4_XATTR_LEN(entry->e_name_len) + 1746 + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { 1747 + error = -ENOSPC; 1748 + goto out; 1749 + } 1750 + 1751 + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); 1752 + out: 1753 + brelse(iloc.bh); 1754 + return error; 1755 + } 1756 + 1757 + void ext4_inline_data_truncate(struct inode *inode, int *has_inline) 1758 + { 1759 + handle_t *handle; 1760 + int inline_size, value_len, needed_blocks; 1761 + size_t i_size; 1762 + void *value = NULL; 1763 + struct ext4_xattr_ibody_find is = { 1764 + .s = { .not_found = -ENODATA, }, 1765 + }; 1766 + struct ext4_xattr_info i = { 1767 + .name_index = EXT4_XATTR_INDEX_SYSTEM, 1768 + .name = EXT4_XATTR_SYSTEM_DATA, 1769 + }; 1770 + 1771 + 1772 + needed_blocks = ext4_writepage_trans_blocks(inode); 1773 + handle = ext4_journal_start(inode, needed_blocks); 1774 + if (IS_ERR(handle)) 1775 + return; 1776 + 1777 + down_write(&EXT4_I(inode)->xattr_sem); 1778 + if (!ext4_has_inline_data(inode)) 
{ 1779 + *has_inline = 0; 1780 + ext4_journal_stop(handle); 1781 + return; 1782 + } 1783 + 1784 + if (ext4_orphan_add(handle, inode)) 1785 + goto out; 1786 + 1787 + if (ext4_get_inode_loc(inode, &is.iloc)) 1788 + goto out; 1789 + 1790 + down_write(&EXT4_I(inode)->i_data_sem); 1791 + i_size = inode->i_size; 1792 + inline_size = ext4_get_inline_size(inode); 1793 + EXT4_I(inode)->i_disksize = i_size; 1794 + 1795 + if (i_size < inline_size) { 1796 + /* Clear the content in the xattr space. */ 1797 + if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { 1798 + if (ext4_xattr_ibody_find(inode, &i, &is)) 1799 + goto out_error; 1800 + 1801 + BUG_ON(is.s.not_found); 1802 + 1803 + value_len = le32_to_cpu(is.s.here->e_value_size); 1804 + value = kmalloc(value_len, GFP_NOFS); 1805 + if (!value) 1806 + goto out_error; 1807 + 1808 + if (ext4_xattr_ibody_get(inode, i.name_index, i.name, 1809 + value, value_len)) 1810 + goto out_error; 1811 + 1812 + i.value = value; 1813 + i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? 1814 + i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; 1815 + if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) 1816 + goto out_error; 1817 + } 1818 + 1819 + /* Clear the content within i_blocks. */ 1820 + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) 1821 + memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, 1822 + EXT4_MIN_INLINE_DATA_SIZE - i_size); 1823 + 1824 + EXT4_I(inode)->i_inline_size = i_size < 1825 + EXT4_MIN_INLINE_DATA_SIZE ? 
1826 + EXT4_MIN_INLINE_DATA_SIZE : i_size; 1827 + } 1828 + 1829 + out_error: 1830 + up_write(&EXT4_I(inode)->i_data_sem); 1831 + out: 1832 + brelse(is.iloc.bh); 1833 + up_write(&EXT4_I(inode)->xattr_sem); 1834 + kfree(value); 1835 + if (inode->i_nlink) 1836 + ext4_orphan_del(handle, inode); 1837 + 1838 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 1839 + ext4_mark_inode_dirty(handle, inode); 1840 + if (IS_SYNC(inode)) 1841 + ext4_handle_sync(handle); 1842 + 1843 + ext4_journal_stop(handle); 1844 + return; 1845 + } 1846 + 1847 + int ext4_convert_inline_data(struct inode *inode) 1848 + { 1849 + int error, needed_blocks; 1850 + handle_t *handle; 1851 + struct ext4_iloc iloc; 1852 + 1853 + if (!ext4_has_inline_data(inode)) { 1854 + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1855 + return 0; 1856 + } 1857 + 1858 + needed_blocks = ext4_writepage_trans_blocks(inode); 1859 + 1860 + iloc.bh = NULL; 1861 + error = ext4_get_inode_loc(inode, &iloc); 1862 + if (error) 1863 + return error; 1864 + 1865 + handle = ext4_journal_start(inode, needed_blocks); 1866 + if (IS_ERR(handle)) { 1867 + error = PTR_ERR(handle); 1868 + goto out_free; 1869 + } 1870 + 1871 + down_write(&EXT4_I(inode)->xattr_sem); 1872 + if (!ext4_has_inline_data(inode)) { 1873 + up_write(&EXT4_I(inode)->xattr_sem); 1874 + goto out; 1875 + } 1876 + 1877 + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); 1878 + up_write(&EXT4_I(inode)->xattr_sem); 1879 + out: 1880 + ext4_journal_stop(handle); 1881 + out_free: 1882 + brelse(iloc.bh); 1883 + return error; 1884 + }
+362 -277
fs/ext4/inode.c
··· 484 484 } 485 485 486 486 /* 487 - * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. 488 - */ 489 - static void set_buffers_da_mapped(struct inode *inode, 490 - struct ext4_map_blocks *map) 491 - { 492 - struct address_space *mapping = inode->i_mapping; 493 - struct pagevec pvec; 494 - int i, nr_pages; 495 - pgoff_t index, end; 496 - 497 - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 498 - end = (map->m_lblk + map->m_len - 1) >> 499 - (PAGE_CACHE_SHIFT - inode->i_blkbits); 500 - 501 - pagevec_init(&pvec, 0); 502 - while (index <= end) { 503 - nr_pages = pagevec_lookup(&pvec, mapping, index, 504 - min(end - index + 1, 505 - (pgoff_t)PAGEVEC_SIZE)); 506 - if (nr_pages == 0) 507 - break; 508 - for (i = 0; i < nr_pages; i++) { 509 - struct page *page = pvec.pages[i]; 510 - struct buffer_head *bh, *head; 511 - 512 - if (unlikely(page->mapping != mapping) || 513 - !PageDirty(page)) 514 - break; 515 - 516 - if (page_has_buffers(page)) { 517 - bh = head = page_buffers(page); 518 - do { 519 - set_buffer_da_mapped(bh); 520 - bh = bh->b_this_page; 521 - } while (bh != head); 522 - } 523 - index++; 524 - } 525 - pagevec_release(&pvec); 526 - } 527 - } 528 - 529 - /* 530 487 * The ext4_map_blocks() function tries to look up the requested blocks, 531 488 * and returns if the blocks are already mapped. 532 489 * ··· 531 574 up_read((&EXT4_I(inode)->i_data_sem)); 532 575 533 576 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 534 - int ret = check_block_validity(inode, map); 577 + int ret; 578 + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 579 + /* delayed alloc may be allocated by fallocate and 580 + * coverted to initialized by directIO. 581 + * we need to handle delayed extent here. 
582 + */ 583 + down_write((&EXT4_I(inode)->i_data_sem)); 584 + goto delayed_mapped; 585 + } 586 + ret = check_block_validity(inode, map); 535 587 if (ret != 0) 536 588 return ret; 537 589 } ··· 618 652 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 619 653 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 620 654 621 - /* If we have successfully mapped the delayed allocated blocks, 622 - * set the BH_Da_Mapped bit on them. Its important to do this 623 - * under the protection of i_data_sem. 624 - */ 625 - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 626 - set_buffers_da_mapped(inode, map); 655 + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 656 + int ret; 657 + delayed_mapped: 658 + /* delayed allocation blocks has been allocated */ 659 + ret = ext4_es_remove_extent(inode, map->m_lblk, 660 + map->m_len); 661 + if (ret < 0) 662 + retval = ret; 663 + } 627 664 } 628 665 629 666 up_write((&EXT4_I(inode)->i_data_sem)); ··· 649 680 int ret = 0, started = 0; 650 681 int dio_credits; 651 682 683 + if (ext4_has_inline_data(inode)) 684 + return -ERANGE; 685 + 652 686 map.m_lblk = iblock; 653 687 map.m_len = bh->b_size >> inode->i_blkbits; 654 688 655 - if (flags && !handle) { 689 + if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { 656 690 /* Direct IO write... */ 657 691 if (map.m_len > DIO_MAX_BLOCKS) 658 692 map.m_len = DIO_MAX_BLOCKS; ··· 770 798 return NULL; 771 799 } 772 800 773 - static int walk_page_buffers(handle_t *handle, 774 - struct buffer_head *head, 775 - unsigned from, 776 - unsigned to, 777 - int *partial, 778 - int (*fn)(handle_t *handle, 779 - struct buffer_head *bh)) 801 + int ext4_walk_page_buffers(handle_t *handle, 802 + struct buffer_head *head, 803 + unsigned from, 804 + unsigned to, 805 + int *partial, 806 + int (*fn)(handle_t *handle, 807 + struct buffer_head *bh)) 780 808 { 781 809 struct buffer_head *bh; 782 810 unsigned block_start, block_end; ··· 826 854 * is elevated. 
We'll still have enough credits for the tiny quotafile 827 855 * write. 828 856 */ 829 - static int do_journal_get_write_access(handle_t *handle, 830 - struct buffer_head *bh) 857 + int do_journal_get_write_access(handle_t *handle, 858 + struct buffer_head *bh) 831 859 { 832 860 int dirty = buffer_dirty(bh); 833 861 int ret; ··· 850 878 return ret; 851 879 } 852 880 853 - static int ext4_get_block_write(struct inode *inode, sector_t iblock, 881 + static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 854 882 struct buffer_head *bh_result, int create); 855 883 static int ext4_write_begin(struct file *file, struct address_space *mapping, 856 884 loff_t pos, unsigned len, unsigned flags, ··· 874 902 from = pos & (PAGE_CACHE_SIZE - 1); 875 903 to = from + len; 876 904 905 + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 906 + ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, 907 + flags, pagep); 908 + if (ret < 0) 909 + goto out; 910 + if (ret == 1) { 911 + ret = 0; 912 + goto out; 913 + } 914 + } 915 + 877 916 retry: 878 917 handle = ext4_journal_start(inode, needed_blocks); 879 918 if (IS_ERR(handle)) { ··· 902 919 ret = -ENOMEM; 903 920 goto out; 904 921 } 922 + 905 923 *pagep = page; 906 924 907 925 if (ext4_should_dioread_nolock(inode)) ··· 911 927 ret = __block_write_begin(page, pos, len, ext4_get_block); 912 928 913 929 if (!ret && ext4_should_journal_data(inode)) { 914 - ret = walk_page_buffers(handle, page_buffers(page), 915 - from, to, NULL, do_journal_get_write_access); 930 + ret = ext4_walk_page_buffers(handle, page_buffers(page), 931 + from, to, NULL, 932 + do_journal_get_write_access); 916 933 } 917 934 918 935 if (ret) { ··· 968 983 struct inode *inode = mapping->host; 969 984 handle_t *handle = ext4_journal_current_handle(); 970 985 971 - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 986 + if (ext4_has_inline_data(inode)) 987 + copied = ext4_write_inline_data_end(inode, pos, 
len, 988 + copied, page); 989 + else 990 + copied = block_write_end(file, mapping, pos, 991 + len, copied, page, fsdata); 972 992 973 993 /* 974 994 * No need to use i_size_read() here, the i_size ··· 1124 1134 1125 1135 BUG_ON(!ext4_handle_valid(handle)); 1126 1136 1127 - if (copied < len) { 1128 - if (!PageUptodate(page)) 1129 - copied = 0; 1130 - page_zero_new_buffers(page, from+copied, to); 1131 - } 1137 + if (ext4_has_inline_data(inode)) 1138 + copied = ext4_write_inline_data_end(inode, pos, len, 1139 + copied, page); 1140 + else { 1141 + if (copied < len) { 1142 + if (!PageUptodate(page)) 1143 + copied = 0; 1144 + page_zero_new_buffers(page, from+copied, to); 1145 + } 1132 1146 1133 - ret = walk_page_buffers(handle, page_buffers(page), from, 1134 - to, &partial, write_end_fn); 1135 - if (!partial) 1136 - SetPageUptodate(page); 1147 + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, 1148 + to, &partial, write_end_fn); 1149 + if (!partial) 1150 + SetPageUptodate(page); 1151 + } 1137 1152 new_i_size = pos + copied; 1138 1153 if (new_i_size > inode->i_size) 1139 1154 i_size_write(inode, pos+copied); ··· 1296 1301 struct inode *inode = page->mapping->host; 1297 1302 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1298 1303 int num_clusters; 1304 + ext4_fsblk_t lblk; 1299 1305 1300 1306 head = page_buffers(page); 1301 1307 bh = head; ··· 1306 1310 if ((offset <= curr_off) && (buffer_delay(bh))) { 1307 1311 to_release++; 1308 1312 clear_buffer_delay(bh); 1309 - clear_buffer_da_mapped(bh); 1310 1313 } 1311 1314 curr_off = next_off; 1312 1315 } while ((bh = bh->b_this_page) != head); 1316 + 1317 + if (to_release) { 1318 + lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1319 + ext4_es_remove_extent(inode, lblk, to_release); 1320 + } 1313 1321 1314 1322 /* If we have released all the blocks belonging to a cluster, then we 1315 1323 * need to release the reserved space for that cluster. 
*/ 1316 1324 num_clusters = EXT4_NUM_B2C(sbi, to_release); 1317 1325 while (num_clusters > 0) { 1318 - ext4_fsblk_t lblk; 1319 1326 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + 1320 1327 ((num_clusters - 1) << sbi->s_cluster_bits); 1321 1328 if (sbi->s_cluster_ratio == 1 || 1322 - !ext4_find_delalloc_cluster(inode, lblk, 1)) 1329 + !ext4_find_delalloc_cluster(inode, lblk)) 1323 1330 ext4_da_release_space(inode, 1); 1324 1331 1325 1332 num_clusters--; ··· 1428 1429 clear_buffer_delay(bh); 1429 1430 bh->b_blocknr = pblock; 1430 1431 } 1431 - if (buffer_da_mapped(bh)) 1432 - clear_buffer_da_mapped(bh); 1433 1432 if (buffer_unwritten(bh) || 1434 1433 buffer_mapped(bh)) 1435 1434 BUG_ON(bh->b_blocknr != pblock); ··· 1497 1500 struct pagevec pvec; 1498 1501 struct inode *inode = mpd->inode; 1499 1502 struct address_space *mapping = inode->i_mapping; 1503 + ext4_lblk_t start, last; 1500 1504 1501 1505 index = mpd->first_page; 1502 1506 end = mpd->next_page - 1; 1507 + 1508 + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1509 + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1510 + ext4_es_remove_extent(inode, start, last - start + 1); 1511 + 1512 + pagevec_init(&pvec, 0); 1503 1513 while (index <= end) { 1504 1514 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1505 1515 if (nr_pages == 0) ··· 1660 1656 1661 1657 for (i = 0; i < map.m_len; i++) 1662 1658 unmap_underlying_metadata(bdev, map.m_pblk + i); 1663 - 1664 - if (ext4_should_order_data(mpd->inode)) { 1665 - err = ext4_jbd2_file_inode(handle, mpd->inode); 1666 - if (err) { 1667 - /* Only if the journal is aborted */ 1668 - mpd->retval = err; 1669 - goto submit_io; 1670 - } 1671 - } 1672 1659 } 1673 1660 1674 1661 /* ··· 1790 1795 * file system block. 
1791 1796 */ 1792 1797 down_read((&EXT4_I(inode)->i_data_sem)); 1793 - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1798 + if (ext4_has_inline_data(inode)) { 1799 + /* 1800 + * We will soon create blocks for this page, and let 1801 + * us pretend as if the blocks aren't allocated yet. 1802 + * In case of clusters, we have to handle the work 1803 + * of mapping from cluster so that the reserved space 1804 + * is calculated properly. 1805 + */ 1806 + if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && 1807 + ext4_find_delalloc_cluster(inode, map->m_lblk)) 1808 + map->m_flags |= EXT4_MAP_FROM_CLUSTER; 1809 + retval = 0; 1810 + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1794 1811 retval = ext4_ext_map_blocks(NULL, inode, map, 0); 1795 1812 else 1796 1813 retval = ext4_ind_map_blocks(NULL, inode, map, 0); ··· 1820 1813 /* not enough space to reserve */ 1821 1814 goto out_unlock; 1822 1815 } 1816 + 1817 + retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); 1818 + if (retval) 1819 + goto out_unlock; 1823 1820 1824 1821 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served 1825 1822 * and it should not appear on the bh->b_state. ··· 1853 1842 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev 1854 1843 * initialized properly. 
1855 1844 */ 1856 - static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 1857 - struct buffer_head *bh, int create) 1845 + int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 1846 + struct buffer_head *bh, int create) 1858 1847 { 1859 1848 struct ext4_map_blocks map; 1860 1849 int ret = 0; ··· 1928 1917 { 1929 1918 struct address_space *mapping = page->mapping; 1930 1919 struct inode *inode = mapping->host; 1931 - struct buffer_head *page_bufs; 1920 + struct buffer_head *page_bufs = NULL; 1932 1921 handle_t *handle = NULL; 1933 - int ret = 0; 1934 - int err; 1922 + int ret = 0, err = 0; 1923 + int inline_data = ext4_has_inline_data(inode); 1924 + struct buffer_head *inode_bh = NULL; 1935 1925 1936 1926 ClearPageChecked(page); 1937 - page_bufs = page_buffers(page); 1938 - BUG_ON(!page_bufs); 1939 - walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 1927 + 1928 + if (inline_data) { 1929 + BUG_ON(page->index != 0); 1930 + BUG_ON(len > ext4_get_max_inline_size(inode)); 1931 + inode_bh = ext4_journalled_write_inline_data(inode, len, page); 1932 + if (inode_bh == NULL) 1933 + goto out; 1934 + } else { 1935 + page_bufs = page_buffers(page); 1936 + if (!page_bufs) { 1937 + BUG(); 1938 + goto out; 1939 + } 1940 + ext4_walk_page_buffers(handle, page_bufs, 0, len, 1941 + NULL, bget_one); 1942 + } 1940 1943 /* As soon as we unlock the page, it can go away, but we have 1941 1944 * references to buffers so we are safe */ 1942 1945 unlock_page(page); ··· 1963 1938 1964 1939 BUG_ON(!ext4_handle_valid(handle)); 1965 1940 1966 - ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1967 - do_journal_get_write_access); 1941 + if (inline_data) { 1942 + ret = ext4_journal_get_write_access(handle, inode_bh); 1968 1943 1969 - err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1970 - write_end_fn); 1944 + err = ext4_handle_dirty_metadata(handle, inode, inode_bh); 1945 + 1946 + } else { 1947 + ret = ext4_walk_page_buffers(handle, 
page_bufs, 0, len, NULL, 1948 + do_journal_get_write_access); 1949 + 1950 + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, 1951 + write_end_fn); 1952 + } 1971 1953 if (ret == 0) 1972 1954 ret = err; 1973 1955 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; ··· 1982 1950 if (!ret) 1983 1951 ret = err; 1984 1952 1985 - walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 1953 + if (!ext4_has_inline_data(inode)) 1954 + ext4_walk_page_buffers(handle, page_bufs, 0, len, 1955 + NULL, bput_one); 1986 1956 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1987 1957 out: 1958 + brelse(inode_bh); 1988 1959 return ret; 1989 1960 } 1990 1961 ··· 2064 2029 commit_write = 1; 2065 2030 } 2066 2031 page_bufs = page_buffers(page); 2067 - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2068 - ext4_bh_delay_or_unwritten)) { 2032 + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2033 + ext4_bh_delay_or_unwritten)) { 2069 2034 /* 2070 2035 * We don't want to do block allocation, so redirty 2071 2036 * the page and return. We may reach here when we do ··· 2131 2096 * mpage_da_map_and_submit to map a single contiguous memory region 2132 2097 * and then write them. 2133 2098 */ 2134 - static int write_cache_pages_da(struct address_space *mapping, 2099 + static int write_cache_pages_da(handle_t *handle, 2100 + struct address_space *mapping, 2135 2101 struct writeback_control *wbc, 2136 2102 struct mpage_da_data *mpd, 2137 2103 pgoff_t *done_index) ··· 2210 2174 2211 2175 wait_on_page_writeback(page); 2212 2176 BUG_ON(PageWriteback(page)); 2177 + 2178 + /* 2179 + * If we have inline data and arrive here, it means that 2180 + * we will soon create the block for the 1st page, so 2181 + * we'd better clear the inline data here. 
2182 + */ 2183 + if (ext4_has_inline_data(inode)) { 2184 + BUG_ON(ext4_test_inode_state(inode, 2185 + EXT4_STATE_MAY_INLINE_DATA)); 2186 + ext4_destroy_inline_data(handle, inode); 2187 + } 2213 2188 2214 2189 if (mpd->next_page != page->index) 2215 2190 mpd->first_page = page->index; ··· 2428 2381 * contiguous region of logical blocks that need 2429 2382 * blocks to be allocated by ext4 and submit them. 2430 2383 */ 2431 - ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 2384 + ret = write_cache_pages_da(handle, mapping, 2385 + wbc, &mpd, &done_index); 2432 2386 /* 2433 2387 * If we have a contiguous extent of pages and we 2434 2388 * haven't done the I/O yet, map the blocks and submit ··· 2493 2445 return ret; 2494 2446 } 2495 2447 2496 - #define FALL_BACK_TO_NONDELALLOC 1 2497 2448 static int ext4_nonda_switch(struct super_block *sb) 2498 2449 { 2499 2450 s64 free_blocks, dirty_blocks; ··· 2549 2502 } 2550 2503 *fsdata = (void *)0; 2551 2504 trace_ext4_da_write_begin(inode, pos, len, flags); 2505 + 2506 + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 2507 + ret = ext4_da_write_inline_data_begin(mapping, inode, 2508 + pos, len, flags, 2509 + pagep, fsdata); 2510 + if (ret < 0) 2511 + goto out; 2512 + if (ret == 1) { 2513 + ret = 0; 2514 + goto out; 2515 + } 2516 + } 2517 + 2552 2518 retry: 2553 2519 /* 2554 2520 * With delayed allocation, we don't log the i_disksize update ··· 2663 2603 * changes. So let's piggyback the i_disksize mark_inode_dirty 2664 2604 * into that. 
2665 2605 */ 2666 - 2667 2606 new_i_size = pos + copied; 2668 2607 if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 2669 - if (ext4_da_should_update_i_disksize(page, end)) { 2608 + if (ext4_has_inline_data(inode) || 2609 + ext4_da_should_update_i_disksize(page, end)) { 2670 2610 down_write(&EXT4_I(inode)->i_data_sem); 2671 - if (new_i_size > EXT4_I(inode)->i_disksize) { 2672 - /* 2673 - * Updating i_disksize when extending file 2674 - * without needing block allocation 2675 - */ 2676 - if (ext4_should_order_data(inode)) 2677 - ret = ext4_jbd2_file_inode(handle, 2678 - inode); 2679 - 2611 + if (new_i_size > EXT4_I(inode)->i_disksize) 2680 2612 EXT4_I(inode)->i_disksize = new_i_size; 2681 - } 2682 2613 up_write(&EXT4_I(inode)->i_data_sem); 2683 2614 /* We need to mark inode dirty even if 2684 2615 * new_i_size is less that inode->i_size ··· 2678 2627 ext4_mark_inode_dirty(handle, inode); 2679 2628 } 2680 2629 } 2681 - ret2 = generic_write_end(file, mapping, pos, len, copied, 2630 + 2631 + if (write_mode != CONVERT_INLINE_DATA && 2632 + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && 2633 + ext4_has_inline_data(inode)) 2634 + ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, 2635 + page); 2636 + else 2637 + ret2 = generic_write_end(file, mapping, pos, len, copied, 2682 2638 page, fsdata); 2639 + 2683 2640 copied = ret2; 2684 2641 if (ret2 < 0) 2685 2642 ret = ret2; ··· 2780 2721 journal_t *journal; 2781 2722 int err; 2782 2723 2724 + /* 2725 + * We can get here for an inline file via the FIBMAP ioctl 2726 + */ 2727 + if (ext4_has_inline_data(inode)) 2728 + return 0; 2729 + 2783 2730 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 2784 2731 test_opt(inode->i_sb, DELALLOC)) { 2785 2732 /* ··· 2831 2766 2832 2767 static int ext4_readpage(struct file *file, struct page *page) 2833 2768 { 2769 + int ret = -EAGAIN; 2770 + struct inode *inode = page->mapping->host; 2771 + 2834 2772 trace_ext4_readpage(page); 2835 - return 
mpage_readpage(page, ext4_get_block); 2773 + 2774 + if (ext4_has_inline_data(inode)) 2775 + ret = ext4_readpage_inline(inode, page); 2776 + 2777 + if (ret == -EAGAIN) 2778 + return mpage_readpage(page, ext4_get_block); 2779 + 2780 + return ret; 2836 2781 } 2837 2782 2838 2783 static int 2839 2784 ext4_readpages(struct file *file, struct address_space *mapping, 2840 2785 struct list_head *pages, unsigned nr_pages) 2841 2786 { 2787 + struct inode *inode = mapping->host; 2788 + 2789 + /* If the file has inline data, no need to do readpages. */ 2790 + if (ext4_has_inline_data(inode)) 2791 + return 0; 2792 + 2842 2793 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2843 2794 } 2844 2795 ··· 2921 2840 * We allocate an uinitialized extent if blocks haven't been allocated. 2922 2841 * The extent will be converted to initialized after the IO is complete. 2923 2842 */ 2924 - static int ext4_get_block_write(struct inode *inode, sector_t iblock, 2843 + int ext4_get_block_write(struct inode *inode, sector_t iblock, 2925 2844 struct buffer_head *bh_result, int create) 2926 2845 { 2927 2846 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", ··· 2931 2850 } 2932 2851 2933 2852 static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 2934 - struct buffer_head *bh_result, int flags) 2853 + struct buffer_head *bh_result, int create) 2935 2854 { 2936 - handle_t *handle = ext4_journal_current_handle(); 2937 - struct ext4_map_blocks map; 2938 - int ret = 0; 2939 - 2940 - ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n", 2941 - inode->i_ino, flags); 2942 - 2943 - flags = EXT4_GET_BLOCKS_NO_LOCK; 2944 - 2945 - map.m_lblk = iblock; 2946 - map.m_len = bh_result->b_size >> inode->i_blkbits; 2947 - 2948 - ret = ext4_map_blocks(handle, inode, &map, flags); 2949 - if (ret > 0) { 2950 - map_bh(bh_result, inode->i_sb, map.m_pblk); 2951 - bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | 2952 - map.m_flags; 2953 - 
bh_result->b_size = inode->i_sb->s_blocksize * map.m_len; 2954 - ret = 0; 2955 - } 2956 - return ret; 2855 + ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", 2856 + inode->i_ino, create); 2857 + return _ext4_get_block(inode, iblock, bh_result, 2858 + EXT4_GET_BLOCKS_NO_LOCK); 2957 2859 } 2958 2860 2959 2861 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ··· 3042 2978 * fall back to buffered IO. 3043 2979 * 3044 2980 * For holes, we fallocate those blocks, mark them as uninitialized 3045 - * If those blocks were preallocated, we mark sure they are splited, but 2981 + * If those blocks were preallocated, we mark sure they are split, but 3046 2982 * still keep the range to write as uninitialized. 3047 2983 * 3048 - * The unwrritten extents will be converted to written when DIO is completed. 2984 + * The unwritten extents will be converted to written when DIO is completed. 3049 2985 * For async direct IO, since the IO may still pending when return, we 3050 2986 * set up an end_io call back function, which will do the conversion 3051 2987 * when async direct IO completed. ··· 3063 2999 struct inode *inode = file->f_mapping->host; 3064 3000 ssize_t ret; 3065 3001 size_t count = iov_length(iov, nr_segs); 3066 - 3002 + int overwrite = 0; 3003 + get_block_t *get_block_func = NULL; 3004 + int dio_flags = 0; 3067 3005 loff_t final_size = offset + count; 3068 - if (rw == WRITE && final_size <= inode->i_size) { 3069 - int overwrite = 0; 3070 3006 3071 - BUG_ON(iocb->private == NULL); 3007 + /* Use the old path for reads and writes beyond i_size. 
*/ 3008 + if (rw != WRITE || final_size > inode->i_size) 3009 + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3072 3010 3073 - /* If we do a overwrite dio, i_mutex locking can be released */ 3074 - overwrite = *((int *)iocb->private); 3011 + BUG_ON(iocb->private == NULL); 3075 3012 3076 - if (overwrite) { 3077 - atomic_inc(&inode->i_dio_count); 3078 - down_read(&EXT4_I(inode)->i_data_sem); 3079 - mutex_unlock(&inode->i_mutex); 3080 - } 3013 + /* If we do a overwrite dio, i_mutex locking can be released */ 3014 + overwrite = *((int *)iocb->private); 3081 3015 3082 - /* 3083 - * We could direct write to holes and fallocate. 3084 - * 3085 - * Allocated blocks to fill the hole are marked as uninitialized 3086 - * to prevent parallel buffered read to expose the stale data 3087 - * before DIO complete the data IO. 3088 - * 3089 - * As to previously fallocated extents, ext4 get_block 3090 - * will just simply mark the buffer mapped but still 3091 - * keep the extents uninitialized. 3092 - * 3093 - * for non AIO case, we will convert those unwritten extents 3094 - * to written after return back from blockdev_direct_IO. 3095 - * 3096 - * for async DIO, the conversion needs to be defered when 3097 - * the IO is completed. The ext4 end_io callback function 3098 - * will be called to take care of the conversion work. 3099 - * Here for async case, we allocate an io_end structure to 3100 - * hook to the iocb. 
3101 - */ 3102 - iocb->private = NULL; 3103 - ext4_inode_aio_set(inode, NULL); 3104 - if (!is_sync_kiocb(iocb)) { 3105 - ext4_io_end_t *io_end = 3106 - ext4_init_io_end(inode, GFP_NOFS); 3107 - if (!io_end) { 3108 - ret = -ENOMEM; 3109 - goto retake_lock; 3110 - } 3111 - io_end->flag |= EXT4_IO_END_DIRECT; 3112 - iocb->private = io_end; 3113 - /* 3114 - * we save the io structure for current async 3115 - * direct IO, so that later ext4_map_blocks() 3116 - * could flag the io structure whether there 3117 - * is a unwritten extents needs to be converted 3118 - * when IO is completed. 3119 - */ 3120 - ext4_inode_aio_set(inode, io_end); 3121 - } 3122 - 3123 - if (overwrite) 3124 - ret = __blockdev_direct_IO(rw, iocb, inode, 3125 - inode->i_sb->s_bdev, iov, 3126 - offset, nr_segs, 3127 - ext4_get_block_write_nolock, 3128 - ext4_end_io_dio, 3129 - NULL, 3130 - 0); 3131 - else 3132 - ret = __blockdev_direct_IO(rw, iocb, inode, 3133 - inode->i_sb->s_bdev, iov, 3134 - offset, nr_segs, 3135 - ext4_get_block_write, 3136 - ext4_end_io_dio, 3137 - NULL, 3138 - DIO_LOCKING); 3139 - if (iocb->private) 3140 - ext4_inode_aio_set(inode, NULL); 3141 - /* 3142 - * The io_end structure takes a reference to the inode, 3143 - * that structure needs to be destroyed and the 3144 - * reference to the inode need to be dropped, when IO is 3145 - * complete, even with 0 byte write, or failed. 3146 - * 3147 - * In the successful AIO DIO case, the io_end structure will be 3148 - * desctroyed and the reference to the inode will be dropped 3149 - * after the end_io call back function is called. 3150 - * 3151 - * In the case there is 0 byte write, or error case, since 3152 - * VFS direct IO won't invoke the end_io call back function, 3153 - * we need to free the end_io structure here. 
3154 - */ 3155 - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3156 - ext4_free_io_end(iocb->private); 3157 - iocb->private = NULL; 3158 - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3159 - EXT4_STATE_DIO_UNWRITTEN)) { 3160 - int err; 3161 - /* 3162 - * for non AIO case, since the IO is already 3163 - * completed, we could do the conversion right here 3164 - */ 3165 - err = ext4_convert_unwritten_extents(inode, 3166 - offset, ret); 3167 - if (err < 0) 3168 - ret = err; 3169 - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3170 - } 3171 - 3172 - retake_lock: 3173 - /* take i_mutex locking again if we do a ovewrite dio */ 3174 - if (overwrite) { 3175 - inode_dio_done(inode); 3176 - up_read(&EXT4_I(inode)->i_data_sem); 3177 - mutex_lock(&inode->i_mutex); 3178 - } 3179 - 3180 - return ret; 3016 + if (overwrite) { 3017 + atomic_inc(&inode->i_dio_count); 3018 + down_read(&EXT4_I(inode)->i_data_sem); 3019 + mutex_unlock(&inode->i_mutex); 3181 3020 } 3182 3021 3183 - /* for write the the end of file case, we fall back to old way */ 3184 - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3022 + /* 3023 + * We could direct write to holes and fallocate. 3024 + * 3025 + * Allocated blocks to fill the hole are marked as 3026 + * uninitialized to prevent parallel buffered read to expose 3027 + * the stale data before DIO complete the data IO. 3028 + * 3029 + * As to previously fallocated extents, ext4 get_block will 3030 + * just simply mark the buffer mapped but still keep the 3031 + * extents uninitialized. 3032 + * 3033 + * For non AIO case, we will convert those unwritten extents 3034 + * to written after return back from blockdev_direct_IO. 3035 + * 3036 + * For async DIO, the conversion needs to be deferred when the 3037 + * IO is completed. The ext4 end_io callback function will be 3038 + * called to take care of the conversion work. Here for async 3039 + * case, we allocate an io_end structure to hook to the iocb. 
3040 + */ 3041 + iocb->private = NULL; 3042 + ext4_inode_aio_set(inode, NULL); 3043 + if (!is_sync_kiocb(iocb)) { 3044 + ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3045 + if (!io_end) { 3046 + ret = -ENOMEM; 3047 + goto retake_lock; 3048 + } 3049 + io_end->flag |= EXT4_IO_END_DIRECT; 3050 + iocb->private = io_end; 3051 + /* 3052 + * we save the io structure for current async direct 3053 + * IO, so that later ext4_map_blocks() could flag the 3054 + * io structure whether there is a unwritten extents 3055 + * needs to be converted when IO is completed. 3056 + */ 3057 + ext4_inode_aio_set(inode, io_end); 3058 + } 3059 + 3060 + if (overwrite) { 3061 + get_block_func = ext4_get_block_write_nolock; 3062 + } else { 3063 + get_block_func = ext4_get_block_write; 3064 + dio_flags = DIO_LOCKING; 3065 + } 3066 + ret = __blockdev_direct_IO(rw, iocb, inode, 3067 + inode->i_sb->s_bdev, iov, 3068 + offset, nr_segs, 3069 + get_block_func, 3070 + ext4_end_io_dio, 3071 + NULL, 3072 + dio_flags); 3073 + 3074 + if (iocb->private) 3075 + ext4_inode_aio_set(inode, NULL); 3076 + /* 3077 + * The io_end structure takes a reference to the inode, that 3078 + * structure needs to be destroyed and the reference to the 3079 + * inode need to be dropped, when IO is complete, even with 0 3080 + * byte write, or failed. 3081 + * 3082 + * In the successful AIO DIO case, the io_end structure will 3083 + * be destroyed and the reference to the inode will be dropped 3084 + * after the end_io call back function is called. 3085 + * 3086 + * In the case there is 0 byte write, or error case, since VFS 3087 + * direct IO won't invoke the end_io call back function, we 3088 + * need to free the end_io structure here. 
3089 + */ 3090 + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3091 + ext4_free_io_end(iocb->private); 3092 + iocb->private = NULL; 3093 + } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3094 + EXT4_STATE_DIO_UNWRITTEN)) { 3095 + int err; 3096 + /* 3097 + * for non AIO case, since the IO is already 3098 + * completed, we could do the conversion right here 3099 + */ 3100 + err = ext4_convert_unwritten_extents(inode, 3101 + offset, ret); 3102 + if (err < 0) 3103 + ret = err; 3104 + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3105 + } 3106 + 3107 + retake_lock: 3108 + /* take i_mutex locking again if we do a ovewrite dio */ 3109 + if (overwrite) { 3110 + inode_dio_done(inode); 3111 + up_read(&EXT4_I(inode)->i_data_sem); 3112 + mutex_lock(&inode->i_mutex); 3113 + } 3114 + 3115 + return ret; 3185 3116 } 3186 3117 3187 3118 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, ··· 3191 3132 * If we are doing data journalling we don't support O_DIRECT 3192 3133 */ 3193 3134 if (ext4_should_journal_data(inode)) 3135 + return 0; 3136 + 3137 + /* Let buffer I/O handle the inline data case. 
*/ 3138 + if (ext4_has_inline_data(inode)) 3194 3139 return 0; 3195 3140 3196 3141 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); ··· 3594 3531 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3595 3532 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 3596 3533 3534 + if (ext4_has_inline_data(inode)) { 3535 + int has_inline = 1; 3536 + 3537 + ext4_inline_data_truncate(inode, &has_inline); 3538 + if (has_inline) 3539 + return; 3540 + } 3541 + 3597 3542 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3598 3543 ext4_ext_truncate(inode); 3599 3544 else ··· 3827 3756 } 3828 3757 } 3829 3758 3759 + static inline void ext4_iget_extra_inode(struct inode *inode, 3760 + struct ext4_inode *raw_inode, 3761 + struct ext4_inode_info *ei) 3762 + { 3763 + __le32 *magic = (void *)raw_inode + 3764 + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; 3765 + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { 3766 + ext4_set_inode_state(inode, EXT4_STATE_XATTR); 3767 + ext4_find_inline_data_nolock(inode); 3768 + } else 3769 + EXT4_I(inode)->i_inline_off = 0; 3770 + } 3771 + 3830 3772 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 3831 3773 { 3832 3774 struct ext4_iloc iloc; ··· 3910 3826 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 3911 3827 3912 3828 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 3829 + ei->i_inline_off = 0; 3913 3830 ei->i_dir_start_lookup = 0; 3914 3831 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 3915 3832 /* We now have enough fields to check if the inode was active or not. 
··· 3983 3898 ei->i_extra_isize = sizeof(struct ext4_inode) - 3984 3899 EXT4_GOOD_OLD_INODE_SIZE; 3985 3900 } else { 3986 - __le32 *magic = (void *)raw_inode + 3987 - EXT4_GOOD_OLD_INODE_SIZE + 3988 - ei->i_extra_isize; 3989 - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 3990 - ext4_set_inode_state(inode, EXT4_STATE_XATTR); 3901 + ext4_iget_extra_inode(inode, raw_inode, ei); 3991 3902 } 3992 3903 } 3993 3904 ··· 4006 3925 ei->i_file_acl); 4007 3926 ret = -EIO; 4008 3927 goto bad_inode; 4009 - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4010 - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4011 - (S_ISLNK(inode->i_mode) && 4012 - !ext4_inode_is_fast_symlink(inode))) 4013 - /* Validate extent which is part of inode */ 4014 - ret = ext4_ext_check_inode(inode); 4015 - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4016 - (S_ISLNK(inode->i_mode) && 4017 - !ext4_inode_is_fast_symlink(inode))) { 4018 - /* Validate block references which are part of inode */ 4019 - ret = ext4_ind_check_inode(inode); 3928 + } else if (!ext4_has_inline_data(inode)) { 3929 + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 3930 + if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 3931 + (S_ISLNK(inode->i_mode) && 3932 + !ext4_inode_is_fast_symlink(inode)))) 3933 + /* Validate extent which is part of inode */ 3934 + ret = ext4_ext_check_inode(inode); 3935 + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 3936 + (S_ISLNK(inode->i_mode) && 3937 + !ext4_inode_is_fast_symlink(inode))) { 3938 + /* Validate block references which are part of inode */ 3939 + ret = ext4_ind_check_inode(inode); 3940 + } 4020 3941 } 4021 3942 if (ret) 4022 3943 goto bad_inode; ··· 4205 4122 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4206 4123 raw_inode->i_block[2] = 0; 4207 4124 } 4208 - } else 4125 + } else if (!ext4_has_inline_data(inode)) { 4209 4126 for (block = 0; block < EXT4_N_BLOCKS; block++) 4210 4127 raw_inode->i_block[block] = 
ei->i_data[block]; 4128 + } 4211 4129 4212 4130 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4213 4131 if (ei->i_extra_isize) { ··· 4895 4811 * journal_start/journal_stop which can block and take a long time 4896 4812 */ 4897 4813 if (page_has_buffers(page)) { 4898 - if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 4899 - ext4_bh_unmapped)) { 4814 + if (!ext4_walk_page_buffers(NULL, page_buffers(page), 4815 + 0, len, NULL, 4816 + ext4_bh_unmapped)) { 4900 4817 /* Wait so that we don't change page under IO */ 4901 4818 wait_on_page_writeback(page); 4902 4819 ret = VM_FAULT_LOCKED; ··· 4918 4833 } 4919 4834 ret = __block_page_mkwrite(vma, vmf, get_block); 4920 4835 if (!ret && ext4_should_journal_data(inode)) { 4921 - if (walk_page_buffers(handle, page_buffers(page), 0, 4836 + if (ext4_walk_page_buffers(handle, page_buffers(page), 0, 4922 4837 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 4923 4838 unlock_page(page); 4924 4839 ret = VM_FAULT_SIGBUS;
+43 -17
fs/ext4/mballoc.c
··· 1373 1373 ex->fe_start += next; 1374 1374 1375 1375 while (needed > ex->fe_len && 1376 - (buddy = mb_find_buddy(e4b, order, &max))) { 1376 + mb_find_buddy(e4b, order, &max)) { 1377 1377 1378 1378 if (block + 1 >= max) 1379 1379 break; ··· 2607 2607 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2608 2608 entry->efd_count, entry->efd_group, entry); 2609 2609 2610 - if (test_opt(sb, DISCARD)) 2611 - ext4_issue_discard(sb, entry->efd_group, 2612 - entry->efd_start_cluster, entry->efd_count); 2610 + if (test_opt(sb, DISCARD)) { 2611 + err = ext4_issue_discard(sb, entry->efd_group, 2612 + entry->efd_start_cluster, 2613 + entry->efd_count); 2614 + if (err && err != -EOPNOTSUPP) 2615 + ext4_msg(sb, KERN_WARNING, "discard request in" 2616 + " group:%d block:%d count:%d failed" 2617 + " with %d", entry->efd_group, 2618 + entry->efd_start_cluster, 2619 + entry->efd_count, err); 2620 + } 2613 2621 2614 2622 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); 2615 2623 /* we expect to find existing buddy because it's pinned */ ··· 4318 4310 repeat: 4319 4311 /* allocate space in core */ 4320 4312 *errp = ext4_mb_regular_allocator(ac); 4321 - if (*errp) 4313 + if (*errp) { 4314 + ext4_discard_allocated_blocks(ac); 4322 4315 goto errout; 4316 + } 4323 4317 4324 4318 /* as we've just preallocated more space than 4325 4319 * user requested orinally, we store allocated ··· 4343 4333 ac->ac_b_ex.fe_len = 0; 4344 4334 ac->ac_status = AC_STATUS_CONTINUE; 4345 4335 goto repeat; 4346 - } else if (*errp) 4347 - errout: 4336 + } else if (*errp) { 4348 4337 ext4_discard_allocated_blocks(ac); 4349 - else { 4338 + goto errout; 4339 + } else { 4350 4340 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4351 4341 ar->len = ac->ac_b_ex.fe_len; 4352 4342 } ··· 4357 4347 *errp = -ENOSPC; 4358 4348 } 4359 4349 4350 + errout: 4360 4351 if (*errp) { 4361 4352 ac->ac_b_ex.fe_len = 0; 4362 4353 ar->len = 0; ··· 4667 4656 * with group lock held. 
generate_buddy look at 4668 4657 * them with group lock_held 4669 4658 */ 4670 - if (test_opt(sb, DISCARD)) 4671 - ext4_issue_discard(sb, block_group, bit, count); 4659 + if (test_opt(sb, DISCARD)) { 4660 + err = ext4_issue_discard(sb, block_group, bit, count); 4661 + if (err && err != -EOPNOTSUPP) 4662 + ext4_msg(sb, KERN_WARNING, "discard request in" 4663 + " group:%d block:%d count:%lu failed" 4664 + " with %d", block_group, bit, count, 4665 + err); 4666 + } 4667 + 4668 + 4672 4669 ext4_lock_group(sb, block_group); 4673 4670 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4674 4671 mb_free_blocks(inode, &e4b, bit, count_clusters); ··· 4870 4851 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4871 4852 * be called with under the group lock. 4872 4853 */ 4873 - static void ext4_trim_extent(struct super_block *sb, int start, int count, 4854 + static int ext4_trim_extent(struct super_block *sb, int start, int count, 4874 4855 ext4_group_t group, struct ext4_buddy *e4b) 4875 4856 { 4876 4857 struct ext4_free_extent ex; 4858 + int ret = 0; 4877 4859 4878 4860 trace_ext4_trim_extent(sb, group, start, count); 4879 4861 ··· 4890 4870 */ 4891 4871 mb_mark_used(e4b, &ex); 4892 4872 ext4_unlock_group(sb, group); 4893 - ext4_issue_discard(sb, group, start, count); 4873 + ret = ext4_issue_discard(sb, group, start, count); 4894 4874 ext4_lock_group(sb, group); 4895 4875 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4876 + return ret; 4896 4877 } 4897 4878 4898 4879 /** ··· 4922 4901 void *bitmap; 4923 4902 ext4_grpblk_t next, count = 0, free_count = 0; 4924 4903 struct ext4_buddy e4b; 4925 - int ret; 4904 + int ret = 0; 4926 4905 4927 4906 trace_ext4_trim_all_free(sb, group, start, max); 4928 4907 ··· 4949 4928 next = mb_find_next_bit(bitmap, max + 1, start); 4950 4929 4951 4930 if ((next - start) >= minblocks) { 4952 - ext4_trim_extent(sb, start, 4953 - next - start, group, &e4b); 4931 + ret = ext4_trim_extent(sb, start, 4932 + next - start, 
group, &e4b); 4933 + if (ret && ret != -EOPNOTSUPP) 4934 + break; 4935 + ret = 0; 4954 4936 count += next - start; 4955 4937 } 4956 4938 free_count += next - start; ··· 4974 4950 break; 4975 4951 } 4976 4952 4977 - if (!ret) 4953 + if (!ret) { 4954 + ret = count; 4978 4955 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 4956 + } 4979 4957 out: 4980 4958 ext4_unlock_group(sb, group); 4981 4959 ext4_mb_unload_buddy(&e4b); ··· 4985 4959 ext4_debug("trimmed %d blocks in the group %d\n", 4986 4960 count, group); 4987 4961 4988 - return count; 4962 + return ret; 4989 4963 } 4990 4964 4991 4965 /**
+1
fs/ext4/migrate.c
··· 14 14 15 15 #include <linux/slab.h> 16 16 #include "ext4_jbd2.h" 17 + #include "ext4_extents.h" 17 18 18 19 /* 19 20 * The contiguous blocks details which can be
+1
fs/ext4/move_extent.c
··· 18 18 #include <linux/slab.h> 19 19 #include "ext4_jbd2.h" 20 20 #include "ext4.h" 21 + #include "ext4_extents.h" 21 22 22 23 /** 23 24 * get_ext_path - Find an extent path for designated logical block number.
+356 -175
fs/ext4/namei.c
··· 202 202 struct inode *inode); 203 203 204 204 /* checksumming functions */ 205 - #define EXT4_DIRENT_TAIL(block, blocksize) \ 206 - ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ 207 - ((blocksize) - \ 208 - sizeof(struct ext4_dir_entry_tail)))) 209 - 210 - static void initialize_dirent_tail(struct ext4_dir_entry_tail *t, 211 - unsigned int blocksize) 205 + void initialize_dirent_tail(struct ext4_dir_entry_tail *t, 206 + unsigned int blocksize) 212 207 { 213 208 memset(t, 0, sizeof(struct ext4_dir_entry_tail)); 214 209 t->det_rec_len = ext4_rec_len_to_disk( ··· 256 261 return cpu_to_le32(csum); 257 262 } 258 263 264 + static void warn_no_space_for_csum(struct inode *inode) 265 + { 266 + ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " 267 + "checksum. Please run e2fsck -D.", inode->i_ino); 268 + } 269 + 259 270 int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) 260 271 { 261 272 struct ext4_dir_entry_tail *t; ··· 272 271 273 272 t = get_dirent_tail(inode, dirent); 274 273 if (!t) { 275 - EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " 276 - "leaf for checksum. Please run e2fsck -D."); 274 + warn_no_space_for_csum(inode); 277 275 return 0; 278 276 } 279 277 ··· 294 294 295 295 t = get_dirent_tail(inode, dirent); 296 296 if (!t) { 297 - EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " 298 - "leaf for checksum. 
Please run e2fsck -D."); 297 + warn_no_space_for_csum(inode); 299 298 return; 300 299 } 301 300 ··· 302 303 (void *)t - (void *)dirent); 303 304 } 304 305 305 - static inline int ext4_handle_dirty_dirent_node(handle_t *handle, 306 - struct inode *inode, 307 - struct buffer_head *bh) 306 + int ext4_handle_dirty_dirent_node(handle_t *handle, 307 + struct inode *inode, 308 + struct buffer_head *bh) 308 309 { 309 310 ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); 310 311 return ext4_handle_dirty_metadata(handle, inode, bh); ··· 376 377 count = le16_to_cpu(c->count); 377 378 if (count_offset + (limit * sizeof(struct dx_entry)) > 378 379 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { 379 - EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " 380 - "tree checksum found. Run e2fsck -D."); 380 + warn_no_space_for_csum(inode); 381 381 return 1; 382 382 } 383 383 t = (struct dx_tail *)(((struct dx_entry *)c) + limit); ··· 406 408 count = le16_to_cpu(c->count); 407 409 if (count_offset + (limit * sizeof(struct dx_entry)) > 408 410 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { 409 - EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " 410 - "tree checksum. Run e2fsck -D."); 411 + warn_no_space_for_csum(inode); 411 412 return; 412 413 } 413 414 t = (struct dx_tail *)(((struct dx_entry *)c) + limit); ··· 887 890 EXT4_DIR_REC_LEN(0)); 888 891 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 889 892 if (ext4_check_dir_entry(dir, NULL, de, bh, 893 + bh->b_data, bh->b_size, 890 894 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 891 895 + ((char *)de - bh->b_data))) { 892 896 /* On error, skip the f_pos to the next block. 
*/ ··· 1005 1007 return (err); 1006 1008 } 1007 1009 1010 + static inline int search_dirblock(struct buffer_head *bh, 1011 + struct inode *dir, 1012 + const struct qstr *d_name, 1013 + unsigned int offset, 1014 + struct ext4_dir_entry_2 **res_dir) 1015 + { 1016 + return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, 1017 + d_name, offset, res_dir); 1018 + } 1008 1019 1009 1020 /* 1010 1021 * Directory block splitting, compacting ··· 1088 1081 dx_set_count(entries, count + 1); 1089 1082 } 1090 1083 1091 - static void ext4_update_dx_flag(struct inode *inode) 1092 - { 1093 - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 1094 - EXT4_FEATURE_COMPAT_DIR_INDEX)) 1095 - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); 1096 - } 1097 - 1098 1084 /* 1099 1085 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. 1100 1086 * ··· 1107 1107 /* 1108 1108 * Returns 0 if not found, -1 on failure, and 1 on success 1109 1109 */ 1110 - static inline int search_dirblock(struct buffer_head *bh, 1111 - struct inode *dir, 1112 - const struct qstr *d_name, 1113 - unsigned int offset, 1114 - struct ext4_dir_entry_2 ** res_dir) 1110 + int search_dir(struct buffer_head *bh, 1111 + char *search_buf, 1112 + int buf_size, 1113 + struct inode *dir, 1114 + const struct qstr *d_name, 1115 + unsigned int offset, 1116 + struct ext4_dir_entry_2 **res_dir) 1115 1117 { 1116 1118 struct ext4_dir_entry_2 * de; 1117 1119 char * dlimit; ··· 1121 1119 const char *name = d_name->name; 1122 1120 int namelen = d_name->len; 1123 1121 1124 - de = (struct ext4_dir_entry_2 *) bh->b_data; 1125 - dlimit = bh->b_data + dir->i_sb->s_blocksize; 1122 + de = (struct ext4_dir_entry_2 *)search_buf; 1123 + dlimit = search_buf + buf_size; 1126 1124 while ((char *) de < dlimit) { 1127 1125 /* this code is executed quadratically often */ 1128 1126 /* do minimal checking `by hand' */ ··· 1130 1128 if ((char *) de + namelen <= dlimit && 1131 1129 ext4_match (namelen, name, de)) { 1132 1130 /* found a match 
- just to be sure, do a full check */ 1133 - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1131 + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, 1132 + bh->b_size, offset)) 1134 1133 return -1; 1135 1134 *res_dir = de; 1136 1135 return 1; ··· 1147 1144 return 0; 1148 1145 } 1149 1146 1147 + static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, 1148 + struct ext4_dir_entry *de) 1149 + { 1150 + struct super_block *sb = dir->i_sb; 1151 + 1152 + if (!is_dx(dir)) 1153 + return 0; 1154 + if (block == 0) 1155 + return 1; 1156 + if (de->inode == 0 && 1157 + ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == 1158 + sb->s_blocksize) 1159 + return 1; 1160 + return 0; 1161 + } 1150 1162 1151 1163 /* 1152 1164 * ext4_find_entry() ··· 1176 1158 */ 1177 1159 static struct buffer_head * ext4_find_entry (struct inode *dir, 1178 1160 const struct qstr *d_name, 1179 - struct ext4_dir_entry_2 ** res_dir) 1161 + struct ext4_dir_entry_2 **res_dir, 1162 + int *inlined) 1180 1163 { 1181 1164 struct super_block *sb; 1182 1165 struct buffer_head *bh_use[NAMEI_RA_SIZE]; ··· 1198 1179 namelen = d_name->len; 1199 1180 if (namelen > EXT4_NAME_LEN) 1200 1181 return NULL; 1182 + 1183 + if (ext4_has_inline_data(dir)) { 1184 + int has_inline_data = 1; 1185 + ret = ext4_find_inline_entry(dir, d_name, res_dir, 1186 + &has_inline_data); 1187 + if (has_inline_data) { 1188 + if (inlined) 1189 + *inlined = 1; 1190 + return ret; 1191 + } 1192 + } 1193 + 1201 1194 if ((namelen <= 2) && (name[0] == '.') && 1202 1195 (name[1] == '.' 
|| name[1] == '\0')) { 1203 1196 /* ··· 1275 1244 goto next; 1276 1245 } 1277 1246 if (!buffer_verified(bh) && 1247 + !is_dx_internal_node(dir, block, 1248 + (struct ext4_dir_entry *)bh->b_data) && 1278 1249 !ext4_dirent_csum_verify(dir, 1279 1250 (struct ext4_dir_entry *)bh->b_data)) { 1280 1251 EXT4_ERROR_INODE(dir, "checksumming directory " ··· 1394 1361 if (dentry->d_name.len > EXT4_NAME_LEN) 1395 1362 return ERR_PTR(-ENAMETOOLONG); 1396 1363 1397 - bh = ext4_find_entry(dir, &dentry->d_name, &de); 1364 + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 1398 1365 inode = NULL; 1399 1366 if (bh) { 1400 1367 __u32 ino = le32_to_cpu(de->inode); ··· 1428 1395 struct ext4_dir_entry_2 * de; 1429 1396 struct buffer_head *bh; 1430 1397 1431 - bh = ext4_find_entry(child->d_inode, &dotdot, &de); 1398 + bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); 1432 1399 if (!bh) 1433 1400 return ERR_PTR(-ENOENT); 1434 1401 ino = le32_to_cpu(de->inode); ··· 1626 1593 return NULL; 1627 1594 } 1628 1595 1596 + int ext4_find_dest_de(struct inode *dir, struct inode *inode, 1597 + struct buffer_head *bh, 1598 + void *buf, int buf_size, 1599 + const char *name, int namelen, 1600 + struct ext4_dir_entry_2 **dest_de) 1601 + { 1602 + struct ext4_dir_entry_2 *de; 1603 + unsigned short reclen = EXT4_DIR_REC_LEN(namelen); 1604 + int nlen, rlen; 1605 + unsigned int offset = 0; 1606 + char *top; 1607 + 1608 + de = (struct ext4_dir_entry_2 *)buf; 1609 + top = buf + buf_size - reclen; 1610 + while ((char *) de <= top) { 1611 + if (ext4_check_dir_entry(dir, NULL, de, bh, 1612 + buf, buf_size, offset)) 1613 + return -EIO; 1614 + if (ext4_match(namelen, name, de)) 1615 + return -EEXIST; 1616 + nlen = EXT4_DIR_REC_LEN(de->name_len); 1617 + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 1618 + if ((de->inode ? 
rlen - nlen : rlen) >= reclen) 1619 + break; 1620 + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1621 + offset += rlen; 1622 + } 1623 + if ((char *) de > top) 1624 + return -ENOSPC; 1625 + 1626 + *dest_de = de; 1627 + return 0; 1628 + } 1629 + 1630 + void ext4_insert_dentry(struct inode *inode, 1631 + struct ext4_dir_entry_2 *de, 1632 + int buf_size, 1633 + const char *name, int namelen) 1634 + { 1635 + 1636 + int nlen, rlen; 1637 + 1638 + nlen = EXT4_DIR_REC_LEN(de->name_len); 1639 + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 1640 + if (de->inode) { 1641 + struct ext4_dir_entry_2 *de1 = 1642 + (struct ext4_dir_entry_2 *)((char *)de + nlen); 1643 + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); 1644 + de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); 1645 + de = de1; 1646 + } 1647 + de->file_type = EXT4_FT_UNKNOWN; 1648 + de->inode = cpu_to_le32(inode->i_ino); 1649 + ext4_set_de_type(inode->i_sb, de, inode->i_mode); 1650 + de->name_len = namelen; 1651 + memcpy(de->name, name, namelen); 1652 + } 1629 1653 /* 1630 1654 * Add a new entry into a directory (leaf) block. 
If de is non-NULL, 1631 1655 * it points to a directory entry which is guaranteed to be large ··· 1698 1608 struct inode *dir = dentry->d_parent->d_inode; 1699 1609 const char *name = dentry->d_name.name; 1700 1610 int namelen = dentry->d_name.len; 1701 - unsigned int offset = 0; 1702 1611 unsigned int blocksize = dir->i_sb->s_blocksize; 1703 1612 unsigned short reclen; 1704 - int nlen, rlen, err; 1705 - char *top; 1706 1613 int csum_size = 0; 1614 + int err; 1707 1615 1708 1616 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1709 1617 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) ··· 1709 1621 1710 1622 reclen = EXT4_DIR_REC_LEN(namelen); 1711 1623 if (!de) { 1712 - de = (struct ext4_dir_entry_2 *)bh->b_data; 1713 - top = bh->b_data + (blocksize - csum_size) - reclen; 1714 - while ((char *) de <= top) { 1715 - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1716 - return -EIO; 1717 - if (ext4_match(namelen, name, de)) 1718 - return -EEXIST; 1719 - nlen = EXT4_DIR_REC_LEN(de->name_len); 1720 - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1721 - if ((de->inode? 
rlen - nlen: rlen) >= reclen) 1722 - break; 1723 - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1724 - offset += rlen; 1725 - } 1726 - if ((char *) de > top) 1727 - return -ENOSPC; 1624 + err = ext4_find_dest_de(dir, inode, 1625 + bh, bh->b_data, blocksize - csum_size, 1626 + name, namelen, &de); 1627 + if (err) 1628 + return err; 1728 1629 } 1729 1630 BUFFER_TRACE(bh, "get_write_access"); 1730 1631 err = ext4_journal_get_write_access(handle, bh); ··· 1723 1646 } 1724 1647 1725 1648 /* By now the buffer is marked for journaling */ 1726 - nlen = EXT4_DIR_REC_LEN(de->name_len); 1727 - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1728 - if (de->inode) { 1729 - struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); 1730 - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); 1731 - de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); 1732 - de = de1; 1733 - } 1734 - de->file_type = EXT4_FT_UNKNOWN; 1735 - de->inode = cpu_to_le32(inode->i_ino); 1736 - ext4_set_de_type(dir->i_sb, de, inode->i_mode); 1737 - de->name_len = namelen; 1738 - memcpy(de->name, name, namelen); 1649 + ext4_insert_dentry(inode, de, blocksize, name, namelen); 1650 + 1739 1651 /* 1740 1652 * XXX shouldn't update any times until successful 1741 1653 * completion of syscall, but too many callers depend ··· 1897 1831 blocksize = sb->s_blocksize; 1898 1832 if (!dentry->d_name.len) 1899 1833 return -EINVAL; 1834 + 1835 + if (ext4_has_inline_data(dir)) { 1836 + retval = ext4_try_add_inline_entry(handle, dentry, inode); 1837 + if (retval < 0) 1838 + return retval; 1839 + if (retval == 1) { 1840 + retval = 0; 1841 + return retval; 1842 + } 1843 + } 1844 + 1900 1845 if (is_dx(dir)) { 1901 1846 retval = ext4_dx_add_entry(handle, dentry, inode); 1902 1847 if (!retval || (retval != ERR_BAD_DX_DIR)) ··· 2113 2036 } 2114 2037 2115 2038 /* 2116 - * ext4_delete_entry deletes a directory entry by merging it with the 2117 - * previous entry 2039 + * 
ext4_generic_delete_entry deletes a directory entry by merging it 2040 + * with the previous entry 2118 2041 */ 2119 - static int ext4_delete_entry(handle_t *handle, 2120 - struct inode *dir, 2121 - struct ext4_dir_entry_2 *de_del, 2122 - struct buffer_head *bh) 2042 + int ext4_generic_delete_entry(handle_t *handle, 2043 + struct inode *dir, 2044 + struct ext4_dir_entry_2 *de_del, 2045 + struct buffer_head *bh, 2046 + void *entry_buf, 2047 + int buf_size, 2048 + int csum_size) 2123 2049 { 2124 2050 struct ext4_dir_entry_2 *de, *pde; 2125 2051 unsigned int blocksize = dir->i_sb->s_blocksize; 2126 - int csum_size = 0; 2127 - int i, err; 2128 - 2129 - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2130 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 2131 - csum_size = sizeof(struct ext4_dir_entry_tail); 2052 + int i; 2132 2053 2133 2054 i = 0; 2134 2055 pde = NULL; 2135 - de = (struct ext4_dir_entry_2 *) bh->b_data; 2136 - while (i < bh->b_size - csum_size) { 2137 - if (ext4_check_dir_entry(dir, NULL, de, bh, i)) 2056 + de = (struct ext4_dir_entry_2 *)entry_buf; 2057 + while (i < buf_size - csum_size) { 2058 + if (ext4_check_dir_entry(dir, NULL, de, bh, 2059 + bh->b_data, bh->b_size, i)) 2138 2060 return -EIO; 2139 2061 if (de == de_del) { 2140 - BUFFER_TRACE(bh, "get_write_access"); 2141 - err = ext4_journal_get_write_access(handle, bh); 2142 - if (unlikely(err)) { 2143 - ext4_std_error(dir->i_sb, err); 2144 - return err; 2145 - } 2146 2062 if (pde) 2147 2063 pde->rec_len = ext4_rec_len_to_disk( 2148 2064 ext4_rec_len_from_disk(pde->rec_len, ··· 2146 2076 else 2147 2077 de->inode = 0; 2148 2078 dir->i_version++; 2149 - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 2150 - err = ext4_handle_dirty_dirent_node(handle, dir, bh); 2151 - if (unlikely(err)) { 2152 - ext4_std_error(dir->i_sb, err); 2153 - return err; 2154 - } 2155 2079 return 0; 2156 2080 } 2157 2081 i += ext4_rec_len_from_disk(de->rec_len, blocksize); ··· 2153 2089 de = ext4_next_entry(de, blocksize); 2154 2090 } 
2155 2091 return -ENOENT; 2092 + } 2093 + 2094 + static int ext4_delete_entry(handle_t *handle, 2095 + struct inode *dir, 2096 + struct ext4_dir_entry_2 *de_del, 2097 + struct buffer_head *bh) 2098 + { 2099 + int err, csum_size = 0; 2100 + 2101 + if (ext4_has_inline_data(dir)) { 2102 + int has_inline_data = 1; 2103 + err = ext4_delete_inline_entry(handle, dir, de_del, bh, 2104 + &has_inline_data); 2105 + if (has_inline_data) 2106 + return err; 2107 + } 2108 + 2109 + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2110 + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 2111 + csum_size = sizeof(struct ext4_dir_entry_tail); 2112 + 2113 + BUFFER_TRACE(bh, "get_write_access"); 2114 + err = ext4_journal_get_write_access(handle, bh); 2115 + if (unlikely(err)) 2116 + goto out; 2117 + 2118 + err = ext4_generic_delete_entry(handle, dir, de_del, 2119 + bh, bh->b_data, 2120 + dir->i_sb->s_blocksize, csum_size); 2121 + if (err) 2122 + goto out; 2123 + 2124 + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 2125 + err = ext4_handle_dirty_dirent_node(handle, dir, bh); 2126 + if (unlikely(err)) 2127 + goto out; 2128 + 2129 + return 0; 2130 + out: 2131 + if (err != -ENOENT) 2132 + ext4_std_error(dir->i_sb, err); 2133 + return err; 2156 2134 } 2157 2135 2158 2136 /* ··· 2317 2211 return err; 2318 2212 } 2319 2213 2320 - static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2214 + struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 2215 + struct ext4_dir_entry_2 *de, 2216 + int blocksize, int csum_size, 2217 + unsigned int parent_ino, int dotdot_real_len) 2321 2218 { 2322 - handle_t *handle; 2323 - struct inode *inode; 2219 + de->inode = cpu_to_le32(inode->i_ino); 2220 + de->name_len = 1; 2221 + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), 2222 + blocksize); 2223 + strcpy(de->name, "."); 2224 + ext4_set_de_type(inode->i_sb, de, S_IFDIR); 2225 + 2226 + de = ext4_next_entry(de, blocksize); 2227 + de->inode = cpu_to_le32(parent_ino); 
2228 + de->name_len = 2; 2229 + if (!dotdot_real_len) 2230 + de->rec_len = ext4_rec_len_to_disk(blocksize - 2231 + (csum_size + EXT4_DIR_REC_LEN(1)), 2232 + blocksize); 2233 + else 2234 + de->rec_len = ext4_rec_len_to_disk( 2235 + EXT4_DIR_REC_LEN(de->name_len), blocksize); 2236 + strcpy(de->name, ".."); 2237 + ext4_set_de_type(inode->i_sb, de, S_IFDIR); 2238 + 2239 + return ext4_next_entry(de, blocksize); 2240 + } 2241 + 2242 + static int ext4_init_new_dir(handle_t *handle, struct inode *dir, 2243 + struct inode *inode) 2244 + { 2324 2245 struct buffer_head *dir_block = NULL; 2325 2246 struct ext4_dir_entry_2 *de; 2326 2247 struct ext4_dir_entry_tail *t; 2327 2248 unsigned int blocksize = dir->i_sb->s_blocksize; 2328 2249 int csum_size = 0; 2329 - int err, retries = 0; 2250 + int err; 2330 2251 2331 2252 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2332 2253 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 2333 2254 csum_size = sizeof(struct ext4_dir_entry_tail); 2255 + 2256 + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 2257 + err = ext4_try_create_inline_dir(handle, dir, inode); 2258 + if (err < 0 && err != -ENOSPC) 2259 + goto out; 2260 + if (!err) 2261 + goto out; 2262 + } 2263 + 2264 + inode->i_size = EXT4_I(inode)->i_disksize = blocksize; 2265 + dir_block = ext4_bread(handle, inode, 0, 1, &err); 2266 + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { 2267 + if (!err) { 2268 + err = -EIO; 2269 + ext4_error(inode->i_sb, 2270 + "Directory hole detected on inode %lu\n", 2271 + inode->i_ino); 2272 + } 2273 + goto out; 2274 + } 2275 + BUFFER_TRACE(dir_block, "get_write_access"); 2276 + err = ext4_journal_get_write_access(handle, dir_block); 2277 + if (err) 2278 + goto out; 2279 + de = (struct ext4_dir_entry_2 *)dir_block->b_data; 2280 + ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); 2281 + set_nlink(inode, 2); 2282 + if (csum_size) { 2283 + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); 2284 + initialize_dirent_tail(t, 
blocksize); 2285 + } 2286 + 2287 + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 2288 + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); 2289 + if (err) 2290 + goto out; 2291 + set_buffer_verified(dir_block); 2292 + out: 2293 + brelse(dir_block); 2294 + return err; 2295 + } 2296 + 2297 + static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2298 + { 2299 + handle_t *handle; 2300 + struct inode *inode; 2301 + int err, retries = 0; 2334 2302 2335 2303 if (EXT4_DIR_LINK_MAX(dir)) 2336 2304 return -EMLINK; ··· 2429 2249 2430 2250 inode->i_op = &ext4_dir_inode_operations; 2431 2251 inode->i_fop = &ext4_dir_operations; 2432 - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 2433 - if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { 2434 - if (!err) { 2435 - err = -EIO; 2436 - ext4_error(inode->i_sb, 2437 - "Directory hole detected on inode %lu\n", 2438 - inode->i_ino); 2439 - } 2440 - goto out_clear_inode; 2441 - } 2442 - BUFFER_TRACE(dir_block, "get_write_access"); 2443 - err = ext4_journal_get_write_access(handle, dir_block); 2252 + err = ext4_init_new_dir(handle, dir, inode); 2444 2253 if (err) 2445 2254 goto out_clear_inode; 2446 - de = (struct ext4_dir_entry_2 *) dir_block->b_data; 2447 - de->inode = cpu_to_le32(inode->i_ino); 2448 - de->name_len = 1; 2449 - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), 2450 - blocksize); 2451 - strcpy(de->name, "."); 2452 - ext4_set_de_type(dir->i_sb, de, S_IFDIR); 2453 - de = ext4_next_entry(de, blocksize); 2454 - de->inode = cpu_to_le32(dir->i_ino); 2455 - de->rec_len = ext4_rec_len_to_disk(blocksize - 2456 - (csum_size + EXT4_DIR_REC_LEN(1)), 2457 - blocksize); 2458 - de->name_len = 2; 2459 - strcpy(de->name, ".."); 2460 - ext4_set_de_type(dir->i_sb, de, S_IFDIR); 2461 - set_nlink(inode, 2); 2462 - 2463 - if (csum_size) { 2464 - t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); 2465 - initialize_dirent_tail(t, blocksize); 2466 
- } 2467 - 2468 - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 2469 - err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); 2470 - if (err) 2471 - goto out_clear_inode; 2472 - set_buffer_verified(dir_block); 2473 2255 err = ext4_mark_inode_dirty(handle, inode); 2474 2256 if (!err) 2475 2257 err = ext4_add_entry(handle, dentry, inode); ··· 2451 2309 unlock_new_inode(inode); 2452 2310 d_instantiate(dentry, inode); 2453 2311 out_stop: 2454 - brelse(dir_block); 2455 2312 ext4_journal_stop(handle); 2456 2313 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2457 2314 goto retry; ··· 2467 2326 struct ext4_dir_entry_2 *de, *de1; 2468 2327 struct super_block *sb; 2469 2328 int err = 0; 2329 + 2330 + if (ext4_has_inline_data(inode)) { 2331 + int has_inline_data = 1; 2332 + 2333 + err = empty_inline_dir(inode, &has_inline_data); 2334 + if (has_inline_data) 2335 + return err; 2336 + } 2470 2337 2471 2338 sb = inode->i_sb; 2472 2339 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || ··· 2542 2393 set_buffer_verified(bh); 2543 2394 de = (struct ext4_dir_entry_2 *) bh->b_data; 2544 2395 } 2545 - if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { 2396 + if (ext4_check_dir_entry(inode, NULL, de, bh, 2397 + bh->b_data, bh->b_size, offset)) { 2546 2398 de = (struct ext4_dir_entry_2 *)(bh->b_data + 2547 2399 sb->s_blocksize); 2548 2400 offset = (offset | (sb->s_blocksize - 1)) + 1; ··· 2729 2579 return PTR_ERR(handle); 2730 2580 2731 2581 retval = -ENOENT; 2732 - bh = ext4_find_entry(dir, &dentry->d_name, &de); 2582 + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2733 2583 if (!bh) 2734 2584 goto end_rmdir; 2735 2585 ··· 2794 2644 ext4_handle_sync(handle); 2795 2645 2796 2646 retval = -ENOENT; 2797 - bh = ext4_find_entry(dir, &dentry->d_name, &de); 2647 + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2798 2648 if (!bh) 2799 2649 goto end_unlink; 2800 2650 ··· 2976 2826 return err; 2977 2827 } 2978 2828 
2979 - #define PARENT_INO(buffer, size) \ 2980 - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) 2829 + 2830 + /* 2831 + * Try to find buffer head where contains the parent block. 2832 + * It should be the inode block if it is inlined or the 1st block 2833 + * if it is a normal dir. 2834 + */ 2835 + static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, 2836 + struct inode *inode, 2837 + int *retval, 2838 + struct ext4_dir_entry_2 **parent_de, 2839 + int *inlined) 2840 + { 2841 + struct buffer_head *bh; 2842 + 2843 + if (!ext4_has_inline_data(inode)) { 2844 + if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { 2845 + if (!*retval) { 2846 + *retval = -EIO; 2847 + ext4_error(inode->i_sb, 2848 + "Directory hole detected on inode %lu\n", 2849 + inode->i_ino); 2850 + } 2851 + return NULL; 2852 + } 2853 + *parent_de = ext4_next_entry( 2854 + (struct ext4_dir_entry_2 *)bh->b_data, 2855 + inode->i_sb->s_blocksize); 2856 + return bh; 2857 + } 2858 + 2859 + *inlined = 1; 2860 + return ext4_get_first_inline_block(inode, parent_de, retval); 2861 + } 2981 2862 2982 2863 /* 2983 2864 * Anybody can rename anything with this: the permission checks are left to the ··· 3022 2841 struct buffer_head *old_bh, *new_bh, *dir_bh; 3023 2842 struct ext4_dir_entry_2 *old_de, *new_de; 3024 2843 int retval, force_da_alloc = 0; 2844 + int inlined = 0, new_inlined = 0; 2845 + struct ext4_dir_entry_2 *parent_de; 3025 2846 3026 2847 dquot_initialize(old_dir); 3027 2848 dquot_initialize(new_dir); ··· 3043 2860 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 3044 2861 ext4_handle_sync(handle); 3045 2862 3046 - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 2863 + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); 3047 2864 /* 3048 2865 * Check for inode number is _not_ due to possible IO errors. 
3049 2866 * We might rmdir the source, keep it as pwd of some process ··· 3056 2873 goto end_rename; 3057 2874 3058 2875 new_inode = new_dentry->d_inode; 3059 - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); 2876 + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, 2877 + &new_de, &new_inlined); 3060 2878 if (new_bh) { 3061 2879 if (!new_inode) { 3062 2880 brelse(new_bh); ··· 3071 2887 goto end_rename; 3072 2888 } 3073 2889 retval = -EIO; 3074 - if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { 3075 - if (!retval) { 3076 - retval = -EIO; 3077 - ext4_error(old_inode->i_sb, 3078 - "Directory hole detected on inode %lu\n", 3079 - old_inode->i_ino); 3080 - } 2890 + dir_bh = ext4_get_first_dir_block(handle, old_inode, 2891 + &retval, &parent_de, 2892 + &inlined); 2893 + if (!dir_bh) 3081 2894 goto end_rename; 3082 - } 3083 - if (!buffer_verified(dir_bh) && 2895 + if (!inlined && !buffer_verified(dir_bh) && 3084 2896 !ext4_dirent_csum_verify(old_inode, 3085 2897 (struct ext4_dir_entry *)dir_bh->b_data)) 3086 2898 goto end_rename; 3087 2899 set_buffer_verified(dir_bh); 3088 - if (le32_to_cpu(PARENT_INO(dir_bh->b_data, 3089 - old_dir->i_sb->s_blocksize)) != old_dir->i_ino) 2900 + if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) 3090 2901 goto end_rename; 3091 2902 retval = -EMLINK; 3092 2903 if (!new_inode && new_dir != old_dir && ··· 3110 2931 ext4_current_time(new_dir); 3111 2932 ext4_mark_inode_dirty(handle, new_dir); 3112 2933 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 3113 - retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); 3114 - if (unlikely(retval)) { 3115 - ext4_std_error(new_dir->i_sb, retval); 3116 - goto end_rename; 2934 + if (!new_inlined) { 2935 + retval = ext4_handle_dirty_dirent_node(handle, 2936 + new_dir, new_bh); 2937 + if (unlikely(retval)) { 2938 + ext4_std_error(new_dir->i_sb, retval); 2939 + goto end_rename; 2940 + } 3117 2941 } 3118 2942 brelse(new_bh); 3119 2943 new_bh = NULL; ··· 
3144 2962 struct buffer_head *old_bh2; 3145 2963 struct ext4_dir_entry_2 *old_de2; 3146 2964 3147 - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); 2965 + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, 2966 + &old_de2, NULL); 3148 2967 if (old_bh2) { 3149 2968 retval = ext4_delete_entry(handle, old_dir, 3150 2969 old_de2, old_bh2); ··· 3165 2982 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 3166 2983 ext4_update_dx_flag(old_dir); 3167 2984 if (dir_bh) { 3168 - PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 3169 - cpu_to_le32(new_dir->i_ino); 2985 + parent_de->inode = cpu_to_le32(new_dir->i_ino); 3170 2986 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 3171 - if (is_dx(old_inode)) { 3172 - retval = ext4_handle_dirty_dx_node(handle, 3173 - old_inode, 3174 - dir_bh); 2987 + if (!inlined) { 2988 + if (is_dx(old_inode)) { 2989 + retval = ext4_handle_dirty_dx_node(handle, 2990 + old_inode, 2991 + dir_bh); 2992 + } else { 2993 + retval = ext4_handle_dirty_dirent_node(handle, 2994 + old_inode, dir_bh); 2995 + } 3175 2996 } else { 3176 - retval = ext4_handle_dirty_dirent_node(handle, 3177 - old_inode, 3178 - dir_bh); 2997 + retval = ext4_mark_inode_dirty(handle, old_inode); 3179 2998 } 3180 2999 if (retval) { 3181 3000 ext4_std_error(old_dir->i_sb, retval); ··· 3228 3043 .mknod = ext4_mknod, 3229 3044 .rename = ext4_rename, 3230 3045 .setattr = ext4_setattr, 3231 - #ifdef CONFIG_EXT4_FS_XATTR 3232 3046 .setxattr = generic_setxattr, 3233 3047 .getxattr = generic_getxattr, 3234 3048 .listxattr = ext4_listxattr, 3235 3049 .removexattr = generic_removexattr, 3236 - #endif 3237 3050 .get_acl = ext4_get_acl, 3238 3051 .fiemap = ext4_fiemap, 3239 3052 }; 3240 3053 3241 3054 const struct inode_operations ext4_special_inode_operations = { 3242 3055 .setattr = ext4_setattr, 3243 - #ifdef CONFIG_EXT4_FS_XATTR 3244 3056 .setxattr = generic_setxattr, 3245 3057 .getxattr = generic_getxattr, 3246 3058 .listxattr = 
ext4_listxattr, 3247 3059 .removexattr = generic_removexattr, 3248 - #endif 3249 3060 .get_acl = ext4_get_acl, 3250 3061 };
+1 -2
fs/ext4/page-io.c
··· 27 27 #include "ext4_jbd2.h" 28 28 #include "xattr.h" 29 29 #include "acl.h" 30 - #include "ext4_extents.h" 31 30 32 31 static struct kmem_cache *io_page_cachep, *io_end_cachep; 33 32 ··· 110 111 inode_dio_done(inode); 111 112 /* Wake up anyone waiting on unwritten extent conversion */ 112 113 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 113 - wake_up_all(ext4_ioend_wq(io->inode)); 114 + wake_up_all(ext4_ioend_wq(inode)); 114 115 return ret; 115 116 } 116 117
+3 -14
fs/ext4/resize.c
··· 783 783 784 784 err = ext4_journal_get_write_access(handle, gdb_bh); 785 785 if (unlikely(err)) 786 - goto exit_sbh; 786 + goto exit_dind; 787 787 788 788 err = ext4_journal_get_write_access(handle, dind); 789 789 if (unlikely(err)) ··· 792 792 /* ext4_reserve_inode_write() gets a reference on the iloc */ 793 793 err = ext4_reserve_inode_write(handle, inode, &iloc); 794 794 if (unlikely(err)) 795 - goto exit_dindj; 795 + goto exit_dind; 796 796 797 797 n_group_desc = ext4_kvmalloc((gdb_num + 1) * 798 798 sizeof(struct buffer_head *), ··· 846 846 847 847 exit_inode: 848 848 ext4_kvfree(n_group_desc); 849 - /* ext4_handle_release_buffer(handle, iloc.bh); */ 850 849 brelse(iloc.bh); 851 - exit_dindj: 852 - /* ext4_handle_release_buffer(handle, dind); */ 853 - exit_sbh: 854 - /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ 855 850 exit_dind: 856 851 brelse(dind); 857 852 exit_bh: ··· 964 969 } 965 970 966 971 for (i = 0; i < reserved_gdb; i++) { 967 - if ((err = ext4_journal_get_write_access(handle, primary[i]))) { 968 - /* 969 - int j; 970 - for (j = 0; j < i; j++) 971 - ext4_handle_release_buffer(handle, primary[j]); 972 - */ 972 + if ((err = ext4_journal_get_write_access(handle, primary[i]))) 973 973 goto exit_bh; 974 - } 975 974 } 976 975 977 976 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+31 -26
fs/ext4/super.c
··· 45 45 #include <linux/freezer.h> 46 46 47 47 #include "ext4.h" 48 - #include "ext4_extents.h" 48 + #include "ext4_extents.h" /* Needed for trace points definition */ 49 49 #include "ext4_jbd2.h" 50 50 #include "xattr.h" 51 51 #include "acl.h" ··· 939 939 return NULL; 940 940 941 941 ei->vfs_inode.i_version = 1; 942 - ei->vfs_inode.i_data.writeback_index = 0; 943 942 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 944 943 INIT_LIST_HEAD(&ei->i_prealloc_list); 945 944 spin_lock_init(&ei->i_prealloc_lock); 945 + ext4_es_init_tree(&ei->i_es_tree); 946 + rwlock_init(&ei->i_es_lock); 946 947 ei->i_reserved_data_blocks = 0; 947 948 ei->i_reserved_meta_blocks = 0; 948 949 ei->i_allocated_meta_blocks = 0; ··· 997 996 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 998 997 999 998 INIT_LIST_HEAD(&ei->i_orphan); 1000 - #ifdef CONFIG_EXT4_FS_XATTR 1001 999 init_rwsem(&ei->xattr_sem); 1002 - #endif 1003 1000 init_rwsem(&ei->i_data_sem); 1004 1001 inode_init_once(&ei->vfs_inode); 1005 1002 } ··· 1030 1031 clear_inode(inode); 1031 1032 dquot_drop(inode); 1032 1033 ext4_discard_preallocations(inode); 1034 + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 1033 1035 if (EXT4_I(inode)->jinode) { 1034 1036 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 1035 1037 EXT4_I(inode)->jinode); ··· 1447 1447 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, 1448 1448 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, 1449 1449 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, 1450 - #ifdef CONFIG_EXT4_FS_XATTR 1451 1450 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, 1452 1451 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, 1453 - #else 1454 - {Opt_user_xattr, 0, MOPT_NOSUPPORT}, 1455 - {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, 1456 - #endif 1457 1452 #ifdef CONFIG_EXT4_FS_POSIX_ACL 1458 1453 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, 1459 1454 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, ··· 3197 3202 ext4_fsblk_t 
overhead = 0; 3198 3203 char *buf = (char *) get_zeroed_page(GFP_KERNEL); 3199 3204 3200 - memset(buf, 0, PAGE_SIZE); 3201 3205 if (!buf) 3202 3206 return -ENOMEM; 3203 3207 ··· 3250 3256 unsigned int i; 3251 3257 int needs_recovery, has_huge_files, has_bigalloc; 3252 3258 __u64 blocks_count; 3253 - int err; 3259 + int err = 0; 3254 3260 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3255 3261 ext4_group_t first_not_zeroed; 3256 3262 ··· 3266 3272 } 3267 3273 sb->s_fs_info = sbi; 3268 3274 sbi->s_sb = sb; 3269 - sbi->s_mount_opt = 0; 3270 - sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); 3271 - sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); 3272 3275 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 3273 3276 sbi->s_sb_block = sb_block; 3274 3277 if (sb->s_bdev->bd_part) ··· 3276 3285 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3277 3286 *cp = '!'; 3278 3287 3288 + /* -EINVAL is default */ 3279 3289 ret = -EINVAL; 3280 3290 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 3281 3291 if (!blocksize) { ··· 3361 3369 if (def_mount_opts & EXT4_DEFM_UID16) 3362 3370 set_opt(sb, NO_UID32); 3363 3371 /* xattr user namespace & acls are now defaulted on */ 3364 - #ifdef CONFIG_EXT4_FS_XATTR 3365 3372 set_opt(sb, XATTR_USER); 3366 - #endif 3367 3373 #ifdef CONFIG_EXT4_FS_POSIX_ACL 3368 3374 set_opt(sb, POSIX_ACL); 3369 3375 #endif ··· 3652 3662 " too large to mount safely on this system"); 3653 3663 if (sizeof(sector_t) < 8) 3654 3664 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3655 - ret = err; 3656 3665 goto failed_mount; 3657 3666 } 3658 3667 ··· 3759 3770 } 3760 3771 if (err) { 3761 3772 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3762 - ret = err; 3763 3773 goto failed_mount3; 3764 3774 } 3765 3775 ··· 3789 3801 3790 3802 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3791 3803 mutex_init(&sbi->s_orphan_lock); 3792 - sbi->s_resize_flags = 0; 3793 3804 3794 3805 sb->s_root = NULL; 3795 3806 ··· 
3884 3897 if (es->s_overhead_clusters) 3885 3898 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); 3886 3899 else { 3887 - ret = ext4_calculate_overhead(sb); 3888 - if (ret) 3900 + err = ext4_calculate_overhead(sb); 3901 + if (err) 3889 3902 goto failed_mount_wq; 3890 3903 } 3891 3904 ··· 3897 3910 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3898 3911 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3899 3912 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3913 + ret = -ENOMEM; 3900 3914 goto failed_mount_wq; 3901 3915 } 3902 3916 ··· 4000 4012 /* Enable quota usage during mount. */ 4001 4013 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && 4002 4014 !(sb->s_flags & MS_RDONLY)) { 4003 - ret = ext4_enable_quotas(sb); 4004 - if (ret) 4015 + err = ext4_enable_quotas(sb); 4016 + if (err) 4005 4017 goto failed_mount7; 4006 4018 } 4007 4019 #endif /* CONFIG_QUOTA */ 4020 + 4021 + if (test_opt(sb, DISCARD)) { 4022 + struct request_queue *q = bdev_get_queue(sb->s_bdev); 4023 + if (!blk_queue_discard(q)) 4024 + ext4_msg(sb, KERN_WARNING, 4025 + "mounting with \"discard\" option, but " 4026 + "the device does not support discard"); 4027 + } 4008 4028 4009 4029 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 4010 4030 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, ··· 4080 4084 kfree(sbi); 4081 4085 out_free_orig: 4082 4086 kfree(orig_data); 4083 - return ret; 4087 + return err ? 
err : ret; 4084 4088 } 4085 4089 4086 4090 /* ··· 4786 4790 4787 4791 buf->f_type = EXT4_SUPER_MAGIC; 4788 4792 buf->f_bsize = sb->s_blocksize; 4789 - buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); 4793 + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); 4790 4794 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - 4791 4795 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); 4792 4796 /* prevent underflow in case that few free space is available */ ··· 5278 5282 ext4_li_info = NULL; 5279 5283 mutex_init(&ext4_li_mtx); 5280 5284 5285 + /* Build-time check for flags consistency */ 5281 5286 ext4_check_flag_values(); 5282 5287 5283 5288 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { ··· 5286 5289 init_waitqueue_head(&ext4__ioend_wq[i]); 5287 5290 } 5288 5291 5289 - err = ext4_init_pageio(); 5292 + err = ext4_init_es(); 5290 5293 if (err) 5291 5294 return err; 5295 + 5296 + err = ext4_init_pageio(); 5297 + if (err) 5298 + goto out7; 5299 + 5292 5300 err = ext4_init_system_zone(); 5293 5301 if (err) 5294 5302 goto out6; ··· 5343 5341 ext4_exit_system_zone(); 5344 5342 out6: 5345 5343 ext4_exit_pageio(); 5344 + out7: 5345 + ext4_exit_es(); 5346 + 5346 5347 return err; 5347 5348 } 5348 5349
-4
fs/ext4/symlink.c
··· 35 35 .follow_link = page_follow_link_light, 36 36 .put_link = page_put_link, 37 37 .setattr = ext4_setattr, 38 - #ifdef CONFIG_EXT4_FS_XATTR 39 38 .setxattr = generic_setxattr, 40 39 .getxattr = generic_getxattr, 41 40 .listxattr = ext4_listxattr, 42 41 .removexattr = generic_removexattr, 43 - #endif 44 42 }; 45 43 46 44 const struct inode_operations ext4_fast_symlink_inode_operations = { 47 45 .readlink = generic_readlink, 48 46 .follow_link = ext4_follow_link, 49 47 .setattr = ext4_setattr, 50 - #ifdef CONFIG_EXT4_FS_XATTR 51 48 .setxattr = generic_setxattr, 52 49 .getxattr = generic_getxattr, 53 50 .listxattr = ext4_listxattr, 54 51 .removexattr = generic_removexattr, 55 - #endif 56 52 };
+69 -41
fs/ext4/xattr.c
··· 61 61 #include "xattr.h" 62 62 #include "acl.h" 63 63 64 - #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) 65 - #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) 66 - #define BFIRST(bh) ENTRY(BHDR(bh)+1) 67 - #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) 68 - 69 64 #ifdef EXT4_XATTR_DEBUG 70 65 # define ea_idebug(inode, f...) do { \ 71 66 printk(KERN_DEBUG "inode %s:%lu: ", \ ··· 307 312 return error; 308 313 } 309 314 310 - static int 315 + int 311 316 ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, 312 317 void *buffer, size_t buffer_size) 313 318 { ··· 576 581 return (*min_offs - ((void *)last - base) - sizeof(__u32)); 577 582 } 578 583 579 - struct ext4_xattr_info { 580 - int name_index; 581 - const char *name; 582 - const void *value; 583 - size_t value_len; 584 - }; 585 - 586 - struct ext4_xattr_search { 587 - struct ext4_xattr_entry *first; 588 - void *base; 589 - void *end; 590 - struct ext4_xattr_entry *here; 591 - int not_found; 592 - }; 593 - 594 584 static int 595 585 ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) 596 586 { ··· 628 648 size. Just replace. */ 629 649 s->here->e_value_size = 630 650 cpu_to_le32(i->value_len); 631 - memset(val + size - EXT4_XATTR_PAD, 0, 632 - EXT4_XATTR_PAD); /* Clear pad bytes. */ 633 - memcpy(val, i->value, i->value_len); 651 + if (i->value == EXT4_ZERO_XATTR_VALUE) { 652 + memset(val, 0, size); 653 + } else { 654 + /* Clear pad bytes first. */ 655 + memset(val + size - EXT4_XATTR_PAD, 0, 656 + EXT4_XATTR_PAD); 657 + memcpy(val, i->value, i->value_len); 658 + } 634 659 return 0; 635 660 } 636 661 ··· 674 689 size_t size = EXT4_XATTR_SIZE(i->value_len); 675 690 void *val = s->base + min_offs - size; 676 691 s->here->e_value_offs = cpu_to_le16(min_offs - size); 677 - memset(val + size - EXT4_XATTR_PAD, 0, 678 - EXT4_XATTR_PAD); /* Clear the pad bytes. 
*/ 679 - memcpy(val, i->value, i->value_len); 692 + if (i->value == EXT4_ZERO_XATTR_VALUE) { 693 + memset(val, 0, size); 694 + } else { 695 + /* Clear the pad bytes first. */ 696 + memset(val + size - EXT4_XATTR_PAD, 0, 697 + EXT4_XATTR_PAD); 698 + memcpy(val, i->value, i->value_len); 699 + } 680 700 } 681 701 } 682 702 return 0; ··· 784 794 int offset = (char *)s->here - bs->bh->b_data; 785 795 786 796 unlock_buffer(bs->bh); 787 - ext4_handle_release_buffer(handle, bs->bh); 788 797 if (ce) { 789 798 mb_cache_entry_release(ce); 790 799 ce = NULL; ··· 939 950 #undef header 940 951 } 941 952 942 - struct ext4_xattr_ibody_find { 943 - struct ext4_xattr_search s; 944 - struct ext4_iloc iloc; 945 - }; 946 - 947 - static int 948 - ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, 949 - struct ext4_xattr_ibody_find *is) 953 + int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, 954 + struct ext4_xattr_ibody_find *is) 950 955 { 951 956 struct ext4_xattr_ibody_header *header; 952 957 struct ext4_inode *raw_inode; ··· 968 985 return 0; 969 986 } 970 987 971 - static int 972 - ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, 973 - struct ext4_xattr_info *i, 974 - struct ext4_xattr_ibody_find *is) 988 + int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, 989 + struct ext4_xattr_info *i, 990 + struct ext4_xattr_ibody_find *is) 991 + { 992 + struct ext4_xattr_ibody_header *header; 993 + struct ext4_xattr_search *s = &is->s; 994 + int error; 995 + 996 + if (EXT4_I(inode)->i_extra_isize == 0) 997 + return -ENOSPC; 998 + error = ext4_xattr_set_entry(i, s); 999 + if (error) { 1000 + if (error == -ENOSPC && 1001 + ext4_has_inline_data(inode)) { 1002 + error = ext4_try_to_evict_inline_data(handle, inode, 1003 + EXT4_XATTR_LEN(strlen(i->name) + 1004 + EXT4_XATTR_SIZE(i->value_len))); 1005 + if (error) 1006 + return error; 1007 + error = ext4_xattr_ibody_find(inode, i, is); 1008 + if (error) 1009 + return error; 1010 
+ error = ext4_xattr_set_entry(i, s); 1011 + } 1012 + if (error) 1013 + return error; 1014 + } 1015 + header = IHDR(inode, ext4_raw_inode(&is->iloc)); 1016 + if (!IS_LAST_ENTRY(s->first)) { 1017 + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); 1018 + ext4_set_inode_state(inode, EXT4_STATE_XATTR); 1019 + } else { 1020 + header->h_magic = cpu_to_le32(0); 1021 + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); 1022 + } 1023 + return 0; 1024 + } 1025 + 1026 + static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, 1027 + struct ext4_xattr_info *i, 1028 + struct ext4_xattr_ibody_find *is) 975 1029 { 976 1030 struct ext4_xattr_ibody_header *header; 977 1031 struct ext4_xattr_search *s = &is->s; ··· 1164 1144 { 1165 1145 handle_t *handle; 1166 1146 int error, retries = 0; 1147 + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); 1167 1148 1168 1149 retry: 1169 - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 1150 + /* 1151 + * In case of inline data, we may push out the data to a block, 1152 + * So reserve the journal space first. 1153 + */ 1154 + if (ext4_has_inline_data(inode)) 1155 + credits += ext4_writepage_trans_blocks(inode) + 1; 1156 + 1157 + handle = ext4_journal_start(inode, credits); 1170 1158 if (IS_ERR(handle)) { 1171 1159 error = PTR_ERR(handle); 1172 1160 } else {
+100 -52
fs/ext4/xattr.h
··· 21 21 #define EXT4_XATTR_INDEX_TRUSTED 4 22 22 #define EXT4_XATTR_INDEX_LUSTRE 5 23 23 #define EXT4_XATTR_INDEX_SECURITY 6 24 + #define EXT4_XATTR_INDEX_SYSTEM 7 24 25 25 26 struct ext4_xattr_header { 26 27 __le32 h_magic; /* magic number for identification */ ··· 66 65 EXT4_I(inode)->i_extra_isize)) 67 66 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 68 67 69 - # ifdef CONFIG_EXT4_FS_XATTR 68 + #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) 69 + #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) 70 + #define BFIRST(bh) ENTRY(BHDR(bh)+1) 71 + #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) 72 + 73 + #define EXT4_ZERO_XATTR_VALUE ((void *)-1) 74 + 75 + struct ext4_xattr_info { 76 + int name_index; 77 + const char *name; 78 + const void *value; 79 + size_t value_len; 80 + }; 81 + 82 + struct ext4_xattr_search { 83 + struct ext4_xattr_entry *first; 84 + void *base; 85 + void *end; 86 + struct ext4_xattr_entry *here; 87 + int not_found; 88 + }; 89 + 90 + struct ext4_xattr_ibody_find { 91 + struct ext4_xattr_search s; 92 + struct ext4_iloc iloc; 93 + }; 70 94 71 95 extern const struct xattr_handler ext4_xattr_user_handler; 72 96 extern const struct xattr_handler ext4_xattr_trusted_handler; ··· 116 90 117 91 extern const struct xattr_handler *ext4_xattr_handlers[]; 118 92 119 - # else /* CONFIG_EXT4_FS_XATTR */ 93 + extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, 94 + struct ext4_xattr_ibody_find *is); 95 + extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, 96 + const char *name, 97 + void *buffer, size_t buffer_size); 98 + extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, 99 + struct ext4_xattr_info *i, 100 + struct ext4_xattr_ibody_find *is); 120 101 121 - static inline int 122 - ext4_xattr_get(struct inode *inode, int name_index, const char *name, 123 - void *buffer, size_t size, int flags) 124 - { 125 - return -EOPNOTSUPP; 126 - } 102 + extern int 
ext4_has_inline_data(struct inode *inode); 103 + extern int ext4_get_inline_size(struct inode *inode); 104 + extern int ext4_get_max_inline_size(struct inode *inode); 105 + extern int ext4_find_inline_data_nolock(struct inode *inode); 106 + extern void ext4_write_inline_data(struct inode *inode, 107 + struct ext4_iloc *iloc, 108 + void *buffer, loff_t pos, 109 + unsigned int len); 110 + extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, 111 + unsigned int len); 112 + extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, 113 + unsigned int len); 114 + extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); 127 115 128 - static inline int 129 - ext4_xattr_set(struct inode *inode, int name_index, const char *name, 130 - const void *value, size_t size, int flags) 131 - { 132 - return -EOPNOTSUPP; 133 - } 116 + extern int ext4_readpage_inline(struct inode *inode, struct page *page); 117 + extern int ext4_try_to_write_inline_data(struct address_space *mapping, 118 + struct inode *inode, 119 + loff_t pos, unsigned len, 120 + unsigned flags, 121 + struct page **pagep); 122 + extern int ext4_write_inline_data_end(struct inode *inode, 123 + loff_t pos, unsigned len, 124 + unsigned copied, 125 + struct page *page); 126 + extern struct buffer_head * 127 + ext4_journalled_write_inline_data(struct inode *inode, 128 + unsigned len, 129 + struct page *page); 130 + extern int ext4_da_write_inline_data_begin(struct address_space *mapping, 131 + struct inode *inode, 132 + loff_t pos, unsigned len, 133 + unsigned flags, 134 + struct page **pagep, 135 + void **fsdata); 136 + extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, 137 + unsigned len, unsigned copied, 138 + struct page *page); 139 + extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, 140 + struct inode *inode); 141 + extern int ext4_try_create_inline_dir(handle_t *handle, 142 + struct inode *parent, 143 + struct 
inode *inode); 144 + extern int ext4_read_inline_dir(struct file *filp, 145 + void *dirent, filldir_t filldir, 146 + int *has_inline_data); 147 + extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, 148 + const struct qstr *d_name, 149 + struct ext4_dir_entry_2 **res_dir, 150 + int *has_inline_data); 151 + extern int ext4_delete_inline_entry(handle_t *handle, 152 + struct inode *dir, 153 + struct ext4_dir_entry_2 *de_del, 154 + struct buffer_head *bh, 155 + int *has_inline_data); 156 + extern int empty_inline_dir(struct inode *dir, int *has_inline_data); 157 + extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, 158 + struct ext4_dir_entry_2 **parent_de, 159 + int *retval); 160 + extern int ext4_inline_data_fiemap(struct inode *inode, 161 + struct fiemap_extent_info *fieinfo, 162 + int *has_inline); 163 + extern int ext4_try_to_evict_inline_data(handle_t *handle, 164 + struct inode *inode, 165 + int needed); 166 + extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); 134 167 135 - static inline int 136 - ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, 137 - const char *name, const void *value, size_t size, int flags) 138 - { 139 - return -EOPNOTSUPP; 140 - } 141 - 142 - static inline void 143 - ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) 144 - { 145 - } 146 - 147 - static inline void 148 - ext4_xattr_put_super(struct super_block *sb) 149 - { 150 - } 151 - 152 - static __init inline int 153 - ext4_init_xattr(void) 154 - { 155 - return 0; 156 - } 157 - 158 - static inline void 159 - ext4_exit_xattr(void) 160 - { 161 - } 162 - 163 - static inline int 164 - ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 165 - struct ext4_inode *raw_inode, handle_t *handle) 166 - { 167 - return -EOPNOTSUPP; 168 - } 169 - 170 - #define ext4_xattr_handlers NULL 171 - 172 - # endif /* CONFIG_EXT4_FS_XATTR */ 168 + extern int ext4_convert_inline_data(struct 
inode *inode); 173 169 174 170 #ifdef CONFIG_EXT4_FS_SECURITY 175 171 extern int ext4_init_security(handle_t *handle, struct inode *inode,
-1
fs/jbd2/journal.c
··· 60 60 EXPORT_SYMBOL(jbd2_journal_get_undo_access); 61 61 EXPORT_SYMBOL(jbd2_journal_set_triggers); 62 62 EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 63 - EXPORT_SYMBOL(jbd2_journal_release_buffer); 64 63 EXPORT_SYMBOL(jbd2_journal_forget); 65 64 #if 0 66 65 EXPORT_SYMBOL(journal_sync_buffer);
-11
fs/jbd2/transaction.c
··· 1207 1207 return ret; 1208 1208 } 1209 1209 1210 - /* 1211 - * jbd2_journal_release_buffer: undo a get_write_access without any buffer 1212 - * updates, if the update decided in the end that it didn't need access. 1213 - * 1214 - */ 1215 - void 1216 - jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh) 1217 - { 1218 - BUFFER_TRACE(bh, "entry"); 1219 - } 1220 - 1221 1210 /** 1222 1211 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. 1223 1212 * @handle: transaction handle
+7 -2
include/linux/jbd2.h
··· 1096 1096 void jbd2_journal_set_triggers(struct buffer_head *, 1097 1097 struct jbd2_buffer_trigger_type *type); 1098 1098 extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); 1099 - extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); 1100 1099 extern int jbd2_journal_forget (handle_t *, struct buffer_head *); 1101 1100 extern void journal_sync_buffer (struct buffer_head *); 1102 1101 extern void jbd2_journal_invalidatepage(journal_t *, ··· 1302 1303 1303 1304 extern int jbd_blocks_per_page(struct inode *inode); 1304 1305 1306 + /* JBD uses a CRC32 checksum */ 1307 + #define JBD_MAX_CHECKSUM_SIZE 4 1308 + 1305 1309 static inline u32 jbd2_chksum(journal_t *journal, u32 crc, 1306 1310 const void *address, unsigned int length) 1307 1311 { 1308 1312 struct { 1309 1313 struct shash_desc shash; 1310 - char ctx[crypto_shash_descsize(journal->j_chksum_driver)]; 1314 + char ctx[JBD_MAX_CHECKSUM_SIZE]; 1311 1315 } desc; 1312 1316 int err; 1317 + 1318 + BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > 1319 + JBD_MAX_CHECKSUM_SIZE); 1313 1320 1314 1321 desc.shash.tfm = journal->j_chksum_driver; 1315 1322 desc.shash.flags = 0;
+118 -18
include/trace/events/ext4.h
··· 15 15 struct mpage_da_data; 16 16 struct ext4_map_blocks; 17 17 struct ext4_extent; 18 + struct extent_status; 18 19 19 20 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) 20 21 ··· 1520 1519 ); 1521 1520 1522 1521 DECLARE_EVENT_CLASS(ext4__map_blocks_exit, 1523 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1524 - ext4_fsblk_t pblk, unsigned int len, int ret), 1522 + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), 1525 1523 1526 - TP_ARGS(inode, lblk, pblk, len, ret), 1524 + TP_ARGS(inode, map, ret), 1527 1525 1528 1526 TP_STRUCT__entry( 1529 1527 __field( dev_t, dev ) ··· 1530 1530 __field( ext4_fsblk_t, pblk ) 1531 1531 __field( ext4_lblk_t, lblk ) 1532 1532 __field( unsigned int, len ) 1533 + __field( unsigned int, flags ) 1533 1534 __field( int, ret ) 1534 1535 ), 1535 1536 1536 1537 TP_fast_assign( 1537 1538 __entry->dev = inode->i_sb->s_dev; 1538 1539 __entry->ino = inode->i_ino; 1539 - __entry->pblk = pblk; 1540 - __entry->lblk = lblk; 1541 - __entry->len = len; 1540 + __entry->pblk = map->m_pblk; 1541 + __entry->lblk = map->m_lblk; 1542 + __entry->len = map->m_len; 1543 + __entry->flags = map->m_flags; 1542 1544 __entry->ret = ret; 1543 1545 ), 1544 1546 1545 - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", 1547 + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d", 1546 1548 MAJOR(__entry->dev), MINOR(__entry->dev), 1547 1549 (unsigned long) __entry->ino, 1548 1550 __entry->lblk, __entry->pblk, 1549 - __entry->len, __entry->ret) 1551 + __entry->len, __entry->flags, __entry->ret) 1550 1552 ); 1551 1553 1552 1554 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, 1553 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1554 - ext4_fsblk_t pblk, unsigned len, int ret), 1555 + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), 1555 1556 1556 - TP_ARGS(inode, lblk, pblk, len, ret) 1557 + TP_ARGS(inode, map, ret) 1557 1558 ); 1558 1559 1559 
1560 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, 1560 - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, 1561 - ext4_fsblk_t pblk, unsigned len, int ret), 1561 + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), 1562 1562 1563 - TP_ARGS(inode, lblk, pblk, len, ret) 1563 + TP_ARGS(inode, map, ret) 1564 1564 ); 1565 1565 1566 1566 TRACE_EVENT(ext4_ext_load_extent, ··· 1680 1680 ); 1681 1681 1682 1682 TRACE_EVENT(ext4_ext_handle_uninitialized_extents, 1683 - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, 1683 + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, 1684 1684 unsigned int allocated, ext4_fsblk_t newblock), 1685 1685 1686 - TP_ARGS(inode, map, allocated, newblock), 1686 + TP_ARGS(inode, map, flags, allocated, newblock), 1687 1687 1688 1688 TP_STRUCT__entry( 1689 1689 __field( dev_t, dev ) ··· 1699 1699 TP_fast_assign( 1700 1700 __entry->dev = inode->i_sb->s_dev; 1701 1701 __entry->ino = inode->i_ino; 1702 - __entry->flags = map->m_flags; 1702 + __entry->flags = flags; 1703 1703 __entry->lblk = map->m_lblk; 1704 1704 __entry->pblk = map->m_pblk; 1705 1705 __entry->len = map->m_len; ··· 1707 1707 __entry->newblk = newblock; 1708 1708 ), 1709 1709 1710 - TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d" 1710 + TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x " 1711 1711 "allocated %d newblock %llu", 1712 1712 MAJOR(__entry->dev), MINOR(__entry->dev), 1713 1713 (unsigned long) __entry->ino, ··· 2053 2053 __entry->depth, 2054 2054 (unsigned) __entry->partial, 2055 2055 (unsigned short) __entry->eh_entries) 2056 + ); 2057 + 2058 + TRACE_EVENT(ext4_es_insert_extent, 2059 + TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), 2060 + 2061 + TP_ARGS(inode, start, len), 2062 + 2063 + TP_STRUCT__entry( 2064 + __field( dev_t, dev ) 2065 + __field( ino_t, ino ) 2066 + __field( loff_t, start ) 2067 + __field( loff_t, len ) 2068 + ), 2069 + 2070 + 
TP_fast_assign( 2071 + __entry->dev = inode->i_sb->s_dev; 2072 + __entry->ino = inode->i_ino; 2073 + __entry->start = start; 2074 + __entry->len = len; 2075 + ), 2076 + 2077 + TP_printk("dev %d,%d ino %lu es [%lld/%lld)", 2078 + MAJOR(__entry->dev), MINOR(__entry->dev), 2079 + (unsigned long) __entry->ino, 2080 + __entry->start, __entry->len) 2081 + ); 2082 + 2083 + TRACE_EVENT(ext4_es_remove_extent, 2084 + TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), 2085 + 2086 + TP_ARGS(inode, start, len), 2087 + 2088 + TP_STRUCT__entry( 2089 + __field( dev_t, dev ) 2090 + __field( ino_t, ino ) 2091 + __field( loff_t, start ) 2092 + __field( loff_t, len ) 2093 + ), 2094 + 2095 + TP_fast_assign( 2096 + __entry->dev = inode->i_sb->s_dev; 2097 + __entry->ino = inode->i_ino; 2098 + __entry->start = start; 2099 + __entry->len = len; 2100 + ), 2101 + 2102 + TP_printk("dev %d,%d ino %lu es [%lld/%lld)", 2103 + MAJOR(__entry->dev), MINOR(__entry->dev), 2104 + (unsigned long) __entry->ino, 2105 + __entry->start, __entry->len) 2106 + ); 2107 + 2108 + TRACE_EVENT(ext4_es_find_extent_enter, 2109 + TP_PROTO(struct inode *inode, ext4_lblk_t start), 2110 + 2111 + TP_ARGS(inode, start), 2112 + 2113 + TP_STRUCT__entry( 2114 + __field( dev_t, dev ) 2115 + __field( ino_t, ino ) 2116 + __field( ext4_lblk_t, start ) 2117 + ), 2118 + 2119 + TP_fast_assign( 2120 + __entry->dev = inode->i_sb->s_dev; 2121 + __entry->ino = inode->i_ino; 2122 + __entry->start = start; 2123 + ), 2124 + 2125 + TP_printk("dev %d,%d ino %lu start %u", 2126 + MAJOR(__entry->dev), MINOR(__entry->dev), 2127 + (unsigned long) __entry->ino, __entry->start) 2128 + ); 2129 + 2130 + TRACE_EVENT(ext4_es_find_extent_exit, 2131 + TP_PROTO(struct inode *inode, struct extent_status *es, 2132 + ext4_lblk_t ret), 2133 + 2134 + TP_ARGS(inode, es, ret), 2135 + 2136 + TP_STRUCT__entry( 2137 + __field( dev_t, dev ) 2138 + __field( ino_t, ino ) 2139 + __field( ext4_lblk_t, start ) 2140 + __field( ext4_lblk_t, len ) 2141 + 
__field( ext4_lblk_t, ret ) 2142 + ), 2143 + 2144 + TP_fast_assign( 2145 + __entry->dev = inode->i_sb->s_dev; 2146 + __entry->ino = inode->i_ino; 2147 + __entry->start = es->start; 2148 + __entry->len = es->len; 2149 + __entry->ret = ret; 2150 + ), 2151 + 2152 + TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", 2153 + MAJOR(__entry->dev), MINOR(__entry->dev), 2154 + (unsigned long) __entry->ino, 2155 + __entry->start, __entry->len, __entry->ret) 2056 2156 ); 2057 2157 2058 2158 #endif /* _TRACE_EXT4_H */