Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

+21

Documentation/filesystems/ext4.txt

··· 494 494 session_write_kbytes This file is read-only and shows the number of 495 495 kilobytes of data that have been written to this 496 496 filesystem since it was mounted. 497 + 498 + reserved_clusters This is a RW file and contains the number of reserved 499 + clusters in the file system which will be used 500 + in specific situations to avoid costly 501 + zeroout, unexpected ENOSPC, or possible data 502 + loss. The default is 2% or 4096 clusters, 503 + whichever is smaller; this can be changed, 504 + however it can never exceed the number of clusters 505 + in the file system. If there is not enough space 506 + for the reserved space when mounting the file 507 + system, the mount will _not_ fail. 497 508 .............................................................................. 498 509 499 510 Ioctls ··· 597 586 64 bit integer argument. The kernel allocates 598 587 bitmaps and inode table, the userspace tool thus 599 588 just passes the new number of blocks. 589 + 590 + EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes 591 + (like i_blocks, i_size, i_flags, ...) from 592 + the specified inode with inode 593 + EXT4_BOOT_LOADER_INO (#5). This is typically 594 + used to store a boot loader in a secure part of 595 + the filesystem, where it can't be changed by a 596 + normal user by accident. 597 + The data blocks of the previous boot loader 598 + will be associated with the given inode. 600 599 601 600 .............................................................................. 602 601

+5

fs/buffer.c

··· 2987 2987 /* Take care of bh's that straddle the end of the device */ 2988 2988 guard_bh_eod(rw, bio, bh); 2989 2989 2990 + if (buffer_meta(bh)) 2991 + rw |= REQ_META; 2992 + if (buffer_prio(bh)) 2993 + rw |= REQ_PRIO; 2994 + 2990 2995 bio_get(bio); 2991 2996 submit_bio(rw, bio); 2992 2997

+2 -1

fs/ext4/Kconfig

··· 71 71 Enables run-time debugging support for the ext4 filesystem. 72 72 73 73 If you select Y here, then you will be able to turn on debugging 74 - with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" 74 + with a command such as: 75 + echo 1 > /sys/module/ext4/parameters/mballoc_debug

+41 -12

fs/ext4/balloc.c

··· 30 30 */ 31 31 32 32 /* 33 + * Calculate block group number for a given block number 34 + */ 35 + ext4_group_t ext4_get_group_number(struct super_block *sb, 36 + ext4_fsblk_t block) 37 + { 38 + ext4_group_t group; 39 + 40 + if (test_opt2(sb, STD_GROUP_SIZE)) 41 + group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 42 + block) >> 43 + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); 44 + else 45 + ext4_get_group_no_and_offset(sb, block, &group, NULL); 46 + return group; 47 + } 48 + 49 + /* 33 50 * Calculate the block group number and offset into the block/cluster 34 51 * allocation bitmap, given a block number 35 52 */ ··· 66 49 67 50 } 68 51 69 - static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, 70 - ext4_group_t block_group) 52 + /* 53 + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so 54 + * and 0 otherwise. 55 + */ 56 + static inline int ext4_block_in_group(struct super_block *sb, 57 + ext4_fsblk_t block, 58 + ext4_group_t block_group) 71 59 { 72 60 ext4_group_t actual_group; 73 - ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); 74 - if (actual_group == block_group) 75 - return 1; 76 - return 0; 61 + 62 + actual_group = ext4_get_group_number(sb, block); 63 + return (actual_group == block_group) ? 
1 : 0; 77 64 } 78 65 79 66 /* Return the number of clusters used for file system metadata; this ··· 441 420 trace_ext4_read_block_bitmap_load(sb, block_group); 442 421 bh->b_end_io = ext4_end_bitmap_read; 443 422 get_bh(bh); 444 - submit_bh(READ, bh); 423 + submit_bh(READ | REQ_META | REQ_PRIO, bh); 445 424 return bh; 446 425 verify: 447 426 ext4_validate_block_bitmap(sb, desc, block_group, bh); ··· 499 478 static int ext4_has_free_clusters(struct ext4_sb_info *sbi, 500 479 s64 nclusters, unsigned int flags) 501 480 { 502 - s64 free_clusters, dirty_clusters, root_clusters; 481 + s64 free_clusters, dirty_clusters, rsv, resv_clusters; 503 482 struct percpu_counter *fcc = &sbi->s_freeclusters_counter; 504 483 struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; 505 484 506 485 free_clusters = percpu_counter_read_positive(fcc); 507 486 dirty_clusters = percpu_counter_read_positive(dcc); 487 + resv_clusters = atomic64_read(&sbi->s_resv_clusters); 508 488 509 489 /* 510 490 * r_blocks_count should always be multiple of the cluster ratio so 511 491 * we are safe to do a plane bit shift only. 512 492 */ 513 - root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; 493 + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + 494 + resv_clusters; 514 495 515 - if (free_clusters - (nclusters + root_clusters + dirty_clusters) < 496 + if (free_clusters - (nclusters + rsv + dirty_clusters) < 516 497 EXT4_FREECLUSTERS_WATERMARK) { 517 498 free_clusters = percpu_counter_sum_positive(fcc); 518 499 dirty_clusters = percpu_counter_sum_positive(dcc); ··· 522 499 /* Check whether we have space after accounting for current 523 500 * dirty clusters & root reserved clusters. 524 501 */ 525 - if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) 502 + if (free_clusters >= (rsv + nclusters + dirty_clusters)) 526 503 return 1; 527 504 528 505 /* Hm, nope. Are (enough) root reserved clusters available? 
*/ 529 506 if (uid_eq(sbi->s_resuid, current_fsuid()) || 530 507 (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || 531 508 capable(CAP_SYS_RESOURCE) || 532 - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { 509 + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { 533 510 511 + if (free_clusters >= (nclusters + dirty_clusters + 512 + resv_clusters)) 513 + return 1; 514 + } 515 + /* No free blocks. Let's see if we can dip into reserved pool */ 516 + if (flags & EXT4_MB_USE_RESERVED) { 534 517 if (free_clusters >= (nclusters + dirty_clusters)) 535 518 return 1; 536 519 }

+11 -9

fs/ext4/dir.c

··· 46 46 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 47 47 EXT4_FEATURE_COMPAT_DIR_INDEX) && 48 48 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || 49 - ((inode->i_size >> sb->s_blocksize_bits) == 1))) 49 + ((inode->i_size >> sb->s_blocksize_bits) == 1) || 50 + ext4_has_inline_data(inode))) 50 51 return 1; 51 52 52 53 return 0; ··· 116 115 int ret = 0; 117 116 int dir_has_error = 0; 118 117 119 - if (ext4_has_inline_data(inode)) { 120 - int has_inline_data = 1; 121 - ret = ext4_read_inline_dir(filp, dirent, filldir, 122 - &has_inline_data); 123 - if (has_inline_data) 124 - return ret; 125 - } 126 - 127 118 if (is_dx_dir(inode)) { 128 119 err = ext4_dx_readdir(filp, dirent, filldir); 129 120 if (err != ERR_BAD_DX_DIR) { ··· 129 136 ext4_clear_inode_flag(file_inode(filp), 130 137 EXT4_INODE_INDEX); 131 138 } 139 + 140 + if (ext4_has_inline_data(inode)) { 141 + int has_inline_data = 1; 142 + ret = ext4_read_inline_dir(filp, dirent, filldir, 143 + &has_inline_data); 144 + if (has_inline_data) 145 + return ret; 146 + } 147 + 132 148 stored = 0; 133 149 offset = filp->f_pos & (sb->s_blocksize - 1); 134 150

+70 -31

fs/ext4/ext4.h

··· 121 121 #define EXT4_MB_STREAM_ALLOC 0x0800 122 122 /* Use reserved root blocks if needed */ 123 123 #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 124 + /* Use blocks from reserved pool */ 125 + #define EXT4_MB_USE_RESERVED 0x2000 124 126 125 127 struct ext4_allocation_request { 126 128 /* target inode for block we're allocating */ ··· 198 196 #define EXT4_IO_END_ERROR 0x0002 199 197 #define EXT4_IO_END_DIRECT 0x0004 200 198 201 - struct ext4_io_page { 202 - struct page *p_page; 203 - atomic_t p_count; 204 - }; 205 - 206 - #define MAX_IO_PAGES 128 207 - 208 199 /* 209 200 * For converting uninitialized extents on a work queue. 210 - * 211 - * 'page' is only used from the writepage() path; 'pages' is only used for 212 - * buffered writes; they are used to keep page references until conversion 213 - * takes place. For AIO/DIO, neither field is filled in. 214 201 */ 215 202 typedef struct ext4_io_end { 216 203 struct list_head list; /* per-file finished IO list */ ··· 209 218 ssize_t size; /* size of the extent */ 210 219 struct kiocb *iocb; /* iocb struct for AIO */ 211 220 int result; /* error value for AIO */ 212 - int num_io_pages; /* for writepages() */ 213 - struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ 221 + atomic_t count; /* reference counter */ 214 222 } ext4_io_end_t; 215 223 216 224 struct ext4_io_submit { 217 225 int io_op; 218 226 struct bio *io_bio; 219 227 ext4_io_end_t *io_end; 220 - struct ext4_io_page *io_page; 221 228 sector_t io_next_block; 222 229 }; 223 230 ··· 392 403 #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 393 404 394 405 #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ 395 - #define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ 406 + #define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ 396 407 397 408 /* Flags that should be inherited by new inodes from their parent. 
*/ 398 409 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ ··· 546 557 #define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 547 558 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 548 559 EXT4_GET_BLOCKS_CREATE) 549 - /* Caller is from the delayed allocation writeout path, 550 - so set the magic i_delalloc_reserve_flag after taking the 551 - inode allocation semaphore for */ 560 + /* Caller is from the delayed allocation writeout path 561 + * finally doing the actual allocation of delayed blocks */ 552 562 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 553 563 /* caller is from the direct IO path, request to creation of an 554 564 unitialized extents if not allocated, split the uninitialized ··· 559 571 /* Convert extent to initialized after IO complete */ 560 572 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ 561 573 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) 562 - /* Punch out blocks of an extent */ 563 - #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 574 + /* Eventual metadata allocation (due to growing extent tree) 575 + * should not fail, so try to use reserved blocks for that.*/ 576 + #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 564 577 /* Don't normalize allocation size (used for fallocate) */ 565 578 #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 566 579 /* Request will not result in inode size update (user for fallocate) */ ··· 605 616 #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 606 617 #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 607 618 #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) 619 + #define EXT4_IOC_SWAP_BOOT _IO('f', 17) 608 620 609 621 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) 610 622 /* ··· 939 949 #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ 940 950 941 951 /* 942 - * Mount flags 952 + * Mount flags set via mount options or defaults 943 953 */ 944 954 #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ 945 955 #define 
EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ ··· 971 981 #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 972 982 #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ 973 983 984 + /* 985 + * Mount flags set either automatically (could not be set by mount option) 986 + * based on per file system feature or property or in special cases such as 987 + * distinguishing between explicit mount option definition and default. 988 + */ 974 989 #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly 975 990 specified delalloc */ 991 + #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group 992 + size of blocksize * 8 993 + blocks */ 976 994 977 995 #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 978 996 ~EXT4_MOUNT_##opt ··· 1177 1179 unsigned int s_mount_flags; 1178 1180 unsigned int s_def_mount_opt; 1179 1181 ext4_fsblk_t s_sb_block; 1182 + atomic64_t s_resv_clusters; 1180 1183 kuid_t s_resuid; 1181 1184 kgid_t s_resgid; 1182 1185 unsigned short s_mount_state; ··· 1332 1333 return ino == EXT4_ROOT_INO || 1333 1334 ino == EXT4_USR_QUOTA_INO || 1334 1335 ino == EXT4_GRP_QUOTA_INO || 1336 + ino == EXT4_BOOT_LOADER_INO || 1335 1337 ino == EXT4_JOURNAL_INO || 1336 1338 ino == EXT4_RESIZE_INO || 1337 1339 (ino >= EXT4_FIRST_INO(sb) && ··· 1374 1374 EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read 1375 1375 nolocking */ 1376 1376 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ 1377 + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ 1377 1378 }; 1378 1379 1379 1380 #define EXT4_INODE_BIT_FNS(name, field, offset) \ ··· 1785 1784 */ 1786 1785 #define ERR_BAD_DX_DIR -75000 1787 1786 1788 - void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 1789 - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); 1790 - 1791 1787 /* 1792 1788 * Timeout and state flag for lazy initialization inode thread. 
1793 1789 */ ··· 1906 1908 struct buffer_head *bh); 1907 1909 1908 1910 /* balloc.c */ 1911 + extern void ext4_get_group_no_and_offset(struct super_block *sb, 1912 + ext4_fsblk_t blocknr, 1913 + ext4_group_t *blockgrpp, 1914 + ext4_grpblk_t *offsetp); 1915 + extern ext4_group_t ext4_get_group_number(struct super_block *sb, 1916 + ext4_fsblk_t block); 1917 + 1909 1918 extern void ext4_validate_block_bitmap(struct super_block *sb, 1910 1919 struct ext4_group_desc *desc, 1911 1920 unsigned int block_group, ··· 2113 2108 unsigned long nr_segs); 2114 2109 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2115 2110 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); 2116 - extern void ext4_ind_truncate(struct inode *inode); 2117 - extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length); 2111 + extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2112 + extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 2113 + ext4_lblk_t first, ext4_lblk_t stop); 2118 2114 2119 2115 /* ioctl.c */ 2120 2116 extern long ext4_ioctl(struct file *, unsigned int, unsigned long); ··· 2123 2117 2124 2118 /* migrate.c */ 2125 2119 extern int ext4_ext_migrate(struct inode *); 2120 + extern int ext4_ind_migrate(struct inode *inode); 2126 2121 2127 2122 /* namei.c */ 2128 2123 extern int ext4_dirent_csum_verify(struct inode *inode, ··· 2518 2511 extern int ext4_read_inline_dir(struct file *filp, 2519 2512 void *dirent, filldir_t filldir, 2520 2513 int *has_inline_data); 2514 + extern int htree_inlinedir_to_tree(struct file *dir_file, 2515 + struct inode *dir, ext4_lblk_t block, 2516 + struct dx_hash_info *hinfo, 2517 + __u32 start_hash, __u32 start_minor_hash, 2518 + int *has_inline_data); 2521 2519 extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, 2522 2520 const struct qstr *d_name, 2523 2521 struct ext4_dir_entry_2 **res_dir, ··· 2559 2547 extern int 
ext4_handle_dirty_dirent_node(handle_t *handle, 2560 2548 struct inode *inode, 2561 2549 struct buffer_head *bh); 2550 + #define S_SHIFT 12 2551 + static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { 2552 + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, 2553 + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, 2554 + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, 2555 + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, 2556 + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, 2557 + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, 2558 + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, 2559 + }; 2560 + 2561 + static inline void ext4_set_de_type(struct super_block *sb, 2562 + struct ext4_dir_entry_2 *de, 2563 + umode_t mode) { 2564 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) 2565 + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 2566 + } 2567 + 2562 2568 2563 2569 /* symlink.c */ 2564 2570 extern const struct inode_operations ext4_symlink_inode_operations; ··· 2603 2573 int chunk); 2604 2574 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 2605 2575 struct ext4_map_blocks *map, int flags); 2606 - extern void ext4_ext_truncate(struct inode *); 2607 - extern int ext4_ext_punch_hole(struct file *file, loff_t offset, 2608 - loff_t length); 2576 + extern void ext4_ext_truncate(handle_t *, struct inode *); 2577 + extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2578 + ext4_lblk_t end); 2609 2579 extern void ext4_ext_init(struct super_block *); 2610 2580 extern void ext4_ext_release(struct super_block *); 2611 2581 extern long ext4_fallocate(struct file *file, int mode, loff_t offset, ··· 2639 2609 2640 2610 2641 2611 /* move_extent.c */ 2612 + extern void ext4_double_down_write_data_sem(struct inode *first, 2613 + struct inode *second); 2614 + extern void ext4_double_up_write_data_sem(struct inode *orig_inode, 2615 + struct inode *donor_inode); 2616 + void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2); 2617 + void 
ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2); 2642 2618 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2643 2619 __u64 start_orig, __u64 start_donor, 2644 2620 __u64 len, __u64 *moved_len); 2645 2621 2646 2622 /* page-io.c */ 2647 2623 extern int __init ext4_init_pageio(void); 2648 - extern void ext4_add_complete_io(ext4_io_end_t *io_end); 2649 2624 extern void ext4_exit_pageio(void); 2650 2625 extern void ext4_ioend_shutdown(struct inode *); 2651 - extern void ext4_free_io_end(ext4_io_end_t *io); 2652 2626 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2627 + extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); 2628 + extern int ext4_put_io_end(ext4_io_end_t *io_end); 2629 + extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); 2630 + extern void ext4_io_submit_init(struct ext4_io_submit *io, 2631 + struct writeback_control *wbc); 2653 2632 extern void ext4_end_io_work(struct work_struct *work); 2654 2633 extern void ext4_io_submit(struct ext4_io_submit *io); 2655 2634 extern int ext4_bio_write_page(struct ext4_io_submit *io,

+5

fs/ext4/ext4_extents.h

··· 270 270 0xffff); 271 271 } 272 272 273 + #define ext4_ext_dirty(handle, inode, path) \ 274 + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) 275 + int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, 276 + struct inode *inode, struct ext4_ext_path *path); 277 + 273 278 #endif /* _EXT4_EXTENTS */ 274 279

+8

fs/ext4/ext4_jbd2.c

··· 43 43 { 44 44 journal_t *journal; 45 45 46 + might_sleep(); 47 + 46 48 trace_ext4_journal_start(sb, nblocks, _RET_IP_); 47 49 if (sb->s_flags & MS_RDONLY) 48 50 return ERR_PTR(-EROFS); ··· 114 112 handle_t *handle, struct buffer_head *bh) 115 113 { 116 114 int err = 0; 115 + 116 + might_sleep(); 117 117 118 118 if (ext4_handle_valid(handle)) { 119 119 err = jbd2_journal_get_write_access(handle, bh); ··· 213 209 { 214 210 int err = 0; 215 211 212 + might_sleep(); 213 + 214 + set_buffer_meta(bh); 215 + set_buffer_prio(bh); 216 216 if (ext4_handle_valid(handle)) { 217 217 err = jbd2_journal_dirty_metadata(handle, bh); 218 218 if (err) {

+9 -3

fs/ext4/ext4_jbd2.h

··· 29 29 * block to complete the transaction. 30 30 * 31 31 * For extents-enabled fs we may have to allocate and modify up to 32 - * 5 levels of tree + root which are stored in the inode. */ 32 + * 5 levels of tree, data block (for each of these we need bitmap + group 33 + * summaries), root which is stored in the inode, sb 34 + */ 33 35 34 36 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ 35 37 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ 36 - ? 27U : 8U) 38 + ? 20U : 8U) 37 39 38 40 /* Extended attribute operations touch at most two data buffers, 39 41 * two bitmap buffers, and two group summaries, in addition to the inode ··· 196 194 * ext4_journal_callback_del: delete a registered callback 197 195 * @handle: active journal transaction handle on which callback was registered 198 196 * @jce: registered journal callback entry to unregister 197 + * Return true if object was successfully removed 199 198 */ 200 - static inline void ext4_journal_callback_del(handle_t *handle, 199 + static inline bool ext4_journal_callback_try_del(handle_t *handle, 201 200 struct ext4_journal_cb_entry *jce) 202 201 { 202 + bool deleted; 203 203 struct ext4_sb_info *sbi = 204 204 EXT4_SB(handle->h_transaction->t_journal->j_private); 205 205 206 206 spin_lock(&sbi->s_md_lock); 207 + deleted = !list_empty(&jce->jce_list); 207 208 list_del_init(&jce->jce_list); 208 209 spin_unlock(&sbi->s_md_lock); 210 + return deleted; 209 211 } 210 212 211 213 int

+199 -327

fs/ext4/extents.c

··· 157 157 * - ENOMEM 158 158 * - EIO 159 159 */ 160 - #define ext4_ext_dirty(handle, inode, path) \ 161 - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) 162 - static int __ext4_ext_dirty(const char *where, unsigned int line, 163 - handle_t *handle, struct inode *inode, 164 - struct ext4_ext_path *path) 160 + int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, 161 + struct inode *inode, struct ext4_ext_path *path) 165 162 { 166 163 int err; 167 164 if (path->p_bh) { ··· 1810 1813 } 1811 1814 depth = ext_depth(inode); 1812 1815 ex = path[depth].p_ext; 1816 + eh = path[depth].p_hdr; 1813 1817 if (unlikely(path[depth].p_hdr == NULL)) { 1814 1818 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 1815 1819 return -EIO; 1816 1820 } 1817 1821 1818 1822 /* try to insert block into found extent and return */ 1819 - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1820 - && ext4_can_extents_be_merged(inode, ex, newext)) { 1821 - ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", 1822 - ext4_ext_is_uninitialized(newext), 1823 - ext4_ext_get_actual_len(newext), 1824 - le32_to_cpu(ex->ee_block), 1825 - ext4_ext_is_uninitialized(ex), 1826 - ext4_ext_get_actual_len(ex), 1827 - ext4_ext_pblock(ex)); 1828 - err = ext4_ext_get_access(handle, inode, path + depth); 1829 - if (err) 1830 - return err; 1823 + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { 1831 1824 1832 1825 /* 1833 - * ext4_can_extents_be_merged should have checked that either 1834 - * both extents are uninitialized, or both aren't. Thus we 1835 - * need to check only one of them here. 1826 + * Try to see whether we should rather test the extent on 1827 + * right from ex, or from the left of ex. This is because 1828 + * ext4_ext_find_extent() can return either extent on the 1829 + * left, or on the right from the searched position. This 1830 + * will make merging more effective. 
1836 1831 */ 1837 - if (ext4_ext_is_uninitialized(ex)) 1838 - uninitialized = 1; 1839 - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1832 + if (ex < EXT_LAST_EXTENT(eh) && 1833 + (le32_to_cpu(ex->ee_block) + 1834 + ext4_ext_get_actual_len(ex) < 1835 + le32_to_cpu(newext->ee_block))) { 1836 + ex += 1; 1837 + goto prepend; 1838 + } else if ((ex > EXT_FIRST_EXTENT(eh)) && 1839 + (le32_to_cpu(newext->ee_block) + 1840 + ext4_ext_get_actual_len(newext) < 1841 + le32_to_cpu(ex->ee_block))) 1842 + ex -= 1; 1843 + 1844 + /* Try to append newex to the ex */ 1845 + if (ext4_can_extents_be_merged(inode, ex, newext)) { 1846 + ext_debug("append [%d]%d block to %u:[%d]%d" 1847 + "(from %llu)\n", 1848 + ext4_ext_is_uninitialized(newext), 1849 + ext4_ext_get_actual_len(newext), 1850 + le32_to_cpu(ex->ee_block), 1851 + ext4_ext_is_uninitialized(ex), 1852 + ext4_ext_get_actual_len(ex), 1853 + ext4_ext_pblock(ex)); 1854 + err = ext4_ext_get_access(handle, inode, 1855 + path + depth); 1856 + if (err) 1857 + return err; 1858 + 1859 + /* 1860 + * ext4_can_extents_be_merged should have checked 1861 + * that either both extents are uninitialized, or 1862 + * both aren't. Thus we need to check only one of 1863 + * them here. 
1864 + */ 1865 + if (ext4_ext_is_uninitialized(ex)) 1866 + uninitialized = 1; 1867 + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1840 1868 + ext4_ext_get_actual_len(newext)); 1841 - if (uninitialized) 1842 - ext4_ext_mark_uninitialized(ex); 1843 - eh = path[depth].p_hdr; 1844 - nearex = ex; 1845 - goto merge; 1869 + if (uninitialized) 1870 + ext4_ext_mark_uninitialized(ex); 1871 + eh = path[depth].p_hdr; 1872 + nearex = ex; 1873 + goto merge; 1874 + } 1875 + 1876 + prepend: 1877 + /* Try to prepend newex to the ex */ 1878 + if (ext4_can_extents_be_merged(inode, newext, ex)) { 1879 + ext_debug("prepend %u[%d]%d block to %u:[%d]%d" 1880 + "(from %llu)\n", 1881 + le32_to_cpu(newext->ee_block), 1882 + ext4_ext_is_uninitialized(newext), 1883 + ext4_ext_get_actual_len(newext), 1884 + le32_to_cpu(ex->ee_block), 1885 + ext4_ext_is_uninitialized(ex), 1886 + ext4_ext_get_actual_len(ex), 1887 + ext4_ext_pblock(ex)); 1888 + err = ext4_ext_get_access(handle, inode, 1889 + path + depth); 1890 + if (err) 1891 + return err; 1892 + 1893 + /* 1894 + * ext4_can_extents_be_merged should have checked 1895 + * that either both extents are uninitialized, or 1896 + * both aren't. Thus we need to check only one of 1897 + * them here. 1898 + */ 1899 + if (ext4_ext_is_uninitialized(ex)) 1900 + uninitialized = 1; 1901 + ex->ee_block = newext->ee_block; 1902 + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 1903 + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1904 + + ext4_ext_get_actual_len(newext)); 1905 + if (uninitialized) 1906 + ext4_ext_mark_uninitialized(ex); 1907 + eh = path[depth].p_hdr; 1908 + nearex = ex; 1909 + goto merge; 1910 + } 1846 1911 } 1847 1912 1848 1913 depth = ext_depth(inode); ··· 1939 1880 * There is no free space in the found leaf. 1940 1881 * We're gonna add a new leaf in the tree. 
1941 1882 */ 1942 - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) 1943 - flags = EXT4_MB_USE_ROOT_BLOCKS; 1883 + if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) 1884 + flags = EXT4_MB_USE_RESERVED; 1944 1885 err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); 1945 1886 if (err) 1946 1887 goto cleanup; ··· 2658 2599 return 1; 2659 2600 } 2660 2601 2661 - static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2662 - ext4_lblk_t end) 2602 + int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2603 + ext4_lblk_t end) 2663 2604 { 2664 2605 struct super_block *sb = inode->i_sb; 2665 2606 int depth = ext_depth(inode); ··· 2726 2667 2727 2668 /* 2728 2669 * Split the extent in two so that 'end' is the last 2729 - * block in the first new extent 2670 + * block in the first new extent. Also we should not 2671 + * fail removing space due to ENOSPC so try to use 2672 + * reserved block if that happens. 2730 2673 */ 2731 2674 err = ext4_split_extent_at(handle, inode, path, 2732 - end + 1, split_flag, 2733 - EXT4_GET_BLOCKS_PRE_IO | 2734 - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); 2675 + end + 1, split_flag, 2676 + EXT4_GET_BLOCKS_PRE_IO | 2677 + EXT4_GET_BLOCKS_METADATA_NOFAIL); 2735 2678 2736 2679 if (err < 0) 2737 2680 goto out; ··· 3208 3147 static int ext4_ext_convert_to_initialized(handle_t *handle, 3209 3148 struct inode *inode, 3210 3149 struct ext4_map_blocks *map, 3211 - struct ext4_ext_path *path) 3150 + struct ext4_ext_path *path, 3151 + int flags) 3212 3152 { 3213 3153 struct ext4_sb_info *sbi; 3214 3154 struct ext4_extent_header *eh; 3215 3155 struct ext4_map_blocks split_map; 3216 3156 struct ext4_extent zero_ex; 3217 - struct ext4_extent *ex; 3157 + struct ext4_extent *ex, *abut_ex; 3218 3158 ext4_lblk_t ee_block, eof_block; 3219 - unsigned int ee_len, depth; 3220 - int allocated, max_zeroout = 0; 3159 + unsigned int ee_len, depth, map_len = map->m_len; 3160 + int allocated = 0, max_zeroout = 0; 3221 3161 int err = 0; 3222 3162 int 
split_flag = 0; 3223 3163 3224 3164 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 3225 3165 "block %llu, max_blocks %u\n", inode->i_ino, 3226 - (unsigned long long)map->m_lblk, map->m_len); 3166 + (unsigned long long)map->m_lblk, map_len); 3227 3167 3228 3168 sbi = EXT4_SB(inode->i_sb); 3229 3169 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3230 3170 inode->i_sb->s_blocksize_bits; 3231 - if (eof_block < map->m_lblk + map->m_len) 3232 - eof_block = map->m_lblk + map->m_len; 3171 + if (eof_block < map->m_lblk + map_len) 3172 + eof_block = map->m_lblk + map_len; 3233 3173 3234 3174 depth = ext_depth(inode); 3235 3175 eh = path[depth].p_hdr; 3236 3176 ex = path[depth].p_ext; 3237 3177 ee_block = le32_to_cpu(ex->ee_block); 3238 3178 ee_len = ext4_ext_get_actual_len(ex); 3239 - allocated = ee_len - (map->m_lblk - ee_block); 3240 3179 zero_ex.ee_len = 0; 3241 3180 3242 3181 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); ··· 3247 3186 3248 3187 /* 3249 3188 * Attempt to transfer newly initialized blocks from the currently 3250 - * uninitialized extent to its left neighbor. This is much cheaper 3189 + * uninitialized extent to its neighbor. This is much cheaper 3251 3190 * than an insertion followed by a merge as those involve costly 3252 - * memmove() calls. This is the common case in steady state for 3253 - * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append 3254 - * writes. 3191 + * memmove() calls. Transferring to the left is the common case in 3192 + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) 3193 + * followed by append writes. 3255 3194 * 3256 3195 * Limitations of the current logic: 3257 - * - L1: we only deal with writes at the start of the extent. 3258 - * The approach could be extended to writes at the end 3259 - * of the extent but this scenario was deemed less common. 3260 - * - L2: we do not deal with writes covering the whole extent. 
3196 + * - L1: we do not deal with writes covering the whole extent. 3261 3197 * This would require removing the extent if the transfer 3262 3198 * is possible. 3263 - * - L3: we only attempt to merge with an extent stored in the 3199 + * - L2: we only attempt to merge with an extent stored in the 3264 3200 * same extent tree node. 3265 3201 */ 3266 - if ((map->m_lblk == ee_block) && /*L1*/ 3267 - (map->m_len < ee_len) && /*L2*/ 3268 - (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ 3269 - struct ext4_extent *prev_ex; 3202 + if ((map->m_lblk == ee_block) && 3203 + /* See if we can merge left */ 3204 + (map_len < ee_len) && /*L1*/ 3205 + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ 3270 3206 ext4_lblk_t prev_lblk; 3271 3207 ext4_fsblk_t prev_pblk, ee_pblk; 3272 - unsigned int prev_len, write_len; 3208 + unsigned int prev_len; 3273 3209 3274 - prev_ex = ex - 1; 3275 - prev_lblk = le32_to_cpu(prev_ex->ee_block); 3276 - prev_len = ext4_ext_get_actual_len(prev_ex); 3277 - prev_pblk = ext4_ext_pblock(prev_ex); 3210 + abut_ex = ex - 1; 3211 + prev_lblk = le32_to_cpu(abut_ex->ee_block); 3212 + prev_len = ext4_ext_get_actual_len(abut_ex); 3213 + prev_pblk = ext4_ext_pblock(abut_ex); 3278 3214 ee_pblk = ext4_ext_pblock(ex); 3279 - write_len = map->m_len; 3280 3215 3281 3216 /* 3282 - * A transfer of blocks from 'ex' to 'prev_ex' is allowed 3217 + * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3283 3218 * upon those conditions: 3284 - * - C1: prev_ex is initialized, 3285 - * - C2: prev_ex is logically abutting ex, 3286 - * - C3: prev_ex is physically abutting ex, 3287 - * - C4: prev_ex can receive the additional blocks without 3219 + * - C1: abut_ex is initialized, 3220 + * - C2: abut_ex is logically abutting ex, 3221 + * - C3: abut_ex is physically abutting ex, 3222 + * - C4: abut_ex can receive the additional blocks without 3288 3223 * overflowing the (initialized) length limit. 
3289 3224 */ 3290 - if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ 3225 + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ 3291 3226 ((prev_lblk + prev_len) == ee_block) && /*C2*/ 3292 3227 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ 3293 - (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ 3228 + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3294 3229 err = ext4_ext_get_access(handle, inode, path + depth); 3295 3230 if (err) 3296 3231 goto out; 3297 3232 3298 3233 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3299 - map, ex, prev_ex); 3234 + map, ex, abut_ex); 3300 3235 3301 - /* Shift the start of ex by 'write_len' blocks */ 3302 - ex->ee_block = cpu_to_le32(ee_block + write_len); 3303 - ext4_ext_store_pblock(ex, ee_pblk + write_len); 3304 - ex->ee_len = cpu_to_le16(ee_len - write_len); 3236 + /* Shift the start of ex by 'map_len' blocks */ 3237 + ex->ee_block = cpu_to_le32(ee_block + map_len); 3238 + ext4_ext_store_pblock(ex, ee_pblk + map_len); 3239 + ex->ee_len = cpu_to_le16(ee_len - map_len); 3305 3240 ext4_ext_mark_uninitialized(ex); /* Restore the flag */ 3306 3241 3307 - /* Extend prev_ex by 'write_len' blocks */ 3308 - prev_ex->ee_len = cpu_to_le16(prev_len + write_len); 3309 - 3310 - /* Mark the block containing both extents as dirty */ 3311 - ext4_ext_dirty(handle, inode, path + depth); 3312 - 3313 - /* Update path to point to the right extent */ 3314 - path[depth].p_ext = prev_ex; 3242 + /* Extend abut_ex by 'map_len' blocks */ 3243 + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); 3315 3244 3316 3245 /* Result: number of initialized blocks past m_lblk */ 3317 - allocated = write_len; 3318 - goto out; 3246 + allocated = map_len; 3247 + } 3248 + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && 3249 + (map_len < ee_len) && /*L1*/ 3250 + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ 3251 + /* See if we can merge right */ 3252 + ext4_lblk_t next_lblk; 3253 + ext4_fsblk_t next_pblk, ee_pblk; 3254 + unsigned int next_len; 
3255 + 3256 + abut_ex = ex + 1; 3257 + next_lblk = le32_to_cpu(abut_ex->ee_block); 3258 + next_len = ext4_ext_get_actual_len(abut_ex); 3259 + next_pblk = ext4_ext_pblock(abut_ex); 3260 + ee_pblk = ext4_ext_pblock(ex); 3261 + 3262 + /* 3263 + * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3264 + * upon those conditions: 3265 + * - C1: abut_ex is initialized, 3266 + * - C2: abut_ex is logically abutting ex, 3267 + * - C3: abut_ex is physically abutting ex, 3268 + * - C4: abut_ex can receive the additional blocks without 3269 + * overflowing the (initialized) length limit. 3270 + */ 3271 + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ 3272 + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ 3273 + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ 3274 + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3275 + err = ext4_ext_get_access(handle, inode, path + depth); 3276 + if (err) 3277 + goto out; 3278 + 3279 + trace_ext4_ext_convert_to_initialized_fastpath(inode, 3280 + map, ex, abut_ex); 3281 + 3282 + /* Shift the start of abut_ex by 'map_len' blocks */ 3283 + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); 3284 + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); 3285 + ex->ee_len = cpu_to_le16(ee_len - map_len); 3286 + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ 3287 + 3288 + /* Extend abut_ex by 'map_len' blocks */ 3289 + abut_ex->ee_len = cpu_to_le16(next_len + map_len); 3290 + 3291 + /* Result: number of initialized blocks past m_lblk */ 3292 + allocated = map_len; 3319 3293 } 3320 3294 } 3295 + if (allocated) { 3296 + /* Mark the block containing both extents as dirty */ 3297 + ext4_ext_dirty(handle, inode, path + depth); 3298 + 3299 + /* Update path to point to the right extent */ 3300 + path[depth].p_ext = abut_ex; 3301 + goto out; 3302 + } else 3303 + allocated = ee_len - (map->m_lblk - ee_block); 3321 3304 3322 3305 WARN_ON(map->m_lblk < ee_block); 3323 3306 /* ··· 3435 3330 } 3436 3331 3437 3332 allocated = 
ext4_split_extent(handle, inode, path, 3438 - &split_map, split_flag, 0); 3333 + &split_map, split_flag, flags); 3439 3334 if (allocated < 0) 3440 3335 err = allocated; 3441 3336 ··· 3755 3650 flags, allocated); 3756 3651 ext4_ext_show_leaf(inode, path); 3757 3652 3653 + /* 3654 + * When writing into uninitialized space, we should not fail to 3655 + * allocate metadata blocks for the new extent block if needed. 3656 + */ 3657 + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; 3658 + 3758 3659 trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, 3759 3660 allocated, newblock); 3760 3661 ··· 3824 3713 } 3825 3714 3826 3715 /* buffered write, writepage time, convert*/ 3827 - ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3716 + ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); 3828 3717 if (ret >= 0) 3829 3718 ext4_update_inode_fsync_trans(handle, inode, 1); 3830 3719 out: ··· 4368 4257 return err ? err : allocated; 4369 4258 } 4370 4259 4371 - void ext4_ext_truncate(struct inode *inode) 4260 + void ext4_ext_truncate(handle_t *handle, struct inode *inode) 4372 4261 { 4373 - struct address_space *mapping = inode->i_mapping; 4374 4262 struct super_block *sb = inode->i_sb; 4375 4263 ext4_lblk_t last_block; 4376 - handle_t *handle; 4377 - loff_t page_len; 4378 4264 int err = 0; 4379 - 4380 - /* 4381 - * finish any pending end_io work so we won't run the risk of 4382 - * converting any truncated blocks to initialized later 4383 - */ 4384 - ext4_flush_unwritten_io(inode); 4385 - 4386 - /* 4387 - * probably first extent we're gonna free will be last in block 4388 - */ 4389 - err = ext4_writepage_trans_blocks(inode); 4390 - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); 4391 - if (IS_ERR(handle)) 4392 - return; 4393 - 4394 - if (inode->i_size % PAGE_CACHE_SIZE != 0) { 4395 - page_len = PAGE_CACHE_SIZE - 4396 - (inode->i_size & (PAGE_CACHE_SIZE - 1)); 4397 - 4398 - err = ext4_discard_partial_page_buffers(handle, 4399 - 
mapping, inode->i_size, page_len, 0); 4400 - 4401 - if (err) 4402 - goto out_stop; 4403 - } 4404 - 4405 - if (ext4_orphan_add(handle, inode)) 4406 - goto out_stop; 4407 - 4408 - down_write(&EXT4_I(inode)->i_data_sem); 4409 - 4410 - ext4_discard_preallocations(inode); 4411 4265 4412 4266 /* 4413 4267 * TODO: optimization is possible here. ··· 4389 4313 err = ext4_es_remove_extent(inode, last_block, 4390 4314 EXT_MAX_BLOCKS - last_block); 4391 4315 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4392 - 4393 - /* In a multi-transaction truncate, we only make the final 4394 - * transaction synchronous. 4395 - */ 4396 - if (IS_SYNC(inode)) 4397 - ext4_handle_sync(handle); 4398 - 4399 - up_write(&EXT4_I(inode)->i_data_sem); 4400 - 4401 - out_stop: 4402 - /* 4403 - * If this was a simple ftruncate() and the file will remain alive, 4404 - * then we need to clear up the orphan record which we created above. 4405 - * However, if this was a real unlink then we were called by 4406 - * ext4_delete_inode(), and we allow that function to clean up the 4407 - * orphan info for us. 4408 - */ 4409 - if (inode->i_nlink) 4410 - ext4_orphan_del(handle, inode); 4411 - 4412 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4413 - ext4_mark_inode_dirty(handle, inode); 4414 - ext4_journal_stop(handle); 4415 4316 } 4416 4317 4417 4318 static void ext4_falloc_update_inode(struct inode *inode, ··· 4674 4621 error = fiemap_fill_next_extent(fieinfo, 0, physical, 4675 4622 length, flags); 4676 4623 return (error < 0 ? 
error : 0); 4677 - } 4678 - 4679 - /* 4680 - * ext4_ext_punch_hole 4681 - * 4682 - * Punches a hole of "length" bytes in a file starting 4683 - * at byte "offset" 4684 - * 4685 - * @inode: The inode of the file to punch a hole in 4686 - * @offset: The starting byte offset of the hole 4687 - * @length: The length of the hole 4688 - * 4689 - * Returns the number of blocks removed or negative on err 4690 - */ 4691 - int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) 4692 - { 4693 - struct inode *inode = file_inode(file); 4694 - struct super_block *sb = inode->i_sb; 4695 - ext4_lblk_t first_block, stop_block; 4696 - struct address_space *mapping = inode->i_mapping; 4697 - handle_t *handle; 4698 - loff_t first_page, last_page, page_len; 4699 - loff_t first_page_offset, last_page_offset; 4700 - int credits, err = 0; 4701 - 4702 - /* 4703 - * Write out all dirty pages to avoid race conditions 4704 - * Then release them. 4705 - */ 4706 - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4707 - err = filemap_write_and_wait_range(mapping, 4708 - offset, offset + length - 1); 4709 - 4710 - if (err) 4711 - return err; 4712 - } 4713 - 4714 - mutex_lock(&inode->i_mutex); 4715 - /* It's not possible punch hole on append only file */ 4716 - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { 4717 - err = -EPERM; 4718 - goto out_mutex; 4719 - } 4720 - if (IS_SWAPFILE(inode)) { 4721 - err = -ETXTBSY; 4722 - goto out_mutex; 4723 - } 4724 - 4725 - /* No need to punch hole beyond i_size */ 4726 - if (offset >= inode->i_size) 4727 - goto out_mutex; 4728 - 4729 - /* 4730 - * If the hole extends beyond i_size, set the hole 4731 - * to end after the page that contains i_size 4732 - */ 4733 - if (offset + length > inode->i_size) { 4734 - length = inode->i_size + 4735 - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - 4736 - offset; 4737 - } 4738 - 4739 - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 4740 - last_page = 
(offset + length) >> PAGE_CACHE_SHIFT; 4741 - 4742 - first_page_offset = first_page << PAGE_CACHE_SHIFT; 4743 - last_page_offset = last_page << PAGE_CACHE_SHIFT; 4744 - 4745 - /* Now release the pages */ 4746 - if (last_page_offset > first_page_offset) { 4747 - truncate_pagecache_range(inode, first_page_offset, 4748 - last_page_offset - 1); 4749 - } 4750 - 4751 - /* Wait all existing dio workers, newcomers will block on i_mutex */ 4752 - ext4_inode_block_unlocked_dio(inode); 4753 - err = ext4_flush_unwritten_io(inode); 4754 - if (err) 4755 - goto out_dio; 4756 - inode_dio_wait(inode); 4757 - 4758 - credits = ext4_writepage_trans_blocks(inode); 4759 - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 4760 - if (IS_ERR(handle)) { 4761 - err = PTR_ERR(handle); 4762 - goto out_dio; 4763 - } 4764 - 4765 - 4766 - /* 4767 - * Now we need to zero out the non-page-aligned data in the 4768 - * pages at the start and tail of the hole, and unmap the buffer 4769 - * heads for the block aligned regions of the page that were 4770 - * completely zeroed. 
4771 - */ 4772 - if (first_page > last_page) { 4773 - /* 4774 - * If the file space being truncated is contained within a page 4775 - * just zero out and unmap the middle of that page 4776 - */ 4777 - err = ext4_discard_partial_page_buffers(handle, 4778 - mapping, offset, length, 0); 4779 - 4780 - if (err) 4781 - goto out; 4782 - } else { 4783 - /* 4784 - * zero out and unmap the partial page that contains 4785 - * the start of the hole 4786 - */ 4787 - page_len = first_page_offset - offset; 4788 - if (page_len > 0) { 4789 - err = ext4_discard_partial_page_buffers(handle, mapping, 4790 - offset, page_len, 0); 4791 - if (err) 4792 - goto out; 4793 - } 4794 - 4795 - /* 4796 - * zero out and unmap the partial page that contains 4797 - * the end of the hole 4798 - */ 4799 - page_len = offset + length - last_page_offset; 4800 - if (page_len > 0) { 4801 - err = ext4_discard_partial_page_buffers(handle, mapping, 4802 - last_page_offset, page_len, 0); 4803 - if (err) 4804 - goto out; 4805 - } 4806 - } 4807 - 4808 - /* 4809 - * If i_size is contained in the last page, we need to 4810 - * unmap and zero the partial page after i_size 4811 - */ 4812 - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && 4813 - inode->i_size % PAGE_CACHE_SIZE != 0) { 4814 - 4815 - page_len = PAGE_CACHE_SIZE - 4816 - (inode->i_size & (PAGE_CACHE_SIZE - 1)); 4817 - 4818 - if (page_len > 0) { 4819 - err = ext4_discard_partial_page_buffers(handle, 4820 - mapping, inode->i_size, page_len, 0); 4821 - 4822 - if (err) 4823 - goto out; 4824 - } 4825 - } 4826 - 4827 - first_block = (offset + sb->s_blocksize - 1) >> 4828 - EXT4_BLOCK_SIZE_BITS(sb); 4829 - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 4830 - 4831 - /* If there are no blocks to remove, return now */ 4832 - if (first_block >= stop_block) 4833 - goto out; 4834 - 4835 - down_write(&EXT4_I(inode)->i_data_sem); 4836 - ext4_discard_preallocations(inode); 4837 - 4838 - err = ext4_es_remove_extent(inode, first_block, 4839 - 
stop_block - first_block); 4840 - err = ext4_ext_remove_space(inode, first_block, stop_block - 1); 4841 - 4842 - ext4_discard_preallocations(inode); 4843 - 4844 - if (IS_SYNC(inode)) 4845 - ext4_handle_sync(handle); 4846 - 4847 - up_write(&EXT4_I(inode)->i_data_sem); 4848 - 4849 - out: 4850 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4851 - ext4_mark_inode_dirty(handle, inode); 4852 - ext4_journal_stop(handle); 4853 - out_dio: 4854 - ext4_inode_resume_unlocked_dio(inode); 4855 - out_mutex: 4856 - mutex_unlock(&inode->i_mutex); 4857 - return err; 4858 4624 } 4859 4625 4860 4626 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,

+1 -2

fs/ext4/fsync.c

··· 166 166 if (journal->j_flags & JBD2_BARRIER && 167 167 !jbd2_trans_will_send_data_barrier(journal, commit_tid)) 168 168 needs_barrier = true; 169 - jbd2_log_start_commit(journal, commit_tid); 170 - ret = jbd2_log_wait_commit(journal, commit_tid); 169 + ret = jbd2_complete_transaction(journal, commit_tid); 171 170 if (needs_barrier) { 172 171 err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 173 172 if (!ret)

+50 -38

fs/ext4/ialloc.c

··· 166 166 trace_ext4_load_inode_bitmap(sb, block_group); 167 167 bh->b_end_io = ext4_end_bitmap_read; 168 168 get_bh(bh); 169 - submit_bh(READ, bh); 169 + submit_bh(READ | REQ_META | REQ_PRIO, bh); 170 170 wait_on_buffer(bh); 171 171 if (!buffer_uptodate(bh)) { 172 172 put_bh(bh); ··· 666 666 ei = EXT4_I(inode); 667 667 sbi = EXT4_SB(sb); 668 668 669 + /* 670 + * Initalize owners and quota early so that we don't have to account 671 + * for quota initialization worst case in standard inode creating 672 + * transaction 673 + */ 674 + if (owner) { 675 + inode->i_mode = mode; 676 + i_uid_write(inode, owner[0]); 677 + i_gid_write(inode, owner[1]); 678 + } else if (test_opt(sb, GRPID)) { 679 + inode->i_mode = mode; 680 + inode->i_uid = current_fsuid(); 681 + inode->i_gid = dir->i_gid; 682 + } else 683 + inode_init_owner(inode, dir, mode); 684 + dquot_initialize(inode); 685 + 669 686 if (!goal) 670 687 goal = sbi->s_inode_goal; 671 688 ··· 714 697 715 698 gdp = ext4_get_group_desc(sb, group, &group_desc_bh); 716 699 if (!gdp) 717 - goto fail; 700 + goto out; 718 701 719 702 /* 720 703 * Check free inodes count before loading bitmap. 
··· 728 711 brelse(inode_bitmap_bh); 729 712 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); 730 713 if (!inode_bitmap_bh) 731 - goto fail; 714 + goto out; 732 715 733 716 repeat_in_this_group: 734 717 ino = ext4_find_next_zero_bit((unsigned long *) ··· 750 733 handle_type, nblocks); 751 734 if (IS_ERR(handle)) { 752 735 err = PTR_ERR(handle); 753 - goto fail; 736 + ext4_std_error(sb, err); 737 + goto out; 754 738 } 755 739 } 756 740 BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); 757 741 err = ext4_journal_get_write_access(handle, inode_bitmap_bh); 758 - if (err) 759 - goto fail; 742 + if (err) { 743 + ext4_std_error(sb, err); 744 + goto out; 745 + } 760 746 ext4_lock_group(sb, group); 761 747 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); 762 748 ext4_unlock_group(sb, group); ··· 775 755 got: 776 756 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); 777 757 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); 778 - if (err) 779 - goto fail; 758 + if (err) { 759 + ext4_std_error(sb, err); 760 + goto out; 761 + } 780 762 781 763 /* We may have to initialize the block bitmap if it isn't already */ 782 764 if (ext4_has_group_desc_csum(sb) && ··· 790 768 err = ext4_journal_get_write_access(handle, block_bitmap_bh); 791 769 if (err) { 792 770 brelse(block_bitmap_bh); 793 - goto fail; 771 + ext4_std_error(sb, err); 772 + goto out; 794 773 } 795 774 796 775 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); ··· 810 787 ext4_unlock_group(sb, group); 811 788 brelse(block_bitmap_bh); 812 789 813 - if (err) 814 - goto fail; 790 + if (err) { 791 + ext4_std_error(sb, err); 792 + goto out; 793 + } 815 794 } 816 795 817 796 BUFFER_TRACE(group_desc_bh, "get_write_access"); 818 797 err = ext4_journal_get_write_access(handle, group_desc_bh); 819 - if (err) 820 - goto fail; 798 + if (err) { 799 + ext4_std_error(sb, err); 800 + goto out; 801 + } 821 802 822 803 /* Update the relevant bg descriptor fields */ 823 804 if 
(ext4_has_group_desc_csum(sb)) { ··· 867 840 868 841 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); 869 842 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); 870 - if (err) 871 - goto fail; 843 + if (err) { 844 + ext4_std_error(sb, err); 845 + goto out; 846 + } 872 847 873 848 percpu_counter_dec(&sbi->s_freeinodes_counter); 874 849 if (S_ISDIR(mode)) ··· 880 851 flex_group = ext4_flex_group(sbi, group); 881 852 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 882 853 } 883 - if (owner) { 884 - inode->i_mode = mode; 885 - i_uid_write(inode, owner[0]); 886 - i_gid_write(inode, owner[1]); 887 - } else if (test_opt(sb, GRPID)) { 888 - inode->i_mode = mode; 889 - inode->i_uid = current_fsuid(); 890 - inode->i_gid = dir->i_gid; 891 - } else 892 - inode_init_owner(inode, dir, mode); 893 854 894 855 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 895 856 /* This is the optimal IO size (for stat), not the fs block size */ ··· 908 889 * twice. 909 890 */ 910 891 err = -EIO; 911 - goto fail; 892 + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", 893 + inode->i_ino); 894 + goto out; 912 895 } 913 896 spin_lock(&sbi->s_next_gen_lock); 914 897 inode->i_generation = sbi->s_next_generation++; ··· 920 899 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 921 900 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 922 901 __u32 csum; 923 - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 924 902 __le32 inum = cpu_to_le32(inode->i_ino); 925 903 __le32 gen = cpu_to_le32(inode->i_generation); 926 904 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, ··· 938 918 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 939 919 940 920 ret = inode; 941 - dquot_initialize(inode); 942 921 err = dquot_alloc_inode(inode); 943 922 if (err) 944 923 goto fail_drop; ··· 971 952 972 953 ext4_debug("allocating inode %lu\n", inode->i_ino); 973 954 trace_ext4_allocate_inode(inode, dir, mode); 974 - goto really_out; 975 - fail: 976 - ext4_std_error(sb, err); 
977 - out: 978 - iput(inode); 979 - ret = ERR_PTR(err); 980 - really_out: 981 955 brelse(inode_bitmap_bh); 982 956 return ret; 983 957 984 958 fail_free_drop: 985 959 dquot_free_inode(inode); 986 - 987 960 fail_drop: 988 - dquot_drop(inode); 989 - inode->i_flags |= S_NOQUOTA; 990 961 clear_nlink(inode); 991 962 unlock_new_inode(inode); 963 + out: 964 + dquot_drop(inode); 965 + inode->i_flags |= S_NOQUOTA; 992 966 iput(inode); 993 967 brelse(inode_bitmap_bh); 994 968 return ERR_PTR(err);

+52 -421

fs/ext4/indirect.c

··· 292 292 } 293 293 294 294 /** 295 - * ext4_alloc_blocks: multiple allocate blocks needed for a branch 296 - * @handle: handle for this transaction 297 - * @inode: inode which needs allocated blocks 298 - * @iblock: the logical block to start allocated at 299 - * @goal: preferred physical block of allocation 300 - * @indirect_blks: the number of blocks need to allocate for indirect 301 - * blocks 302 - * @blks: number of desired blocks 303 - * @new_blocks: on return it will store the new block numbers for 304 - * the indirect blocks(if needed) and the first direct block, 305 - * @err: on return it will store the error code 306 - * 307 - * This function will return the number of blocks allocated as 308 - * requested by the passed-in parameters. 309 - */ 310 - static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 311 - ext4_lblk_t iblock, ext4_fsblk_t goal, 312 - int indirect_blks, int blks, 313 - ext4_fsblk_t new_blocks[4], int *err) 314 - { 315 - struct ext4_allocation_request ar; 316 - int target, i; 317 - unsigned long count = 0, blk_allocated = 0; 318 - int index = 0; 319 - ext4_fsblk_t current_block = 0; 320 - int ret = 0; 321 - 322 - /* 323 - * Here we try to allocate the requested multiple blocks at once, 324 - * on a best-effort basis. 325 - * To build a branch, we should allocate blocks for 326 - * the indirect blocks(if not allocated yet), and at least 327 - * the first direct block of this branch. 
That's the 328 - * minimum number of blocks need to allocate(required) 329 - */ 330 - /* first we try to allocate the indirect blocks */ 331 - target = indirect_blks; 332 - while (target > 0) { 333 - count = target; 334 - /* allocating blocks for indirect blocks and direct blocks */ 335 - current_block = ext4_new_meta_blocks(handle, inode, goal, 336 - 0, &count, err); 337 - if (*err) 338 - goto failed_out; 339 - 340 - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { 341 - EXT4_ERROR_INODE(inode, 342 - "current_block %llu + count %lu > %d!", 343 - current_block, count, 344 - EXT4_MAX_BLOCK_FILE_PHYS); 345 - *err = -EIO; 346 - goto failed_out; 347 - } 348 - 349 - target -= count; 350 - /* allocate blocks for indirect blocks */ 351 - while (index < indirect_blks && count) { 352 - new_blocks[index++] = current_block++; 353 - count--; 354 - } 355 - if (count > 0) { 356 - /* 357 - * save the new block number 358 - * for the first direct block 359 - */ 360 - new_blocks[index] = current_block; 361 - WARN(1, KERN_INFO "%s returned more blocks than " 362 - "requested\n", __func__); 363 - break; 364 - } 365 - } 366 - 367 - target = blks - count ; 368 - blk_allocated = count; 369 - if (!target) 370 - goto allocated; 371 - /* Now allocate data blocks */ 372 - memset(&ar, 0, sizeof(ar)); 373 - ar.inode = inode; 374 - ar.goal = goal; 375 - ar.len = target; 376 - ar.logical = iblock; 377 - if (S_ISREG(inode->i_mode)) 378 - /* enable in-core preallocation only for regular files */ 379 - ar.flags = EXT4_MB_HINT_DATA; 380 - 381 - current_block = ext4_mb_new_blocks(handle, &ar, err); 382 - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { 383 - EXT4_ERROR_INODE(inode, 384 - "current_block %llu + ar.len %d > %d!", 385 - current_block, ar.len, 386 - EXT4_MAX_BLOCK_FILE_PHYS); 387 - *err = -EIO; 388 - goto failed_out; 389 - } 390 - 391 - if (*err && (target == blks)) { 392 - /* 393 - * if the allocation failed and we didn't allocate 394 - * any blocks 
before 395 - */ 396 - goto failed_out; 397 - } 398 - if (!*err) { 399 - if (target == blks) { 400 - /* 401 - * save the new block number 402 - * for the first direct block 403 - */ 404 - new_blocks[index] = current_block; 405 - } 406 - blk_allocated += ar.len; 407 - } 408 - allocated: 409 - /* total number of blocks allocated for direct blocks */ 410 - ret = blk_allocated; 411 - *err = 0; 412 - return ret; 413 - failed_out: 414 - for (i = 0; i < index; i++) 415 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 416 - return ret; 417 - } 418 - 419 - /** 420 295 * ext4_alloc_branch - allocate and set up a chain of blocks. 421 296 * @handle: handle for this transaction 422 297 * @inode: owner ··· 323 448 int *blks, ext4_fsblk_t goal, 324 449 ext4_lblk_t *offsets, Indirect *branch) 325 450 { 326 - int blocksize = inode->i_sb->s_blocksize; 327 - int i, n = 0; 328 - int err = 0; 329 - struct buffer_head *bh; 330 - int num; 331 - ext4_fsblk_t new_blocks[4]; 332 - ext4_fsblk_t current_block; 451 + struct ext4_allocation_request ar; 452 + struct buffer_head * bh; 453 + ext4_fsblk_t b, new_blocks[4]; 454 + __le32 *p; 455 + int i, j, err, len = 1; 333 456 334 - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 335 - *blks, new_blocks, &err); 336 - if (err) 337 - return err; 338 - 339 - branch[0].key = cpu_to_le32(new_blocks[0]); 340 457 /* 341 - * metadata blocks and data blocks are allocated. 458 + * Set up for the direct block allocation 342 459 */ 343 - for (n = 1; n <= indirect_blks; n++) { 344 - /* 345 - * Get buffer_head for parent block, zero it out 346 - * and set the pointer to new one, then send 347 - * parent to disk. 
348 - */ 349 - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 460 + memset(&ar, 0, sizeof(ar)); 461 + ar.inode = inode; 462 + ar.len = *blks; 463 + ar.logical = iblock; 464 + if (S_ISREG(inode->i_mode)) 465 + ar.flags = EXT4_MB_HINT_DATA; 466 + 467 + for (i = 0; i <= indirect_blks; i++) { 468 + if (i == indirect_blks) { 469 + ar.goal = goal; 470 + new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); 471 + } else 472 + goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, 473 + goal, 0, NULL, &err); 474 + if (err) { 475 + i--; 476 + goto failed; 477 + } 478 + branch[i].key = cpu_to_le32(new_blocks[i]); 479 + if (i == 0) 480 + continue; 481 + 482 + bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); 350 483 if (unlikely(!bh)) { 351 484 err = -ENOMEM; 352 485 goto failed; 353 486 } 354 - 355 - branch[n].bh = bh; 356 487 lock_buffer(bh); 357 488 BUFFER_TRACE(bh, "call get_create_access"); 358 489 err = ext4_journal_get_create_access(handle, bh); 359 490 if (err) { 360 - /* Don't brelse(bh) here; it's done in 361 - * ext4_journal_forget() below */ 362 491 unlock_buffer(bh); 363 492 goto failed; 364 493 } 365 494 366 - memset(bh->b_data, 0, blocksize); 367 - branch[n].p = (__le32 *) bh->b_data + offsets[n]; 368 - branch[n].key = cpu_to_le32(new_blocks[n]); 369 - *branch[n].p = branch[n].key; 370 - if (n == indirect_blks) { 371 - current_block = new_blocks[n]; 372 - /* 373 - * End of chain, update the last new metablock of 374 - * the chain to point to the new allocated 375 - * data blocks numbers 376 - */ 377 - for (i = 1; i < num; i++) 378 - *(branch[n].p + i) = cpu_to_le32(++current_block); 379 - } 495 + memset(bh->b_data, 0, bh->b_size); 496 + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; 497 + b = new_blocks[i]; 498 + 499 + if (i == indirect_blks) 500 + len = ar.len; 501 + for (j = 0; j < len; j++) 502 + *p++ = cpu_to_le32(b++); 503 + 380 504 BUFFER_TRACE(bh, "marking uptodate"); 381 505 set_buffer_uptodate(bh); 382 506 unlock_buffer(bh); ··· 
385 511 if (err) 386 512 goto failed; 387 513 } 388 - *blks = num; 389 - return err; 514 + *blks = ar.len; 515 + return 0; 390 516 failed: 391 - /* Allocation failed, free what we already allocated */ 392 - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 393 - for (i = 1; i <= n ; i++) { 394 - /* 395 - * branch[i].bh is newly allocated, so there is no 396 - * need to revoke the block, which is why we don't 397 - * need to set EXT4_FREE_BLOCKS_METADATA. 398 - */ 399 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 400 - EXT4_FREE_BLOCKS_FORGET); 517 + for (; i >= 0; i--) { 518 + if (i != indirect_blks && branch[i].bh) 519 + ext4_forget(handle, 1, inode, branch[i].bh, 520 + branch[i].bh->b_blocknr); 521 + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 522 + (i == indirect_blks) ? ar.len : 1, 0); 401 523 } 402 - for (i = n+1; i < indirect_blks; i++) 403 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 404 - 405 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); 406 - 407 524 return err; 408 525 } 409 526 ··· 806 941 * be able to restart the transaction at a conventient checkpoint to make 807 942 * sure we don't overflow the journal. 808 943 * 809 - * start_transaction gets us a new handle for a truncate transaction, 810 - * and extend_transaction tries to extend the existing one a bit. If 944 + * Try to extend this transaction for the purposes of truncation. If 811 945 * extend fails, we need to propagate the failure up and restart the 812 946 * transaction in the top-level truncate loop. --sct 813 - */ 814 - static handle_t *start_transaction(struct inode *inode) 815 - { 816 - handle_t *result; 817 - 818 - result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, 819 - ext4_blocks_for_truncate(inode)); 820 - if (!IS_ERR(result)) 821 - return result; 822 - 823 - ext4_std_error(inode->i_sb, PTR_ERR(result)); 824 - return result; 825 - } 826 - 827 - /* 828 - * Try to extend this transaction for the purposes of truncation. 
829 947 * 830 948 * Returns 0 if we managed to create more room. If we can't create more 831 949 * room, and the transaction must be restarted we return 1. ··· 1201 1353 } 1202 1354 } 1203 1355 1204 - void ext4_ind_truncate(struct inode *inode) 1356 + void ext4_ind_truncate(handle_t *handle, struct inode *inode) 1205 1357 { 1206 - handle_t *handle; 1207 1358 struct ext4_inode_info *ei = EXT4_I(inode); 1208 1359 __le32 *i_data = ei->i_data; 1209 1360 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1210 - struct address_space *mapping = inode->i_mapping; 1211 1361 ext4_lblk_t offsets[4]; 1212 1362 Indirect chain[4]; 1213 1363 Indirect *partial; 1214 1364 __le32 nr = 0; 1215 1365 int n = 0; 1216 1366 ext4_lblk_t last_block, max_block; 1217 - loff_t page_len; 1218 1367 unsigned blocksize = inode->i_sb->s_blocksize; 1219 - int err; 1220 - 1221 - handle = start_transaction(inode); 1222 - if (IS_ERR(handle)) 1223 - return; /* AKPM: return what? */ 1224 1368 1225 1369 last_block = (inode->i_size + blocksize-1) 1226 1370 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1227 1371 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 1228 1372 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 1229 1373 1230 - if (inode->i_size % PAGE_CACHE_SIZE != 0) { 1231 - page_len = PAGE_CACHE_SIZE - 1232 - (inode->i_size & (PAGE_CACHE_SIZE - 1)); 1233 - 1234 - err = ext4_discard_partial_page_buffers(handle, 1235 - mapping, inode->i_size, page_len, 0); 1236 - 1237 - if (err) 1238 - goto out_stop; 1239 - } 1240 - 1241 1374 if (last_block != max_block) { 1242 1375 n = ext4_block_to_path(inode, last_block, offsets, NULL); 1243 1376 if (n == 0) 1244 - goto out_stop; /* error */ 1377 + return; 1245 1378 } 1246 1379 1247 - /* 1248 - * OK. This truncate is going to happen. We add the inode to the 1249 - * orphan list, so that if this truncate spans multiple transactions, 1250 - * and we crash, we will resume the truncate when the filesystem 1251 - * recovers. 
It also marks the inode dirty, to catch the new size. 1252 - * 1253 - * Implication: the file must always be in a sane, consistent 1254 - * truncatable state while each transaction commits. 1255 - */ 1256 - if (ext4_orphan_add(handle, inode)) 1257 - goto out_stop; 1258 - 1259 - /* 1260 - * From here we block out all ext4_get_block() callers who want to 1261 - * modify the block allocation tree. 1262 - */ 1263 - down_write(&ei->i_data_sem); 1264 - 1265 - ext4_discard_preallocations(inode); 1266 1380 ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); 1267 1381 1268 1382 /* ··· 1241 1431 * It is unnecessary to free any data blocks if last_block is 1242 1432 * equal to the indirect block limit. 1243 1433 */ 1244 - goto out_unlock; 1434 + return; 1245 1435 } else if (n == 1) { /* direct blocks */ 1246 1436 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 1247 1437 i_data + EXT4_NDIR_BLOCKS); ··· 1301 1491 case EXT4_TIND_BLOCK: 1302 1492 ; 1303 1493 } 1304 - 1305 - out_unlock: 1306 - up_write(&ei->i_data_sem); 1307 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 1308 - ext4_mark_inode_dirty(handle, inode); 1309 - 1310 - /* 1311 - * In a multi-transaction truncate, we only make the final transaction 1312 - * synchronous 1313 - */ 1314 - if (IS_SYNC(inode)) 1315 - ext4_handle_sync(handle); 1316 - out_stop: 1317 - /* 1318 - * If this was a simple ftruncate(), and the file will remain alive 1319 - * then we need to clear up the orphan record which we created above. 1320 - * However, if this was a real unlink then we were called by 1321 - * ext4_delete_inode(), and we allow that function to clean up the 1322 - * orphan info for us. 
1323 - */ 1324 - if (inode->i_nlink) 1325 - ext4_orphan_del(handle, inode); 1326 - 1327 - ext4_journal_stop(handle); 1328 - trace_ext4_truncate_exit(inode); 1329 1494 } 1330 1495 1331 1496 static int free_hole_blocks(handle_t *handle, struct inode *inode, ··· 1354 1569 return ret; 1355 1570 } 1356 1571 1357 - static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 1358 - ext4_lblk_t first, ext4_lblk_t stop) 1572 + int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 1573 + ext4_lblk_t first, ext4_lblk_t stop) 1359 1574 { 1360 1575 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1361 1576 int level, ret = 0; ··· 1389 1604 return ret; 1390 1605 } 1391 1606 1392 - int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) 1393 - { 1394 - struct inode *inode = file_inode(file); 1395 - struct super_block *sb = inode->i_sb; 1396 - ext4_lblk_t first_block, stop_block; 1397 - struct address_space *mapping = inode->i_mapping; 1398 - handle_t *handle = NULL; 1399 - loff_t first_page, last_page, page_len; 1400 - loff_t first_page_offset, last_page_offset; 1401 - int err = 0; 1402 - 1403 - /* 1404 - * Write out all dirty pages to avoid race conditions 1405 - * Then release them. 
1406 - */ 1407 - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 1408 - err = filemap_write_and_wait_range(mapping, 1409 - offset, offset + length - 1); 1410 - if (err) 1411 - return err; 1412 - } 1413 - 1414 - mutex_lock(&inode->i_mutex); 1415 - /* It's not possible punch hole on append only file */ 1416 - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { 1417 - err = -EPERM; 1418 - goto out_mutex; 1419 - } 1420 - if (IS_SWAPFILE(inode)) { 1421 - err = -ETXTBSY; 1422 - goto out_mutex; 1423 - } 1424 - 1425 - /* No need to punch hole beyond i_size */ 1426 - if (offset >= inode->i_size) 1427 - goto out_mutex; 1428 - 1429 - /* 1430 - * If the hole extents beyond i_size, set the hole 1431 - * to end after the page that contains i_size 1432 - */ 1433 - if (offset + length > inode->i_size) { 1434 - length = inode->i_size + 1435 - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - 1436 - offset; 1437 - } 1438 - 1439 - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1440 - last_page = (offset + length) >> PAGE_CACHE_SHIFT; 1441 - 1442 - first_page_offset = first_page << PAGE_CACHE_SHIFT; 1443 - last_page_offset = last_page << PAGE_CACHE_SHIFT; 1444 - 1445 - /* Now release the pages */ 1446 - if (last_page_offset > first_page_offset) { 1447 - truncate_pagecache_range(inode, first_page_offset, 1448 - last_page_offset - 1); 1449 - } 1450 - 1451 - /* Wait all existing dio works, newcomers will block on i_mutex */ 1452 - inode_dio_wait(inode); 1453 - 1454 - handle = start_transaction(inode); 1455 - if (IS_ERR(handle)) 1456 - goto out_mutex; 1457 - 1458 - /* 1459 - * Now we need to zero out the non-page-aligned data in the 1460 - * pages at the start and tail of the hole, and unmap the buffer 1461 - * heads for the block aligned regions of the page that were 1462 - * completely zerod. 
1463 - */ 1464 - if (first_page > last_page) { 1465 - /* 1466 - * If the file space being truncated is contained within a page 1467 - * just zero out and unmap the middle of that page 1468 - */ 1469 - err = ext4_discard_partial_page_buffers(handle, 1470 - mapping, offset, length, 0); 1471 - if (err) 1472 - goto out; 1473 - } else { 1474 - /* 1475 - * Zero out and unmap the paritial page that contains 1476 - * the start of the hole 1477 - */ 1478 - page_len = first_page_offset - offset; 1479 - if (page_len > 0) { 1480 - err = ext4_discard_partial_page_buffers(handle, mapping, 1481 - offset, page_len, 0); 1482 - if (err) 1483 - goto out; 1484 - } 1485 - 1486 - /* 1487 - * Zero out and unmap the partial page that contains 1488 - * the end of the hole 1489 - */ 1490 - page_len = offset + length - last_page_offset; 1491 - if (page_len > 0) { 1492 - err = ext4_discard_partial_page_buffers(handle, mapping, 1493 - last_page_offset, page_len, 0); 1494 - if (err) 1495 - goto out; 1496 - } 1497 - } 1498 - 1499 - /* 1500 - * If i_size contained in the last page, we need to 1501 - * unmap and zero the paritial page after i_size 1502 - */ 1503 - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && 1504 - inode->i_size % PAGE_CACHE_SIZE != 0) { 1505 - page_len = PAGE_CACHE_SIZE - 1506 - (inode->i_size & (PAGE_CACHE_SIZE - 1)); 1507 - if (page_len > 0) { 1508 - err = ext4_discard_partial_page_buffers(handle, 1509 - mapping, inode->i_size, page_len, 0); 1510 - if (err) 1511 - goto out; 1512 - } 1513 - } 1514 - 1515 - first_block = (offset + sb->s_blocksize - 1) >> 1516 - EXT4_BLOCK_SIZE_BITS(sb); 1517 - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 1518 - 1519 - if (first_block >= stop_block) 1520 - goto out; 1521 - 1522 - down_write(&EXT4_I(inode)->i_data_sem); 1523 - ext4_discard_preallocations(inode); 1524 - 1525 - err = ext4_es_remove_extent(inode, first_block, 1526 - stop_block - first_block); 1527 - err = ext4_free_hole_blocks(handle, inode, first_block, 
stop_block); 1528 - 1529 - ext4_discard_preallocations(inode); 1530 - 1531 - if (IS_SYNC(inode)) 1532 - ext4_handle_sync(handle); 1533 - 1534 - up_write(&EXT4_I(inode)->i_data_sem); 1535 - 1536 - out: 1537 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 1538 - ext4_mark_inode_dirty(handle, inode); 1539 - ext4_journal_stop(handle); 1540 - 1541 - out_mutex: 1542 - mutex_unlock(&inode->i_mutex); 1543 - 1544 - return err; 1545 - }

+159 -19

fs/ext4/inline.c

··· 19 19 20 20 #define EXT4_XATTR_SYSTEM_DATA "data" 21 21 #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) 22 - #define EXT4_INLINE_DOTDOT_SIZE 4 22 + #define EXT4_INLINE_DOTDOT_OFFSET 2 23 + #define EXT4_INLINE_DOTDOT_SIZE 4 23 24 24 25 int ext4_get_inline_size(struct inode *inode) 25 26 { ··· 1290 1289 return ret; 1291 1290 } 1292 1291 1292 + /* 1293 + * This function fills a red-black tree with information from an 1294 + * inlined dir. It returns the number of directory entries loaded 1295 + * into the tree. If there is an error it is returned in err. 1296 + */ 1297 + int htree_inlinedir_to_tree(struct file *dir_file, 1298 + struct inode *dir, ext4_lblk_t block, 1299 + struct dx_hash_info *hinfo, 1300 + __u32 start_hash, __u32 start_minor_hash, 1301 + int *has_inline_data) 1302 + { 1303 + int err = 0, count = 0; 1304 + unsigned int parent_ino; 1305 + int pos; 1306 + struct ext4_dir_entry_2 *de; 1307 + struct inode *inode = file_inode(dir_file); 1308 + int ret, inline_size = 0; 1309 + struct ext4_iloc iloc; 1310 + void *dir_buf = NULL; 1311 + struct ext4_dir_entry_2 fake; 1312 + 1313 + ret = ext4_get_inode_loc(inode, &iloc); 1314 + if (ret) 1315 + return ret; 1316 + 1317 + down_read(&EXT4_I(inode)->xattr_sem); 1318 + if (!ext4_has_inline_data(inode)) { 1319 + up_read(&EXT4_I(inode)->xattr_sem); 1320 + *has_inline_data = 0; 1321 + goto out; 1322 + } 1323 + 1324 + inline_size = ext4_get_inline_size(inode); 1325 + dir_buf = kmalloc(inline_size, GFP_NOFS); 1326 + if (!dir_buf) { 1327 + ret = -ENOMEM; 1328 + up_read(&EXT4_I(inode)->xattr_sem); 1329 + goto out; 1330 + } 1331 + 1332 + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); 1333 + up_read(&EXT4_I(inode)->xattr_sem); 1334 + if (ret < 0) 1335 + goto out; 1336 + 1337 + pos = 0; 1338 + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); 1339 + while (pos < inline_size) { 1340 + /* 1341 + * As inlined dir doesn't store any information about '.'
and 1342 + * only the inode number of '..' is stored, we have to handle 1343 + * them differently. 1344 + */ 1345 + if (pos == 0) { 1346 + fake.inode = cpu_to_le32(inode->i_ino); 1347 + fake.name_len = 1; 1348 + strcpy(fake.name, "."); 1349 + fake.rec_len = ext4_rec_len_to_disk( 1350 + EXT4_DIR_REC_LEN(fake.name_len), 1351 + inline_size); 1352 + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); 1353 + de = &fake; 1354 + pos = EXT4_INLINE_DOTDOT_OFFSET; 1355 + } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { 1356 + fake.inode = cpu_to_le32(parent_ino); 1357 + fake.name_len = 2; 1358 + strcpy(fake.name, ".."); 1359 + fake.rec_len = ext4_rec_len_to_disk( 1360 + EXT4_DIR_REC_LEN(fake.name_len), 1361 + inline_size); 1362 + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); 1363 + de = &fake; 1364 + pos = EXT4_INLINE_DOTDOT_SIZE; 1365 + } else { 1366 + de = (struct ext4_dir_entry_2 *)(dir_buf + pos); 1367 + pos += ext4_rec_len_from_disk(de->rec_len, inline_size); 1368 + if (ext4_check_dir_entry(inode, dir_file, de, 1369 + iloc.bh, dir_buf, 1370 + inline_size, pos)) { 1371 + ret = count; 1372 + goto out; 1373 + } 1374 + } 1375 + 1376 + ext4fs_dirhash(de->name, de->name_len, hinfo); 1377 + if ((hinfo->hash < start_hash) || 1378 + ((hinfo->hash == start_hash) && 1379 + (hinfo->minor_hash < start_minor_hash))) 1380 + continue; 1381 + if (de->inode == 0) 1382 + continue; 1383 + err = ext4_htree_store_dirent(dir_file, 1384 + hinfo->hash, hinfo->minor_hash, de); 1385 + if (err) { 1386 + count = err; 1387 + goto out; 1388 + } 1389 + count++; 1390 + } 1391 + ret = count; 1392 + out: 1393 + kfree(dir_buf); 1394 + brelse(iloc.bh); 1395 + return ret; 1396 + } 1397 + 1398 + /* 1399 + * So this function is called when the volume is mkfsed with 1400 + * dir_index disabled. In order to keep f_pos persistent 1401 + * after we convert from an inlined dir to a blocked based, 1402 + * we just pretend that we are a normal dir and return the 1403 + * offset as if '.' and '..' really take place. 
1404 + * 1405 + */ 1293 1406 int ext4_read_inline_dir(struct file *filp, 1294 1407 void *dirent, filldir_t filldir, 1295 1408 int *has_inline_data) ··· 1417 1302 int ret, inline_size = 0; 1418 1303 struct ext4_iloc iloc; 1419 1304 void *dir_buf = NULL; 1305 + int dotdot_offset, dotdot_size, extra_offset, extra_size; 1420 1306 1421 1307 ret = ext4_get_inode_loc(inode, &iloc); 1422 1308 if (ret) ··· 1446 1330 sb = inode->i_sb; 1447 1331 stored = 0; 1448 1332 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); 1333 + offset = filp->f_pos; 1449 1334 1450 - while (!error && !stored && filp->f_pos < inode->i_size) { 1335 + /* 1336 + * dotdot_offset and dotdot_size is the real offset and 1337 + * size for ".." and "." if the dir is block based while 1338 + * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. 1339 + * So we will use extra_offset and extra_size to indicate them 1340 + * during the inline dir iteration. 1341 + */ 1342 + dotdot_offset = EXT4_DIR_REC_LEN(1); 1343 + dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); 1344 + extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; 1345 + extra_size = extra_offset + inline_size; 1346 + 1347 + while (!error && !stored && filp->f_pos < extra_size) { 1451 1348 revalidate: 1452 1349 /* 1453 1350 * If the version has changed since the last call to ··· 1469 1340 * dir to make sure. 1470 1341 */ 1471 1342 if (filp->f_version != inode->i_version) { 1472 - for (i = 0; 1473 - i < inode->i_size && i < offset;) { 1343 + for (i = 0; i < extra_size && i < offset;) { 1344 + /* 1345 + * "." is with offset 0 and 1346 + * ".." is dotdot_offset. 1347 + */ 1474 1348 if (!i) { 1475 - /* skip "." and ".." if needed. */ 1476 - i += EXT4_INLINE_DOTDOT_SIZE; 1349 + i = dotdot_offset; 1350 + continue; 1351 + } else if (i == dotdot_offset) { 1352 + i = dotdot_size; 1477 1353 continue; 1478 1354 } 1355 + /* for other entry, the real offset in 1356 + * the buf has to be tuned accordingly. 
1357 + */ 1479 1358 de = (struct ext4_dir_entry_2 *) 1480 - (dir_buf + i); 1359 + (dir_buf + i - extra_offset); 1481 1360 /* It's too expensive to do a full 1482 1361 * dirent test each time round this 1483 1362 * loop, but we do have to test at ··· 1493 1356 * failure will be detected in the 1494 1357 * dirent test below. */ 1495 1358 if (ext4_rec_len_from_disk(de->rec_len, 1496 - inline_size) < EXT4_DIR_REC_LEN(1)) 1359 + extra_size) < EXT4_DIR_REC_LEN(1)) 1497 1360 break; 1498 1361 i += ext4_rec_len_from_disk(de->rec_len, 1499 - inline_size); 1362 + extra_size); 1500 1363 } 1501 1364 offset = i; 1502 1365 filp->f_pos = offset; 1503 1366 filp->f_version = inode->i_version; 1504 1367 } 1505 1368 1506 - while (!error && filp->f_pos < inode->i_size) { 1369 + while (!error && filp->f_pos < extra_size) { 1507 1370 if (filp->f_pos == 0) { 1508 1371 error = filldir(dirent, ".", 1, 0, inode->i_ino, 1509 1372 DT_DIR); 1510 1373 if (error) 1511 1374 break; 1512 1375 stored++; 1376 + filp->f_pos = dotdot_offset; 1377 + continue; 1378 + } 1513 1379 1514 - error = filldir(dirent, "..", 2, 0, parent_ino, 1515 - DT_DIR); 1380 + if (filp->f_pos == dotdot_offset) { 1381 + error = filldir(dirent, "..", 2, 1382 + dotdot_offset, 1383 + parent_ino, DT_DIR); 1516 1384 if (error) 1517 1385 break; 1518 1386 stored++; 1519 1387 1520 - filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; 1388 + filp->f_pos = dotdot_size; 1521 1389 continue; 1522 1390 } 1523 1391 1524 - de = (struct ext4_dir_entry_2 *)(dir_buf + offset); 1392 + de = (struct ext4_dir_entry_2 *) 1393 + (dir_buf + filp->f_pos - extra_offset); 1525 1394 if (ext4_check_dir_entry(inode, filp, de, 1526 1395 iloc.bh, dir_buf, 1527 - inline_size, offset)) { 1396 + extra_size, filp->f_pos)) { 1528 1397 ret = stored; 1529 1398 goto out; 1530 1399 } 1531 - offset += ext4_rec_len_from_disk(de->rec_len, 1532 - inline_size); 1533 1400 if (le32_to_cpu(de->inode)) { 1534 1401 /* We might block in the next section 1535 1402 * if the data 
destination is ··· 1556 1415 stored++; 1557 1416 } 1558 1417 filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 1559 - inline_size); 1418 + extra_size); 1560 1419 } 1561 - offset = 0; 1562 1420 } 1563 1421 out: 1564 1422 kfree(dir_buf);

+388 -192

fs/ext4/inode.c

··· 55 55 __u16 csum_hi = 0; 56 56 __u32 csum; 57 57 58 - csum_lo = raw->i_checksum_lo; 58 + csum_lo = le16_to_cpu(raw->i_checksum_lo); 59 59 raw->i_checksum_lo = 0; 60 60 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 61 61 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { 62 - csum_hi = raw->i_checksum_hi; 62 + csum_hi = le16_to_cpu(raw->i_checksum_hi); 63 63 raw->i_checksum_hi = 0; 64 64 } 65 65 66 66 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, 67 67 EXT4_INODE_SIZE(inode->i_sb)); 68 68 69 - raw->i_checksum_lo = csum_lo; 69 + raw->i_checksum_lo = cpu_to_le16(csum_lo); 70 70 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 71 71 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 72 - raw->i_checksum_hi = csum_hi; 72 + raw->i_checksum_hi = cpu_to_le16(csum_hi); 73 73 74 74 return csum; 75 75 } ··· 210 210 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 211 211 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; 212 212 213 - jbd2_log_start_commit(journal, commit_tid); 214 - jbd2_log_wait_commit(journal, commit_tid); 213 + jbd2_complete_transaction(journal, commit_tid); 215 214 filemap_write_and_wait(&inode->i_data); 216 215 } 217 216 truncate_inode_pages(&inode->i_data, 0); ··· 1080 1081 /* For write_end() in data=journal mode */ 1081 1082 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1082 1083 { 1084 + int ret; 1083 1085 if (!buffer_mapped(bh) || buffer_freed(bh)) 1084 1086 return 0; 1085 1087 set_buffer_uptodate(bh); 1086 - return ext4_handle_dirty_metadata(handle, NULL, bh); 1088 + ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1089 + clear_buffer_meta(bh); 1090 + clear_buffer_prio(bh); 1091 + return ret; 1087 1092 } 1088 1093 1089 - static int ext4_generic_write_end(struct file *file, 1090 - struct address_space *mapping, 1091 - loff_t pos, unsigned len, unsigned copied, 1092 - struct page *page, void *fsdata) 1094 + /* 1095 + * We need to pick up the new inode size which generic_commit_write gave us 
1096 + * `file' can be NULL - eg, when called from page_symlink(). 1097 + * 1098 + * ext4 never places buffers on inode->i_mapping->private_list. metadata 1099 + * buffers are managed internally. 1100 + */ 1101 + static int ext4_write_end(struct file *file, 1102 + struct address_space *mapping, 1103 + loff_t pos, unsigned len, unsigned copied, 1104 + struct page *page, void *fsdata) 1093 1105 { 1094 - int i_size_changed = 0; 1095 - struct inode *inode = mapping->host; 1096 1106 handle_t *handle = ext4_journal_current_handle(); 1107 + struct inode *inode = mapping->host; 1108 + int ret = 0, ret2; 1109 + int i_size_changed = 0; 1110 + 1111 + trace_ext4_write_end(inode, pos, len, copied); 1112 + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { 1113 + ret = ext4_jbd2_file_inode(handle, inode); 1114 + if (ret) { 1115 + unlock_page(page); 1116 + page_cache_release(page); 1117 + goto errout; 1118 + } 1119 + } 1097 1120 1098 1121 if (ext4_has_inline_data(inode)) 1099 1122 copied = ext4_write_inline_data_end(inode, pos, len, ··· 1126 1105 1127 1106 /* 1128 1107 * No need to use i_size_read() here, the i_size 1129 - * cannot change under us because we hold i_mutex. 1108 + * cannot change under us because we hold i_mutex. 1130 1109 * 1131 1110 * But it's important to update i_size while still holding page lock: 1132 1111 * page writeout could otherwise come in and zero beyond i_size. ··· 1136 1115 i_size_changed = 1; 1137 1116 } 1138 1117 1139 - if (pos + copied > EXT4_I(inode)->i_disksize) { 1118 + if (pos + copied > EXT4_I(inode)->i_disksize) { 1140 1119 /* We need to mark inode dirty even if 1141 1120 * new_i_size is less than inode->i_size 1142 - * bu greater than i_disksize.(hint delalloc) 1121 + * but greater than i_disksize.
(hint delalloc) 1143 1122 */ 1144 1123 ext4_update_i_disksize(inode, (pos + copied)); 1145 1124 i_size_changed = 1; ··· 1156 1135 if (i_size_changed) 1157 1136 ext4_mark_inode_dirty(handle, inode); 1158 1137 1159 - return copied; 1160 - } 1161 - 1162 - /* 1163 - * We need to pick up the new inode size which generic_commit_write gave us 1164 - * `file' can be NULL - eg, when called from page_symlink(). 1165 - * 1166 - * ext4 never places buffers on inode->i_mapping->private_list. metadata 1167 - * buffers are managed internally. 1168 - */ 1169 - static int ext4_ordered_write_end(struct file *file, 1170 - struct address_space *mapping, 1171 - loff_t pos, unsigned len, unsigned copied, 1172 - struct page *page, void *fsdata) 1173 - { 1174 - handle_t *handle = ext4_journal_current_handle(); 1175 - struct inode *inode = mapping->host; 1176 - int ret = 0, ret2; 1177 - 1178 - trace_ext4_ordered_write_end(inode, pos, len, copied); 1179 - ret = ext4_jbd2_file_inode(handle, inode); 1180 - 1181 - if (ret == 0) { 1182 - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1183 - page, fsdata); 1184 - copied = ret2; 1185 - if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1186 - /* if we have allocated more blocks and copied 1187 - * less. We will have blocks allocated outside 1188 - * inode->i_size. So truncate them 1189 - */ 1190 - ext4_orphan_add(handle, inode); 1191 - if (ret2 < 0) 1192 - ret = ret2; 1193 - } else { 1194 - unlock_page(page); 1195 - page_cache_release(page); 1196 - } 1197 - 1198 - ret2 = ext4_journal_stop(handle); 1199 - if (!ret) 1200 - ret = ret2; 1201 - 1202 - if (pos + len > inode->i_size) { 1203 - ext4_truncate_failed_write(inode); 1204 - /* 1205 - * If truncate failed early the inode might still be 1206 - * on the orphan list; we need to make sure the inode 1207 - * is removed from the orphan list in that case. 1208 - */ 1209 - if (inode->i_nlink) 1210 - ext4_orphan_del(NULL, inode); 1211 - } 1212 - 1213 - 1214 - return ret ? 
ret : copied; 1215 - } 1216 - 1217 - static int ext4_writeback_write_end(struct file *file, 1218 - struct address_space *mapping, 1219 - loff_t pos, unsigned len, unsigned copied, 1220 - struct page *page, void *fsdata) 1221 - { 1222 - handle_t *handle = ext4_journal_current_handle(); 1223 - struct inode *inode = mapping->host; 1224 - int ret = 0, ret2; 1225 - 1226 - trace_ext4_writeback_write_end(inode, pos, len, copied); 1227 - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1228 - page, fsdata); 1229 - copied = ret2; 1138 + if (copied < 0) 1139 + ret = copied; 1230 1140 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1231 1141 /* if we have allocated more blocks and copied 1232 1142 * less. We will have blocks allocated outside 1233 1143 * inode->i_size. So truncate them 1234 1144 */ 1235 1145 ext4_orphan_add(handle, inode); 1236 - 1237 - if (ret2 < 0) 1238 - ret = ret2; 1239 - 1146 + errout: 1240 1147 ret2 = ext4_journal_stop(handle); 1241 1148 if (!ret) 1242 1149 ret = ret2; ··· 1487 1538 struct ext4_io_submit io_submit; 1488 1539 1489 1540 BUG_ON(mpd->next_page <= mpd->first_page); 1490 - memset(&io_submit, 0, sizeof(io_submit)); 1541 + ext4_io_submit_init(&io_submit, mpd->wbc); 1542 + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); 1543 + if (!io_submit.io_end) 1544 + return -ENOMEM; 1491 1545 /* 1492 1546 * We need to start from the first_page to the next_page - 1 1493 1547 * to make sure we also write the mapped dirty buffer_heads. 
··· 1578 1626 pagevec_release(&pvec); 1579 1627 } 1580 1628 ext4_io_submit(&io_submit); 1629 + /* Drop io_end reference we got from init */ 1630 + ext4_put_io_end_defer(io_submit.io_end); 1581 1631 return ret; 1582 1632 } 1583 1633 ··· 1624 1670 { 1625 1671 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1626 1672 struct super_block *sb = inode->i_sb; 1673 + struct ext4_inode_info *ei = EXT4_I(inode); 1627 1674 1628 1675 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", 1629 1676 EXT4_C2B(EXT4_SB(inode->i_sb), 1630 - ext4_count_free_clusters(inode->i_sb))); 1677 + ext4_count_free_clusters(sb))); 1631 1678 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); 1632 1679 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", 1633 - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1680 + (long long) EXT4_C2B(EXT4_SB(sb), 1634 1681 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1635 1682 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", 1636 - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1683 + (long long) EXT4_C2B(EXT4_SB(sb), 1637 1684 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1638 1685 ext4_msg(sb, KERN_CRIT, "Block reservation details"); 1639 1686 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1640 - EXT4_I(inode)->i_reserved_data_blocks); 1687 + ei->i_reserved_data_blocks); 1641 1688 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", 1642 - EXT4_I(inode)->i_reserved_meta_blocks); 1689 + ei->i_reserved_meta_blocks); 1690 + ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", 1691 + ei->i_allocated_meta_blocks); 1643 1692 return; 1644 1693 } 1645 1694 ··· 1697 1740 */ 1698 1741 map.m_lblk = next; 1699 1742 map.m_len = max_blocks; 1700 - get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 1743 + /* 1744 + * We're in delalloc path and it is possible that we're going to 1745 + * need more metadata blocks than previously reserved. However 1746 + * we must not fail because we're in writeback and there is 1747 + * nothing we can do about it so it might result in data loss. 
1748 + * So use reserved blocks to allocate metadata if possible. 1749 + */ 1750 + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 1751 + EXT4_GET_BLOCKS_METADATA_NOFAIL; 1701 1752 if (ext4_should_dioread_nolock(mpd->inode)) 1702 1753 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 1703 1754 if (mpd->b_state & (1 << BH_Delay)) 1704 1755 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 1756 + 1705 1757 1706 1758 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 1707 1759 if (blks < 0) { ··· 2238 2272 */ 2239 2273 return __ext4_journalled_writepage(page, len); 2240 2274 2241 - memset(&io_submit, 0, sizeof(io_submit)); 2275 + ext4_io_submit_init(&io_submit, wbc); 2276 + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); 2277 + if (!io_submit.io_end) { 2278 + redirty_page_for_writepage(wbc, page); 2279 + return -ENOMEM; 2280 + } 2242 2281 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 2243 2282 ext4_io_submit(&io_submit); 2283 + /* Drop io_end reference we got from init */ 2284 + ext4_put_io_end_defer(io_submit.io_end); 2244 2285 return ret; 2245 2286 } 2246 2287 ··· 2634 2661 2635 2662 static int ext4_nonda_switch(struct super_block *sb) 2636 2663 { 2637 - s64 free_blocks, dirty_blocks; 2664 + s64 free_clusters, dirty_clusters; 2638 2665 struct ext4_sb_info *sbi = EXT4_SB(sb); 2639 2666 2640 2667 /* ··· 2645 2672 * Delalloc need an accurate free block accounting. So switch 2646 2673 * to non delalloc when we are near to error range. 2647 2674 */ 2648 - free_blocks = EXT4_C2B(sbi, 2649 - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); 2650 - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2675 + free_clusters = 2676 + percpu_counter_read_positive(&sbi->s_freeclusters_counter); 2677 + dirty_clusters = 2678 + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2651 2679 /* 2652 2680 * Start pushing delalloc when 1/2 of free blocks are dirty. 
2653 2681 */ 2654 - if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) 2682 + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) 2655 2683 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); 2656 2684 2657 - if (2 * free_blocks < 3 * dirty_blocks || 2658 - free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { 2685 + if (2 * free_clusters < 3 * dirty_clusters || 2686 + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { 2659 2687 /* 2660 2688 * free block count is less than 150% of dirty blocks 2661 2689 * or free blocks is less than watermark ··· 2792 2818 unsigned long start, end; 2793 2819 int write_mode = (int)(unsigned long)fsdata; 2794 2820 2795 - if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2796 - switch (ext4_inode_journal_mode(inode)) { 2797 - case EXT4_INODE_ORDERED_DATA_MODE: 2798 - return ext4_ordered_write_end(file, mapping, pos, 2799 - len, copied, page, fsdata); 2800 - case EXT4_INODE_WRITEBACK_DATA_MODE: 2801 - return ext4_writeback_write_end(file, mapping, pos, 2802 - len, copied, page, fsdata); 2803 - default: 2804 - BUG(); 2805 - } 2806 - } 2821 + if (write_mode == FALL_BACK_TO_NONDELALLOC) 2822 + return ext4_write_end(file, mapping, pos, 2823 + len, copied, page, fsdata); 2807 2824 2808 2825 trace_ext4_da_write_end(inode, pos, len, copied); 2809 2826 start = pos & (PAGE_CACHE_SIZE - 1); ··· 3078 3113 struct inode *inode = file_inode(iocb->ki_filp); 3079 3114 ext4_io_end_t *io_end = iocb->private; 3080 3115 3081 - /* if not async direct IO or dio with 0 bytes write, just return */ 3082 - if (!io_end || !size) 3083 - goto out; 3116 + /* if not async direct IO just return */ 3117 + if (!io_end) { 3118 + inode_dio_done(inode); 3119 + if (is_async) 3120 + aio_complete(iocb, ret, 0); 3121 + return; 3122 + } 3084 3123 3085 3124 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3086 3125 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", ··· 3092 3123 size); 3093 3124 3094 3125 iocb->private = NULL; 3095 - 3096 - /* if 
not aio dio with unwritten extents, just free io and return */ 3097 - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 3098 - ext4_free_io_end(io_end); 3099 - out: 3100 - inode_dio_done(inode); 3101 - if (is_async) 3102 - aio_complete(iocb, ret, 0); 3103 - return; 3104 - } 3105 - 3106 3126 io_end->offset = offset; 3107 3127 io_end->size = size; 3108 3128 if (is_async) { 3109 3129 io_end->iocb = iocb; 3110 3130 io_end->result = ret; 3111 3131 } 3112 - 3113 - ext4_add_complete_io(io_end); 3132 + ext4_put_io_end_defer(io_end); 3114 3133 } 3115 3134 3116 3135 /* ··· 3132 3175 get_block_t *get_block_func = NULL; 3133 3176 int dio_flags = 0; 3134 3177 loff_t final_size = offset + count; 3178 + ext4_io_end_t *io_end = NULL; 3135 3179 3136 3180 /* Use the old path for reads and writes beyond i_size. */ 3137 3181 if (rw != WRITE || final_size > inode->i_size) ··· 3171 3213 iocb->private = NULL; 3172 3214 ext4_inode_aio_set(inode, NULL); 3173 3215 if (!is_sync_kiocb(iocb)) { 3174 - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3216 + io_end = ext4_init_io_end(inode, GFP_NOFS); 3175 3217 if (!io_end) { 3176 3218 ret = -ENOMEM; 3177 3219 goto retake_lock; 3178 3220 } 3179 3221 io_end->flag |= EXT4_IO_END_DIRECT; 3180 - iocb->private = io_end; 3222 + /* 3223 + * Grab reference for DIO. Will be dropped in ext4_end_io_dio() 3224 + */ 3225 + iocb->private = ext4_get_io_end(io_end); 3181 3226 /* 3182 3227 * we save the io structure for current async direct 3183 3228 * IO, so that later ext4_map_blocks() could flag the ··· 3204 3243 NULL, 3205 3244 dio_flags); 3206 3245 3207 - if (iocb->private) 3208 - ext4_inode_aio_set(inode, NULL); 3209 3246 /* 3210 - * The io_end structure takes a reference to the inode, that 3211 - * structure needs to be destroyed and the reference to the 3212 - * inode need to be dropped, when IO is complete, even with 0 3213 - * byte write, or failed. 
3214 - * 3215 - * In the successful AIO DIO case, the io_end structure will 3216 - * be destroyed and the reference to the inode will be dropped 3217 - * after the end_io call back function is called. 3218 - * 3219 - * In the case there is 0 byte write, or error case, since VFS 3220 - * direct IO won't invoke the end_io call back function, we 3221 - * need to free the end_io structure here. 3247 + * Put our reference to io_end. This can free the io_end structure e.g. 3248 + * in sync IO case or in case of error. It can even perform extent 3249 + * conversion if all bios we submitted finished before we got here. 3250 + * Note that in that case iocb->private can be already set to NULL 3251 + * here. 3222 3252 */ 3223 - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3224 - ext4_free_io_end(iocb->private); 3225 - iocb->private = NULL; 3226 - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3253 + if (io_end) { 3254 + ext4_inode_aio_set(inode, NULL); 3255 + ext4_put_io_end(io_end); 3256 + /* 3257 + * In case of error or no write ext4_end_io_dio() was not 3258 + * called so we have to put iocb's reference. 
3259 + */ 3260 + if (ret <= 0 && ret != -EIOCBQUEUED) { 3261 + WARN_ON(iocb->private != io_end); 3262 + ext4_put_io_end(io_end); 3263 + iocb->private = NULL; 3264 + } 3265 + } 3266 + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3227 3267 EXT4_STATE_DIO_UNWRITTEN)) { 3228 3268 int err; 3229 3269 /* ··· 3296 3334 return __set_page_dirty_nobuffers(page); 3297 3335 } 3298 3336 3299 - static const struct address_space_operations ext4_ordered_aops = { 3337 + static const struct address_space_operations ext4_aops = { 3300 3338 .readpage = ext4_readpage, 3301 3339 .readpages = ext4_readpages, 3302 3340 .writepage = ext4_writepage, 3303 3341 .write_begin = ext4_write_begin, 3304 - .write_end = ext4_ordered_write_end, 3305 - .bmap = ext4_bmap, 3306 - .invalidatepage = ext4_invalidatepage, 3307 - .releasepage = ext4_releasepage, 3308 - .direct_IO = ext4_direct_IO, 3309 - .migratepage = buffer_migrate_page, 3310 - .is_partially_uptodate = block_is_partially_uptodate, 3311 - .error_remove_page = generic_error_remove_page, 3312 - }; 3313 - 3314 - static const struct address_space_operations ext4_writeback_aops = { 3315 - .readpage = ext4_readpage, 3316 - .readpages = ext4_readpages, 3317 - .writepage = ext4_writepage, 3318 - .write_begin = ext4_write_begin, 3319 - .write_end = ext4_writeback_write_end, 3342 + .write_end = ext4_write_end, 3320 3343 .bmap = ext4_bmap, 3321 3344 .invalidatepage = ext4_invalidatepage, 3322 3345 .releasepage = ext4_releasepage, ··· 3346 3399 { 3347 3400 switch (ext4_inode_journal_mode(inode)) { 3348 3401 case EXT4_INODE_ORDERED_DATA_MODE: 3349 - if (test_opt(inode->i_sb, DELALLOC)) 3350 - inode->i_mapping->a_ops = &ext4_da_aops; 3351 - else 3352 - inode->i_mapping->a_ops = &ext4_ordered_aops; 3402 + ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); 3353 3403 break; 3354 3404 case EXT4_INODE_WRITEBACK_DATA_MODE: 3355 - if (test_opt(inode->i_sb, DELALLOC)) 3356 - inode->i_mapping->a_ops = &ext4_da_aops; 3357 - else 3358 - 
inode->i_mapping->a_ops = &ext4_writeback_aops; 3405 + ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); 3359 3406 break; 3360 3407 case EXT4_INODE_JOURNAL_DATA_MODE: 3361 3408 inode->i_mapping->a_ops = &ext4_journalled_aops; 3362 - break; 3409 + return; 3363 3410 default: 3364 3411 BUG(); 3365 3412 } 3413 + if (test_opt(inode->i_sb, DELALLOC)) 3414 + inode->i_mapping->a_ops = &ext4_da_aops; 3415 + else 3416 + inode->i_mapping->a_ops = &ext4_aops; 3366 3417 } 3367 3418 3368 3419 ··· 3591 3646 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3592 3647 { 3593 3648 struct inode *inode = file_inode(file); 3649 + struct super_block *sb = inode->i_sb; 3650 + ext4_lblk_t first_block, stop_block; 3651 + struct address_space *mapping = inode->i_mapping; 3652 + loff_t first_page, last_page, page_len; 3653 + loff_t first_page_offset, last_page_offset; 3654 + handle_t *handle; 3655 + unsigned int credits; 3656 + int ret = 0; 3657 + 3594 3658 if (!S_ISREG(inode->i_mode)) 3595 3659 return -EOPNOTSUPP; 3596 3660 3597 - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3598 - return ext4_ind_punch_hole(file, offset, length); 3599 - 3600 - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { 3661 + if (EXT4_SB(sb)->s_cluster_ratio > 1) { 3601 3662 /* TODO: Add support for bigalloc file systems */ 3602 3663 return -EOPNOTSUPP; 3603 3664 } 3604 3665 3605 3666 trace_ext4_punch_hole(inode, offset, length); 3606 3667 3607 - return ext4_ext_punch_hole(file, offset, length); 3668 + /* 3669 + * Write out all dirty pages to avoid race conditions 3670 + * Then release them. 
3671 + */ 3672 + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 3673 + ret = filemap_write_and_wait_range(mapping, offset, 3674 + offset + length - 1); 3675 + if (ret) 3676 + return ret; 3677 + } 3678 + 3679 + mutex_lock(&inode->i_mutex); 3680 + /* It's not possible punch hole on append only file */ 3681 + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { 3682 + ret = -EPERM; 3683 + goto out_mutex; 3684 + } 3685 + if (IS_SWAPFILE(inode)) { 3686 + ret = -ETXTBSY; 3687 + goto out_mutex; 3688 + } 3689 + 3690 + /* No need to punch hole beyond i_size */ 3691 + if (offset >= inode->i_size) 3692 + goto out_mutex; 3693 + 3694 + /* 3695 + * If the hole extends beyond i_size, set the hole 3696 + * to end after the page that contains i_size 3697 + */ 3698 + if (offset + length > inode->i_size) { 3699 + length = inode->i_size + 3700 + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - 3701 + offset; 3702 + } 3703 + 3704 + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 3705 + last_page = (offset + length) >> PAGE_CACHE_SHIFT; 3706 + 3707 + first_page_offset = first_page << PAGE_CACHE_SHIFT; 3708 + last_page_offset = last_page << PAGE_CACHE_SHIFT; 3709 + 3710 + /* Now release the pages */ 3711 + if (last_page_offset > first_page_offset) { 3712 + truncate_pagecache_range(inode, first_page_offset, 3713 + last_page_offset - 1); 3714 + } 3715 + 3716 + /* Wait all existing dio workers, newcomers will block on i_mutex */ 3717 + ext4_inode_block_unlocked_dio(inode); 3718 + ret = ext4_flush_unwritten_io(inode); 3719 + if (ret) 3720 + goto out_dio; 3721 + inode_dio_wait(inode); 3722 + 3723 + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3724 + credits = ext4_writepage_trans_blocks(inode); 3725 + else 3726 + credits = ext4_blocks_for_truncate(inode); 3727 + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 3728 + if (IS_ERR(handle)) { 3729 + ret = PTR_ERR(handle); 3730 + ext4_std_error(sb, ret); 3731 + goto out_dio; 3732 
+ } 3733 + 3734 + /* 3735 + * Now we need to zero out the non-page-aligned data in the 3736 + * pages at the start and tail of the hole, and unmap the 3737 + * buffer heads for the block aligned regions of the page that 3738 + * were completely zeroed. 3739 + */ 3740 + if (first_page > last_page) { 3741 + /* 3742 + * If the file space being truncated is contained 3743 + * within a page just zero out and unmap the middle of 3744 + * that page 3745 + */ 3746 + ret = ext4_discard_partial_page_buffers(handle, 3747 + mapping, offset, length, 0); 3748 + 3749 + if (ret) 3750 + goto out_stop; 3751 + } else { 3752 + /* 3753 + * zero out and unmap the partial page that contains 3754 + * the start of the hole 3755 + */ 3756 + page_len = first_page_offset - offset; 3757 + if (page_len > 0) { 3758 + ret = ext4_discard_partial_page_buffers(handle, mapping, 3759 + offset, page_len, 0); 3760 + if (ret) 3761 + goto out_stop; 3762 + } 3763 + 3764 + /* 3765 + * zero out and unmap the partial page that contains 3766 + * the end of the hole 3767 + */ 3768 + page_len = offset + length - last_page_offset; 3769 + if (page_len > 0) { 3770 + ret = ext4_discard_partial_page_buffers(handle, mapping, 3771 + last_page_offset, page_len, 0); 3772 + if (ret) 3773 + goto out_stop; 3774 + } 3775 + } 3776 + 3777 + /* 3778 + * If i_size is contained in the last page, we need to 3779 + * unmap and zero the partial page after i_size 3780 + */ 3781 + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && 3782 + inode->i_size % PAGE_CACHE_SIZE != 0) { 3783 + page_len = PAGE_CACHE_SIZE - 3784 + (inode->i_size & (PAGE_CACHE_SIZE - 1)); 3785 + 3786 + if (page_len > 0) { 3787 + ret = ext4_discard_partial_page_buffers(handle, 3788 + mapping, inode->i_size, page_len, 0); 3789 + 3790 + if (ret) 3791 + goto out_stop; 3792 + } 3793 + } 3794 + 3795 + first_block = (offset + sb->s_blocksize - 1) >> 3796 + EXT4_BLOCK_SIZE_BITS(sb); 3797 + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 3798 + 3799 + /* If 
there are no blocks to remove, return now */ 3800 + if (first_block >= stop_block) 3801 + goto out_stop; 3802 + 3803 + down_write(&EXT4_I(inode)->i_data_sem); 3804 + ext4_discard_preallocations(inode); 3805 + 3806 + ret = ext4_es_remove_extent(inode, first_block, 3807 + stop_block - first_block); 3808 + if (ret) { 3809 + up_write(&EXT4_I(inode)->i_data_sem); 3810 + goto out_stop; 3811 + } 3812 + 3813 + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3814 + ret = ext4_ext_remove_space(inode, first_block, 3815 + stop_block - 1); 3816 + else 3817 + ret = ext4_free_hole_blocks(handle, inode, first_block, 3818 + stop_block); 3819 + 3820 + ext4_discard_preallocations(inode); 3821 + up_write(&EXT4_I(inode)->i_data_sem); 3822 + if (IS_SYNC(inode)) 3823 + ext4_handle_sync(handle); 3824 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3825 + ext4_mark_inode_dirty(handle, inode); 3826 + out_stop: 3827 + ext4_journal_stop(handle); 3828 + out_dio: 3829 + ext4_inode_resume_unlocked_dio(inode); 3830 + out_mutex: 3831 + mutex_unlock(&inode->i_mutex); 3832 + return ret; 3608 3833 } 3609 3834 3610 3835 /* ··· 3807 3692 */ 3808 3693 void ext4_truncate(struct inode *inode) 3809 3694 { 3695 + struct ext4_inode_info *ei = EXT4_I(inode); 3696 + unsigned int credits; 3697 + handle_t *handle; 3698 + struct address_space *mapping = inode->i_mapping; 3699 + loff_t page_len; 3700 + 3701 + /* 3702 + * There is a possibility that we're either freeing the inode 3703 + * or it completely new indode. In those cases we might not 3704 + * have i_mutex locked because it's not necessary. 
3705 + */ 3706 + if (!(inode->i_state & (I_NEW|I_FREEING))) 3707 + WARN_ON(!mutex_is_locked(&inode->i_mutex)); 3810 3708 trace_ext4_truncate_enter(inode); 3811 3709 3812 3710 if (!ext4_can_truncate(inode)) ··· 3838 3710 return; 3839 3711 } 3840 3712 3713 + /* 3714 + * finish any pending end_io work so we won't run the risk of 3715 + * converting any truncated blocks to initialized later 3716 + */ 3717 + ext4_flush_unwritten_io(inode); 3718 + 3841 3719 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3842 - ext4_ext_truncate(inode); 3720 + credits = ext4_writepage_trans_blocks(inode); 3843 3721 else 3844 - ext4_ind_truncate(inode); 3722 + credits = ext4_blocks_for_truncate(inode); 3723 + 3724 + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 3725 + if (IS_ERR(handle)) { 3726 + ext4_std_error(inode->i_sb, PTR_ERR(handle)); 3727 + return; 3728 + } 3729 + 3730 + if (inode->i_size % PAGE_CACHE_SIZE != 0) { 3731 + page_len = PAGE_CACHE_SIZE - 3732 + (inode->i_size & (PAGE_CACHE_SIZE - 1)); 3733 + 3734 + if (ext4_discard_partial_page_buffers(handle, 3735 + mapping, inode->i_size, page_len, 0)) 3736 + goto out_stop; 3737 + } 3738 + 3739 + /* 3740 + * We add the inode to the orphan list, so that if this 3741 + * truncate spans multiple transactions, and we crash, we will 3742 + * resume the truncate when the filesystem recovers. It also 3743 + * marks the inode dirty, to catch the new size. 3744 + * 3745 + * Implication: the file must always be in a sane, consistent 3746 + * truncatable state while each transaction commits. 
3747 + */ 3748 + if (ext4_orphan_add(handle, inode)) 3749 + goto out_stop; 3750 + 3751 + down_write(&EXT4_I(inode)->i_data_sem); 3752 + 3753 + ext4_discard_preallocations(inode); 3754 + 3755 + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3756 + ext4_ext_truncate(handle, inode); 3757 + else 3758 + ext4_ind_truncate(handle, inode); 3759 + 3760 + up_write(&ei->i_data_sem); 3761 + 3762 + if (IS_SYNC(inode)) 3763 + ext4_handle_sync(handle); 3764 + 3765 + out_stop: 3766 + /* 3767 + * If this was a simple ftruncate() and the file will remain alive, 3768 + * then we need to clear up the orphan record which we created above. 3769 + * However, if this was a real unlink then we were called by 3770 + * ext4_delete_inode(), and we allow that function to clean up the 3771 + * orphan info for us. 3772 + */ 3773 + if (inode->i_nlink) 3774 + ext4_orphan_del(handle, inode); 3775 + 3776 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3777 + ext4_mark_inode_dirty(handle, inode); 3778 + ext4_journal_stop(handle); 3845 3779 3846 3780 trace_ext4_truncate_exit(inode); 3847 3781 } ··· 4011 3821 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4012 3822 ext4_fsblk_t b, end, table; 4013 3823 unsigned num; 3824 + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; 4014 3825 4015 3826 table = ext4_inode_table(sb, gdp); 4016 3827 /* s_inode_readahead_blks is always a power of 2 */ 4017 - b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 3828 + b = block & ~((ext4_fsblk_t) ra_blks - 1); 4018 3829 if (table > b) 4019 3830 b = table; 4020 - end = b + EXT4_SB(sb)->s_inode_readahead_blks; 3831 + end = b + ra_blks; 4021 3832 num = EXT4_INODES_PER_GROUP(sb); 4022 3833 if (ext4_has_group_desc_csum(sb)) 4023 3834 num -= ext4_itable_unused_count(sb, gdp); ··· 4215 4024 * NeilBrown 1999oct15 4216 4025 */ 4217 4026 if (inode->i_nlink == 0) { 4218 - if (inode->i_mode == 0 || 4219 - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4027 + if ((inode->i_mode == 0 || 4028 + 
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && 4029 + ino != EXT4_BOOT_LOADER_INO) { 4220 4030 /* this inode is deleted */ 4221 4031 ret = -ESTALE; 4222 4032 goto bad_inode; ··· 4225 4033 /* The only unlinked inodes we let through here have 4226 4034 * valid i_mode and are being read by the orphan 4227 4035 * recovery code: that's fine, we're about to complete 4228 - * the process of deleting those. */ 4036 + * the process of deleting those. 4037 + * OR it is the EXT4_BOOT_LOADER_INO which is 4038 + * not initialized on a new filesystem. */ 4229 4039 } 4230 4040 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4231 4041 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ··· 4347 4153 else 4348 4154 init_special_inode(inode, inode->i_mode, 4349 4155 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4156 + } else if (ino == EXT4_BOOT_LOADER_INO) { 4157 + make_bad_inode(inode); 4350 4158 } else { 4351 4159 ret = -EIO; 4352 4160 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);

+205 -13

fs/ext4/ioctl.c

··· 17 17 #include <asm/uaccess.h> 18 18 #include "ext4_jbd2.h" 19 19 #include "ext4.h" 20 + #include "ext4_extents.h" 20 21 21 22 #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) 23 + 24 + /** 25 + * Swap memory between @a and @b for @len bytes. 26 + * 27 + * @a: pointer to first memory area 28 + * @b: pointer to second memory area 29 + * @len: number of bytes to swap 30 + * 31 + */ 32 + static void memswap(void *a, void *b, size_t len) 33 + { 34 + unsigned char *ap, *bp; 35 + unsigned char tmp; 36 + 37 + ap = (unsigned char *)a; 38 + bp = (unsigned char *)b; 39 + while (len-- > 0) { 40 + tmp = *ap; 41 + *ap = *bp; 42 + *bp = tmp; 43 + ap++; 44 + bp++; 45 + } 46 + } 47 + 48 + /** 49 + * Swap i_data and associated attributes between @inode1 and @inode2. 50 + * This function is used for the primary swap between inode1 and inode2 51 + * and also to revert this primary swap in case of errors. 52 + * 53 + * Therefore you have to make sure, that calling this method twice 54 + * will revert all changes. 
55 + * 56 + * @inode1: pointer to first inode 57 + * @inode2: pointer to second inode 58 + */ 59 + static void swap_inode_data(struct inode *inode1, struct inode *inode2) 60 + { 61 + loff_t isize; 62 + struct ext4_inode_info *ei1; 63 + struct ext4_inode_info *ei2; 64 + 65 + ei1 = EXT4_I(inode1); 66 + ei2 = EXT4_I(inode2); 67 + 68 + memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); 69 + memswap(&inode1->i_version, &inode2->i_version, 70 + sizeof(inode1->i_version)); 71 + memswap(&inode1->i_blocks, &inode2->i_blocks, 72 + sizeof(inode1->i_blocks)); 73 + memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); 74 + memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); 75 + memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); 76 + 77 + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); 78 + memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); 79 + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); 80 + memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); 81 + memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); 82 + 83 + isize = i_size_read(inode1); 84 + i_size_write(inode1, i_size_read(inode2)); 85 + i_size_write(inode2, isize); 86 + } 87 + 88 + /** 89 + * Swap the information from the given @inode and the inode 90 + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other 91 + * important fields of the inodes. 
92 + * 93 + * @sb: the super block of the filesystem 94 + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO 95 + * 96 + */ 97 + static long swap_inode_boot_loader(struct super_block *sb, 98 + struct inode *inode) 99 + { 100 + handle_t *handle; 101 + int err; 102 + struct inode *inode_bl; 103 + struct ext4_inode_info *ei; 104 + struct ext4_inode_info *ei_bl; 105 + struct ext4_sb_info *sbi; 106 + 107 + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { 108 + err = -EINVAL; 109 + goto swap_boot_out; 110 + } 111 + 112 + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { 113 + err = -EPERM; 114 + goto swap_boot_out; 115 + } 116 + 117 + sbi = EXT4_SB(sb); 118 + ei = EXT4_I(inode); 119 + 120 + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); 121 + if (IS_ERR(inode_bl)) { 122 + err = PTR_ERR(inode_bl); 123 + goto swap_boot_out; 124 + } 125 + ei_bl = EXT4_I(inode_bl); 126 + 127 + filemap_flush(inode->i_mapping); 128 + filemap_flush(inode_bl->i_mapping); 129 + 130 + /* Protect orig inodes against a truncate and make sure, 131 + * that only 1 swap_inode_boot_loader is running. 
*/ 132 + ext4_inode_double_lock(inode, inode_bl); 133 + 134 + truncate_inode_pages(&inode->i_data, 0); 135 + truncate_inode_pages(&inode_bl->i_data, 0); 136 + 137 + /* Wait for all existing dio workers */ 138 + ext4_inode_block_unlocked_dio(inode); 139 + ext4_inode_block_unlocked_dio(inode_bl); 140 + inode_dio_wait(inode); 141 + inode_dio_wait(inode_bl); 142 + 143 + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); 144 + if (IS_ERR(handle)) { 145 + err = -EINVAL; 146 + goto swap_boot_out; 147 + } 148 + 149 + /* Protect extent tree against block allocations via delalloc */ 150 + ext4_double_down_write_data_sem(inode, inode_bl); 151 + 152 + if (inode_bl->i_nlink == 0) { 153 + /* this inode has never been used as a BOOT_LOADER */ 154 + set_nlink(inode_bl, 1); 155 + i_uid_write(inode_bl, 0); 156 + i_gid_write(inode_bl, 0); 157 + inode_bl->i_flags = 0; 158 + ei_bl->i_flags = 0; 159 + inode_bl->i_version = 1; 160 + i_size_write(inode_bl, 0); 161 + inode_bl->i_mode = S_IFREG; 162 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, 163 + EXT4_FEATURE_INCOMPAT_EXTENTS)) { 164 + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); 165 + ext4_ext_tree_init(handle, inode_bl); 166 + } else 167 + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); 168 + } 169 + 170 + swap_inode_data(inode, inode_bl); 171 + 172 + inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); 173 + 174 + spin_lock(&sbi->s_next_gen_lock); 175 + inode->i_generation = sbi->s_next_generation++; 176 + inode_bl->i_generation = sbi->s_next_generation++; 177 + spin_unlock(&sbi->s_next_gen_lock); 178 + 179 + ext4_discard_preallocations(inode); 180 + 181 + err = ext4_mark_inode_dirty(handle, inode); 182 + if (err < 0) { 183 + ext4_warning(inode->i_sb, 184 + "couldn't mark inode #%lu dirty (err %d)", 185 + inode->i_ino, err); 186 + /* Revert all changes: */ 187 + swap_inode_data(inode, inode_bl); 188 + } else { 189 + err = ext4_mark_inode_dirty(handle, inode_bl); 190 + if (err < 0) { 191 + 
ext4_warning(inode_bl->i_sb, 192 + "couldn't mark inode #%lu dirty (err %d)", 193 + inode_bl->i_ino, err); 194 + /* Revert all changes: */ 195 + swap_inode_data(inode, inode_bl); 196 + ext4_mark_inode_dirty(handle, inode); 197 + } 198 + } 199 + 200 + ext4_journal_stop(handle); 201 + 202 + ext4_double_up_write_data_sem(inode, inode_bl); 203 + 204 + ext4_inode_resume_unlocked_dio(inode); 205 + ext4_inode_resume_unlocked_dio(inode_bl); 206 + 207 + ext4_inode_double_unlock(inode, inode_bl); 208 + 209 + iput(inode_bl); 210 + 211 + swap_boot_out: 212 + return err; 213 + } 22 214 23 215 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 24 216 { ··· 275 83 if (!capable(CAP_SYS_RESOURCE)) 276 84 goto flags_out; 277 85 } 278 - if (oldflags & EXT4_EXTENTS_FL) { 279 - /* We don't support clearning extent flags */ 280 - if (!(flags & EXT4_EXTENTS_FL)) { 281 - err = -EOPNOTSUPP; 282 - goto flags_out; 283 - } 284 - } else if (flags & EXT4_EXTENTS_FL) { 285 - /* migrate the file */ 86 + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) 286 87 migrate = 1; 287 - flags &= ~EXT4_EXTENTS_FL; 288 - } 289 88 290 89 if (flags & EXT4_EOFBLOCKS_FL) { 291 90 /* we don't support adding EOFBLOCKS flag */ ··· 320 137 err = ext4_change_inode_journal_flag(inode, jflag); 321 138 if (err) 322 139 goto flags_out; 323 - if (migrate) 324 - err = ext4_ext_migrate(inode); 140 + if (migrate) { 141 + if (flags & EXT4_EXTENTS_FL) 142 + err = ext4_ext_migrate(inode); 143 + else 144 + err = ext4_ind_migrate(inode); 145 + } 146 + 325 147 flags_out: 326 148 mutex_unlock(&inode->i_mutex); 327 149 mnt_drop_write_file(filp); ··· 545 357 return err; 546 358 } 547 359 360 + case EXT4_IOC_SWAP_BOOT: 361 + if (!(filp->f_mode & FMODE_WRITE)) 362 + return -EBADF; 363 + return swap_inode_boot_loader(sb, inode); 364 + 548 365 case EXT4_IOC_RESIZE_FS: { 549 366 ext4_fsblk_t n_blocks_count; 550 - struct super_block *sb = inode->i_sb; 551 367 int err = 0, err2 = 0; 552 368 ext4_group_t o_group = 
EXT4_SB(sb)->s_groups_count; 553 369

+186 -73

fs/ext4/mballoc.c

··· 405 405 ext4_clear_bit(bit, addr); 406 406 } 407 407 408 + static inline int mb_test_and_clear_bit(int bit, void *addr) 409 + { 410 + addr = mb_correct_addr_and_bit(&bit, addr); 411 + return ext4_test_and_clear_bit(bit, addr); 412 + } 413 + 408 414 static inline int mb_find_next_zero_bit(void *addr, int max, int start) 409 415 { 410 416 int fix = 0, ret, tmpmax; ··· 770 764 spin_unlock(&EXT4_SB(sb)->s_bal_lock); 771 765 } 772 766 767 + static void mb_regenerate_buddy(struct ext4_buddy *e4b) 768 + { 769 + int count; 770 + int order = 1; 771 + void *buddy; 772 + 773 + while ((buddy = mb_find_buddy(e4b, order++, &count))) { 774 + ext4_set_bits(buddy, 0, count); 775 + } 776 + e4b->bd_info->bb_fragments = 0; 777 + memset(e4b->bd_info->bb_counters, 0, 778 + sizeof(*e4b->bd_info->bb_counters) * 779 + (e4b->bd_sb->s_blocksize_bits + 2)); 780 + 781 + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, 782 + e4b->bd_bitmap, e4b->bd_group); 783 + } 784 + 773 785 /* The buddy information is attached the buddy cache inode 774 786 * for convenience. The information regarding each group 775 787 * is loaded via ext4_mb_load_buddy. 
The information involve ··· 884 860 885 861 first_block = page->index * blocks_per_page; 886 862 for (i = 0; i < blocks_per_page; i++) { 887 - int group; 888 - 889 863 group = (first_block + i) >> 1; 890 864 if (group >= ngroups) 891 865 break; ··· 1033 1011 struct page *page; 1034 1012 int ret = 0; 1035 1013 1014 + might_sleep(); 1036 1015 mb_debug(1, "init group %u\n", group); 1037 1016 this_grp = ext4_get_group_info(sb, group); 1038 1017 /* ··· 1105 1082 struct ext4_sb_info *sbi = EXT4_SB(sb); 1106 1083 struct inode *inode = sbi->s_buddy_cache; 1107 1084 1085 + might_sleep(); 1108 1086 mb_debug(1, "load group %u\n", group); 1109 1087 1110 1088 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ··· 1268 1244 } 1269 1245 } 1270 1246 1247 + /* clear bits in given range 1248 + * will return first found zero bit if any, -1 otherwise 1249 + */ 1250 + static int mb_test_and_clear_bits(void *bm, int cur, int len) 1251 + { 1252 + __u32 *addr; 1253 + int zero_bit = -1; 1254 + 1255 + len = cur + len; 1256 + while (cur < len) { 1257 + if ((cur & 31) == 0 && (len - cur) >= 32) { 1258 + /* fast path: clear whole word at once */ 1259 + addr = bm + (cur >> 3); 1260 + if (*addr != (__u32)(-1) && zero_bit == -1) 1261 + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); 1262 + *addr = 0; 1263 + cur += 32; 1264 + continue; 1265 + } 1266 + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) 1267 + zero_bit = cur; 1268 + cur++; 1269 + } 1270 + 1271 + return zero_bit; 1272 + } 1273 + 1271 1274 void ext4_set_bits(void *bm, int cur, int len) 1272 1275 { 1273 1276 __u32 *addr; ··· 1313 1262 } 1314 1263 } 1315 1264 1316 - static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1317 - int first, int count) 1265 + /* 1266 + * _________________________________________________________________ */ 1267 + 1268 + static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) 1318 1269 { 1319 - int block = 0; 1320 - int max = 0; 1321 - int order; 1322 - void 
*buddy; 1323 - void *buddy2; 1270 + if (mb_test_bit(*bit + side, bitmap)) { 1271 + mb_clear_bit(*bit, bitmap); 1272 + (*bit) -= side; 1273 + return 1; 1274 + } 1275 + else { 1276 + (*bit) += side; 1277 + mb_set_bit(*bit, bitmap); 1278 + return -1; 1279 + } 1280 + } 1281 + 1282 + static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) 1283 + { 1284 + int max; 1285 + int order = 1; 1286 + void *buddy = mb_find_buddy(e4b, order, &max); 1287 + 1288 + while (buddy) { 1289 + void *buddy2; 1290 + 1291 + /* Bits in range [first; last] are known to be set since 1292 + * corresponding blocks were allocated. Bits in range 1293 + * (first; last) will stay set because they form buddies on 1294 + * upper layer. We just deal with borders if they don't 1295 + * align with upper layer and then go up. 1296 + * Releasing entire group is all about clearing 1297 + * single bit of highest order buddy. 1298 + */ 1299 + 1300 + /* Example: 1301 + * --------------------------------- 1302 + * | 1 | 1 | 1 | 1 | 1303 + * --------------------------------- 1304 + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1305 + * --------------------------------- 1306 + * 0 1 2 3 4 5 6 7 1307 + * \_____________________/ 1308 + * 1309 + * Neither [1] nor [6] is aligned to above layer. 1310 + * Left neighbour [0] is free, so mark it busy, 1311 + * decrease bb_counters and extend range to 1312 + * [0; 6] 1313 + * Right neighbour [7] is busy. It can't be coaleasced with [6], so 1314 + * mark [6] free, increase bb_counters and shrink range to 1315 + * [0; 5]. 1316 + * Then shift range to [0; 2], go up and do the same. 
1317 + */ 1318 + 1319 + 1320 + if (first & 1) 1321 + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); 1322 + if (!(last & 1)) 1323 + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); 1324 + if (first > last) 1325 + break; 1326 + order++; 1327 + 1328 + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { 1329 + mb_clear_bits(buddy, first, last - first + 1); 1330 + e4b->bd_info->bb_counters[order - 1] += last - first + 1; 1331 + break; 1332 + } 1333 + first >>= 1; 1334 + last >>= 1; 1335 + buddy = buddy2; 1336 + } 1337 + } 1338 + 1339 + static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1340 + int first, int count) 1341 + { 1342 + int left_is_free = 0; 1343 + int right_is_free = 0; 1344 + int block; 1345 + int last = first + count - 1; 1324 1346 struct super_block *sb = e4b->bd_sb; 1325 1347 1326 - BUG_ON(first + count > (sb->s_blocksize << 3)); 1348 + BUG_ON(last >= (sb->s_blocksize << 3)); 1327 1349 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1328 1350 mb_check_buddy(e4b); 1329 1351 mb_free_blocks_double(inode, e4b, first, count); ··· 1405 1281 if (first < e4b->bd_info->bb_first_free) 1406 1282 e4b->bd_info->bb_first_free = first; 1407 1283 1408 - /* let's maintain fragments counter */ 1284 + /* access memory sequentially: check left neighbour, 1285 + * clear range and then check right neighbour 1286 + */ 1409 1287 if (first != 0) 1410 - block = !mb_test_bit(first - 1, e4b->bd_bitmap); 1411 - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1412 - max = !mb_test_bit(first + count, e4b->bd_bitmap); 1413 - if (block && max) 1288 + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); 1289 + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); 1290 + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) 1291 + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); 1292 + 1293 + if (unlikely(block != -1)) { 1294 + ext4_fsblk_t blocknr; 1295 + 1296 + blocknr = 
ext4_group_first_block_no(sb, e4b->bd_group); 1297 + blocknr += EXT4_C2B(EXT4_SB(sb), block); 1298 + ext4_grp_locked_error(sb, e4b->bd_group, 1299 + inode ? inode->i_ino : 0, 1300 + blocknr, 1301 + "freeing already freed block " 1302 + "(bit %u)", block); 1303 + mb_regenerate_buddy(e4b); 1304 + goto done; 1305 + } 1306 + 1307 + /* let's maintain fragments counter */ 1308 + if (left_is_free && right_is_free) 1414 1309 e4b->bd_info->bb_fragments--; 1415 - else if (!block && !max) 1310 + else if (!left_is_free && !right_is_free) 1416 1311 e4b->bd_info->bb_fragments++; 1417 1312 1418 - /* let's maintain buddy itself */ 1419 - while (count-- > 0) { 1420 - block = first++; 1421 - order = 0; 1422 - 1423 - if (!mb_test_bit(block, e4b->bd_bitmap)) { 1424 - ext4_fsblk_t blocknr; 1425 - 1426 - blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1427 - blocknr += EXT4_C2B(EXT4_SB(sb), block); 1428 - ext4_grp_locked_error(sb, e4b->bd_group, 1429 - inode ? inode->i_ino : 0, 1430 - blocknr, 1431 - "freeing already freed block " 1432 - "(bit %u)", block); 1433 - } 1434 - mb_clear_bit(block, e4b->bd_bitmap); 1435 - e4b->bd_info->bb_counters[order]++; 1436 - 1437 - /* start of the buddy */ 1438 - buddy = mb_find_buddy(e4b, order, &max); 1439 - 1440 - do { 1441 - block &= ~1UL; 1442 - if (mb_test_bit(block, buddy) || 1443 - mb_test_bit(block + 1, buddy)) 1444 - break; 1445 - 1446 - /* both the buddies are free, try to coalesce them */ 1447 - buddy2 = mb_find_buddy(e4b, order + 1, &max); 1448 - 1449 - if (!buddy2) 1450 - break; 1451 - 1452 - if (order > 0) { 1453 - /* for special purposes, we don't set 1454 - * free bits in bitmap */ 1455 - mb_set_bit(block, buddy); 1456 - mb_set_bit(block + 1, buddy); 1457 - } 1458 - e4b->bd_info->bb_counters[order]--; 1459 - e4b->bd_info->bb_counters[order]--; 1460 - 1461 - block = block >> 1; 1462 - order++; 1463 - e4b->bd_info->bb_counters[order]++; 1464 - 1465 - mb_clear_bit(block, buddy2); 1466 - buddy = buddy2; 1467 - } while (1); 1313 + /* 
buddy[0] == bd_bitmap is a special case, so handle 1314 + * it right away and let mb_buddy_mark_free stay free of 1315 + * zero order checks. 1316 + * Check if neighbours are to be coaleasced, 1317 + * adjust bitmap bb_counters and borders appropriately. 1318 + */ 1319 + if (first & 1) { 1320 + first += !left_is_free; 1321 + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; 1468 1322 } 1323 + if (!(last & 1)) { 1324 + last -= !right_is_free; 1325 + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; 1326 + } 1327 + 1328 + if (first <= last) 1329 + mb_buddy_mark_free(e4b, first >> 1, last >> 1); 1330 + 1331 + done: 1469 1332 mb_set_largest_free_order(sb, e4b->bd_info); 1470 1333 mb_check_buddy(e4b); 1471 1334 } ··· 3453 3342 if (pa->pa_type == MB_GROUP_PA) 3454 3343 grp_blk--; 3455 3344 3456 - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3345 + grp = ext4_get_group_number(sb, grp_blk); 3457 3346 3458 3347 /* 3459 3348 * possible race: ··· 3918 3807 3919 3808 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3920 3809 BUG_ON(pa->pa_type != MB_INODE_PA); 3921 - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 3810 + group = ext4_get_group_number(sb, pa->pa_pstart); 3922 3811 3923 3812 err = ext4_mb_load_buddy(sb, group, &e4b); 3924 3813 if (err) { ··· 4180 4069 4181 4070 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 4182 4071 4183 - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4072 + group = ext4_get_group_number(sb, pa->pa_pstart); 4184 4073 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4185 4074 ext4_error(sb, "Error loading buddy information for %u", 4186 4075 group); ··· 4328 4217 unsigned int inquota = 0; 4329 4218 unsigned int reserv_clstrs = 0; 4330 4219 4220 + might_sleep(); 4331 4221 sb = ar->inode->i_sb; 4332 4222 sbi = EXT4_SB(sb); 4333 4223 ··· 4532 4420 node = rb_prev(new_node); 4533 4421 if (node) { 4534 4422 entry = rb_entry(node, struct ext4_free_data, efd_node); 4535 - if 
(can_merge(entry, new_entry)) { 4423 + if (can_merge(entry, new_entry) && 4424 + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { 4536 4425 new_entry->efd_start_cluster = entry->efd_start_cluster; 4537 4426 new_entry->efd_count += entry->efd_count; 4538 4427 rb_erase(node, &(db->bb_free_root)); 4539 - ext4_journal_callback_del(handle, &entry->efd_jce); 4540 4428 kmem_cache_free(ext4_free_data_cachep, entry); 4541 4429 } 4542 4430 } ··· 4544 4432 node = rb_next(new_node); 4545 4433 if (node) { 4546 4434 entry = rb_entry(node, struct ext4_free_data, efd_node); 4547 - if (can_merge(new_entry, entry)) { 4435 + if (can_merge(new_entry, entry) && 4436 + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { 4548 4437 new_entry->efd_count += entry->efd_count; 4549 4438 rb_erase(node, &(db->bb_free_root)); 4550 - ext4_journal_callback_del(handle, &entry->efd_jce); 4551 4439 kmem_cache_free(ext4_free_data_cachep, entry); 4552 4440 } 4553 4441 } ··· 4582 4470 int err = 0; 4583 4471 int ret; 4584 4472 4473 + might_sleep(); 4585 4474 if (bh) { 4586 4475 if (block) 4587 4476 BUG_ON(block != bh->b_blocknr);

+61 -1

fs/ext4/migrate.c

··· 426 426 return retval; 427 427 } 428 428 return retval; 429 - 430 429 } 431 430 432 431 int ext4_ext_migrate(struct inode *inode) ··· 604 605 iput(tmp_inode); 605 606 606 607 return retval; 608 + } 609 + 610 + /* 611 + * Migrate a simple extent-based inode to use the i_blocks[] array 612 + */ 613 + int ext4_ind_migrate(struct inode *inode) 614 + { 615 + struct ext4_extent_header *eh; 616 + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 617 + struct ext4_inode_info *ei = EXT4_I(inode); 618 + struct ext4_extent *ex; 619 + unsigned int i, len; 620 + ext4_fsblk_t blk; 621 + handle_t *handle; 622 + int ret; 623 + 624 + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 625 + EXT4_FEATURE_INCOMPAT_EXTENTS) || 626 + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 627 + return -EINVAL; 628 + 629 + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 630 + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) 631 + return -EOPNOTSUPP; 632 + 633 + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); 634 + if (IS_ERR(handle)) 635 + return PTR_ERR(handle); 636 + 637 + down_write(&EXT4_I(inode)->i_data_sem); 638 + ret = ext4_ext_check_inode(inode); 639 + if (ret) 640 + goto errout; 641 + 642 + eh = ext_inode_hdr(inode); 643 + ex = EXT_FIRST_EXTENT(eh); 644 + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || 645 + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { 646 + ret = -EOPNOTSUPP; 647 + goto errout; 648 + } 649 + if (eh->eh_entries == 0) 650 + blk = len = 0; 651 + else { 652 + len = le16_to_cpu(ex->ee_len); 653 + blk = ext4_ext_pblock(ex); 654 + if (len > EXT4_NDIR_BLOCKS) { 655 + ret = -EOPNOTSUPP; 656 + goto errout; 657 + } 658 + } 659 + 660 + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); 661 + memset(ei->i_data, 0, sizeof(ei->i_data)); 662 + for (i=0; i < len; i++) 663 + ei->i_data[i] = cpu_to_le32(blk++); 664 + ext4_mark_inode_dirty(handle, inode); 665 + errout: 666 + ext4_journal_stop(handle); 667 + up_write(&EXT4_I(inode)->i_data_sem); 668 + return ret; 607 669 }

+3 -3

fs/ext4/mmp.c

··· 7 7 #include "ext4.h" 8 8 9 9 /* Checksumming functions */ 10 - static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) 10 + static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) 11 11 { 12 12 struct ext4_sb_info *sbi = EXT4_SB(sb); 13 13 int offset = offsetof(struct mmp_struct, mmp_checksum); ··· 54 54 lock_buffer(bh); 55 55 bh->b_end_io = end_buffer_write_sync; 56 56 get_bh(bh); 57 - submit_bh(WRITE_SYNC, bh); 57 + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); 58 58 wait_on_buffer(bh); 59 59 sb_end_write(sb); 60 60 if (unlikely(!buffer_uptodate(bh))) ··· 86 86 get_bh(*bh); 87 87 lock_buffer(*bh); 88 88 (*bh)->b_end_io = end_buffer_read_sync; 89 - submit_bh(READ_SYNC, *bh); 89 + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); 90 90 wait_on_buffer(*bh); 91 91 if (!buffer_uptodate(*bh)) { 92 92 brelse(*bh);

+35 -38

fs/ext4/move_extent.c

··· 144 144 } 145 145 146 146 /** 147 - * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem 147 + * ext4_double_down_write_data_sem - Acquire two inodes' write lock 148 + * of i_data_sem 148 149 * 149 150 * Acquire write lock of i_data_sem of the two inodes 150 151 */ 151 - static void 152 - double_down_write_data_sem(struct inode *first, struct inode *second) 152 + void 153 + ext4_double_down_write_data_sem(struct inode *first, struct inode *second) 153 154 { 154 155 if (first < second) { 155 156 down_write(&EXT4_I(first)->i_data_sem); ··· 163 162 } 164 163 165 164 /** 166 - * double_up_write_data_sem - Release two inodes' write lock of i_data_sem 165 + * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem 167 166 * 168 167 * @orig_inode: original inode structure to be released its lock first 169 168 * @donor_inode: donor inode structure to be released its lock second 170 169 * Release write lock of i_data_sem of two inodes (orig and donor). 
171 170 */ 172 - static void 173 - double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 171 + void 172 + ext4_double_up_write_data_sem(struct inode *orig_inode, 173 + struct inode *donor_inode) 174 174 { 175 175 up_write(&EXT4_I(orig_inode)->i_data_sem); 176 176 up_write(&EXT4_I(donor_inode)->i_data_sem); ··· 409 407 mext_insert_inside_block(o_start, o_end, start_ext, new_ext, 410 408 end_ext, eh, range_to_move); 411 409 412 - if (depth) { 413 - ret = ext4_handle_dirty_metadata(handle, orig_inode, 414 - orig_path->p_bh); 415 - if (ret) 416 - return ret; 417 - } else { 418 - ret = ext4_mark_inode_dirty(handle, orig_inode); 419 - if (ret < 0) 420 - return ret; 421 - } 422 - 423 - return 0; 410 + return ext4_ext_dirty(handle, orig_inode, orig_path); 424 411 } 425 412 426 413 /** ··· 728 737 donor_off += dext_alen; 729 738 orig_off += dext_alen; 730 739 740 + BUG_ON(replaced_count > count); 731 741 /* Already moved the expected blocks */ 732 742 if (replaced_count >= count) 733 743 break; ··· 806 814 page_cache_release(page[0]); 807 815 return -ENOMEM; 808 816 } 809 - 817 + /* 818 + * grab_cache_page_write_begin() may not wait on page's writeback if 819 + * BDI not demand that. But it is reasonable to be very conservative 820 + * here and explicitly wait on page's writeback 821 + */ 822 + wait_on_page_writeback(page[0]); 823 + wait_on_page_writeback(page[1]); 810 824 if (inode1 > inode2) { 811 825 struct page *tmp; 812 826 tmp = page[0]; ··· 854 856 if (buffer_uptodate(bh)) 855 857 continue; 856 858 if (!buffer_mapped(bh)) { 857 - int err = 0; 858 859 err = ext4_get_block(inode, block, bh, 0); 859 860 if (err) { 860 861 SetPageError(page); ··· 973 976 * necessary, just swap data blocks between orig and donor. 
974 977 */ 975 978 if (uninit) { 976 - double_down_write_data_sem(orig_inode, donor_inode); 979 + ext4_double_down_write_data_sem(orig_inode, donor_inode); 977 980 /* If any of extents in range became initialized we have to 978 981 * fallback to data copying */ 979 982 uninit = mext_check_coverage(orig_inode, orig_blk_offset, ··· 987 990 goto drop_data_sem; 988 991 989 992 if (!uninit) { 990 - double_up_write_data_sem(orig_inode, donor_inode); 993 + ext4_double_up_write_data_sem(orig_inode, donor_inode); 991 994 goto data_copy; 992 995 } 993 996 if ((page_has_private(pagep[0]) && ··· 1001 1004 donor_inode, orig_blk_offset, 1002 1005 block_len_in_page, err); 1003 1006 drop_data_sem: 1004 - double_up_write_data_sem(orig_inode, donor_inode); 1007 + ext4_double_up_write_data_sem(orig_inode, donor_inode); 1005 1008 goto unlock_pages; 1006 1009 } 1007 1010 data_copy: ··· 1030 1033 } 1031 1034 /* Perform all necessary steps similar write_begin()/write_end() 1032 1035 * but keeping in mind that i_size will not change */ 1033 - *err = __block_write_begin(pagep[0], from, from + replaced_size, 1036 + *err = __block_write_begin(pagep[0], from, replaced_size, 1034 1037 ext4_get_block); 1035 1038 if (!*err) 1036 1039 *err = block_commit_write(pagep[0], from, from + replaced_size); ··· 1062 1065 * Extents are swapped already, but we are not able to copy data. 
1063 1066 * Try to swap extents to it's original places 1064 1067 */ 1065 - double_down_write_data_sem(orig_inode, donor_inode); 1068 + ext4_double_down_write_data_sem(orig_inode, donor_inode); 1066 1069 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, 1067 1070 orig_blk_offset, 1068 1071 block_len_in_page, &err2); 1069 - double_up_write_data_sem(orig_inode, donor_inode); 1072 + ext4_double_up_write_data_sem(orig_inode, donor_inode); 1070 1073 if (replaced_count != block_len_in_page) { 1071 1074 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), 1072 1075 "Unable to copy data block," ··· 1206 1209 } 1207 1210 1208 1211 /** 1209 - * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 1212 + * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 1210 1213 * 1211 1214 * @inode1: the inode structure 1212 1215 * @inode2: the inode structure 1213 1216 * 1214 1217 * Lock two inodes' i_mutex 1215 1218 */ 1216 - static void 1217 - mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1219 + void 1220 + ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) 1218 1221 { 1219 1222 BUG_ON(inode1 == inode2); 1220 1223 if (inode1 < inode2) { ··· 1227 1230 } 1228 1231 1229 1232 /** 1230 - * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 1233 + * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 1231 1234 * 1232 1235 * @inode1: the inode that is released first 1233 1236 * @inode2: the inode that is released second 1234 1237 * 1235 1238 */ 1236 1239 1237 - static void 1238 - mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1240 + void 1241 + ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1239 1242 { 1240 1243 mutex_unlock(&inode1->i_mutex); 1241 1244 mutex_unlock(&inode2->i_mutex); ··· 1330 1333 return -EINVAL; 1331 1334 } 1332 1335 /* Protect orig and donor inodes against a truncate */ 1333 - 
mext_inode_double_lock(orig_inode, donor_inode); 1336 + ext4_inode_double_lock(orig_inode, donor_inode); 1334 1337 1335 1338 /* Wait for all existing dio workers */ 1336 1339 ext4_inode_block_unlocked_dio(orig_inode); ··· 1339 1342 inode_dio_wait(donor_inode); 1340 1343 1341 1344 /* Protect extent tree against block allocations via delalloc */ 1342 - double_down_write_data_sem(orig_inode, donor_inode); 1345 + ext4_double_down_write_data_sem(orig_inode, donor_inode); 1343 1346 /* Check the filesystem environment whether move_extent can be done */ 1344 1347 ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 1345 1348 donor_start, &len); ··· 1463 1466 * b. racing with ->readpage, ->write_begin, and ext4_get_block 1464 1467 * in move_extent_per_page 1465 1468 */ 1466 - double_up_write_data_sem(orig_inode, donor_inode); 1469 + ext4_double_up_write_data_sem(orig_inode, donor_inode); 1467 1470 1468 1471 while (orig_page_offset <= seq_end_page) { 1469 1472 ··· 1497 1500 block_len_in_page = rest_blocks; 1498 1501 } 1499 1502 1500 - double_down_write_data_sem(orig_inode, donor_inode); 1503 + ext4_double_down_write_data_sem(orig_inode, donor_inode); 1501 1504 if (ret < 0) 1502 1505 break; 1503 1506 ··· 1535 1538 ext4_ext_drop_refs(holecheck_path); 1536 1539 kfree(holecheck_path); 1537 1540 } 1538 - double_up_write_data_sem(orig_inode, donor_inode); 1541 + ext4_double_up_write_data_sem(orig_inode, donor_inode); 1539 1542 ext4_inode_resume_unlocked_dio(orig_inode); 1540 1543 ext4_inode_resume_unlocked_dio(donor_inode); 1541 - mext_inode_double_unlock(orig_inode, donor_inode); 1544 + ext4_inode_double_unlock(orig_inode, donor_inode); 1542 1545 1543 1546 return ret; 1544 1547 }

+19 -29

fs/ext4/namei.c

··· 416 416 { 417 417 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 418 418 struct ext4_inode_info *ei = EXT4_I(inode); 419 - __u32 csum, old_csum; 419 + __u32 csum; 420 + __le32 save_csum; 420 421 int size; 421 422 422 423 size = count_offset + (count * sizeof(struct dx_entry)); 423 - old_csum = t->dt_checksum; 424 + save_csum = t->dt_checksum; 424 425 t->dt_checksum = 0; 425 426 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); 426 427 csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); 427 - t->dt_checksum = old_csum; 428 + t->dt_checksum = save_csum; 428 429 429 430 return cpu_to_le32(csum); 430 431 } ··· 972 971 hinfo.hash_version += 973 972 EXT4_SB(dir->i_sb)->s_hash_unsigned; 974 973 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 974 + if (ext4_has_inline_data(dir)) { 975 + int has_inline_data = 1; 976 + count = htree_inlinedir_to_tree(dir_file, dir, 0, 977 + &hinfo, start_hash, 978 + start_minor_hash, 979 + &has_inline_data); 980 + if (has_inline_data) { 981 + *next_hash = ~0; 982 + return count; 983 + } 984 + } 975 985 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 976 986 start_hash, start_minor_hash); 977 987 *next_hash = ~0; ··· 1465 1453 } 1466 1454 1467 1455 return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); 1468 - } 1469 - 1470 - #define S_SHIFT 12 1471 - static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { 1472 - [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, 1473 - [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, 1474 - [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, 1475 - [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, 1476 - [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, 1477 - [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, 1478 - [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, 1479 - }; 1480 - 1481 - static inline void ext4_set_de_type(struct super_block *sb, 1482 - struct ext4_dir_entry_2 *de, 1483 - umode_t mode) { 1484 - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) 1485 - de->file_type = ext4_type_by_mode[(mode & 
S_IFMT)>>S_SHIFT]; 1486 1456 } 1487 1457 1488 1458 /* ··· 2245 2251 dquot_initialize(dir); 2246 2252 2247 2253 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2248 - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2249 - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 2254 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); 2250 2255 retry: 2251 2256 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, 2252 2257 NULL, EXT4_HT_DIR, credits); ··· 2279 2286 dquot_initialize(dir); 2280 2287 2281 2288 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2283 - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 2289 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); 2284 2290 retry: 2285 2291 inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, 2286 2292 NULL, EXT4_HT_DIR, credits); ··· 2388 2396 dquot_initialize(dir); 2389 2397 2390 2398 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2391 - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2392 - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 2399 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); 2393 2400 retry: 2394 2401 inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, 2395 2402 &dentry->d_name, ··· 2817 2826 * quota blocks, sb is already counted in previous macros). 2818 2827 */ 2819 2828 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2820 - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2821 - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); 2829 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; 2822 2830 } 2823 2831 retry: 2824 2832 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,

+150 -130

fs/ext4/page-io.c

··· 29 29 #include "xattr.h" 30 30 #include "acl.h" 31 31 32 - static struct kmem_cache *io_page_cachep, *io_end_cachep; 32 + static struct kmem_cache *io_end_cachep; 33 33 34 34 int __init ext4_init_pageio(void) 35 35 { 36 - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 37 - if (io_page_cachep == NULL) 38 - return -ENOMEM; 39 36 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 40 - if (io_end_cachep == NULL) { 41 - kmem_cache_destroy(io_page_cachep); 37 + if (io_end_cachep == NULL) 42 38 return -ENOMEM; 43 - } 44 39 return 0; 45 40 } 46 41 47 42 void ext4_exit_pageio(void) 48 43 { 49 44 kmem_cache_destroy(io_end_cachep); 50 - kmem_cache_destroy(io_page_cachep); 51 45 } 52 46 53 47 /* ··· 61 67 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 62 68 } 63 69 64 - static void put_io_page(struct ext4_io_page *io_page) 70 + static void ext4_release_io_end(ext4_io_end_t *io_end) 65 71 { 66 - if (atomic_dec_and_test(&io_page->p_count)) { 67 - end_page_writeback(io_page->p_page); 68 - put_page(io_page->p_page); 69 - kmem_cache_free(io_page_cachep, io_page); 70 - } 72 + BUG_ON(!list_empty(&io_end->list)); 73 + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 74 + 75 + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) 76 + wake_up_all(ext4_ioend_wq(io_end->inode)); 77 + if (io_end->flag & EXT4_IO_END_DIRECT) 78 + inode_dio_done(io_end->inode); 79 + if (io_end->iocb) 80 + aio_complete(io_end->iocb, io_end->result, 0); 81 + kmem_cache_free(io_end_cachep, io_end); 71 82 } 72 83 73 - void ext4_free_io_end(ext4_io_end_t *io) 84 + static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) 74 85 { 75 - int i; 86 + struct inode *inode = io_end->inode; 76 87 77 - BUG_ON(!io); 78 - BUG_ON(!list_empty(&io->list)); 79 - BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); 80 - 81 - for (i = 0; i < io->num_io_pages; i++) 82 - put_io_page(io->pages[i]); 83 - io->num_io_pages = 0; 84 - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 85 - 
wake_up_all(ext4_ioend_wq(io->inode)); 86 - kmem_cache_free(io_end_cachep, io); 88 + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; 89 + /* Wake up anyone waiting on unwritten extent conversion */ 90 + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 91 + wake_up_all(ext4_ioend_wq(inode)); 87 92 } 88 93 89 94 /* check a range of space and convert unwritten extents to written. */ ··· 105 112 "(inode %lu, offset %llu, size %zd, error %d)", 106 113 inode->i_ino, offset, size, ret); 107 114 } 108 - /* Wake up anyone waiting on unwritten extent conversion */ 109 - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 110 - wake_up_all(ext4_ioend_wq(inode)); 111 - if (io->flag & EXT4_IO_END_DIRECT) 112 - inode_dio_done(inode); 113 - if (io->iocb) 114 - aio_complete(io->iocb, io->result, 0); 115 + ext4_clear_io_unwritten_flag(io); 116 + ext4_release_io_end(io); 115 117 return ret; 116 118 } 117 119 ··· 137 149 } 138 150 139 151 /* Add the io_end to per-inode completed end_io list. */ 140 - void ext4_add_complete_io(ext4_io_end_t *io_end) 152 + static void ext4_add_complete_io(ext4_io_end_t *io_end) 141 153 { 142 154 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 143 155 struct workqueue_struct *wq; ··· 174 186 err = ext4_end_io(io); 175 187 if (unlikely(!ret && err)) 176 188 ret = err; 177 - io->flag &= ~EXT4_IO_END_UNWRITTEN; 178 - ext4_free_io_end(io); 179 189 } 180 190 return ret; 181 191 } ··· 205 219 atomic_inc(&EXT4_I(inode)->i_ioend_count); 206 220 io->inode = inode; 207 221 INIT_LIST_HEAD(&io->list); 222 + atomic_set(&io->count, 1); 208 223 } 209 224 return io; 225 + } 226 + 227 + void ext4_put_io_end_defer(ext4_io_end_t *io_end) 228 + { 229 + if (atomic_dec_and_test(&io_end->count)) { 230 + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { 231 + ext4_release_io_end(io_end); 232 + return; 233 + } 234 + ext4_add_complete_io(io_end); 235 + } 236 + } 237 + 238 + int ext4_put_io_end(ext4_io_end_t *io_end) 239 + { 240 + int err = 0; 241 + 242 + if 
(atomic_dec_and_test(&io_end->count)) { 243 + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 244 + err = ext4_convert_unwritten_extents(io_end->inode, 245 + io_end->offset, io_end->size); 246 + ext4_clear_io_unwritten_flag(io_end); 247 + } 248 + ext4_release_io_end(io_end); 249 + } 250 + return err; 251 + } 252 + 253 + ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) 254 + { 255 + atomic_inc(&io_end->count); 256 + return io_end; 210 257 } 211 258 212 259 /* ··· 262 243 ext4_io_end_t *io_end = bio->bi_private; 263 244 struct inode *inode; 264 245 int i; 246 + int blocksize; 265 247 sector_t bi_sector = bio->bi_sector; 266 248 267 249 BUG_ON(!io_end); 250 + inode = io_end->inode; 251 + blocksize = 1 << inode->i_blkbits; 268 252 bio->bi_private = NULL; 269 253 bio->bi_end_io = NULL; 270 254 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 271 255 error = 0; 272 - bio_put(bio); 273 - 274 - for (i = 0; i < io_end->num_io_pages; i++) { 275 - struct page *page = io_end->pages[i]->p_page; 256 + for (i = 0; i < bio->bi_vcnt; i++) { 257 + struct bio_vec *bvec = &bio->bi_io_vec[i]; 258 + struct page *page = bvec->bv_page; 276 259 struct buffer_head *bh, *head; 277 - loff_t offset; 278 - loff_t io_end_offset; 260 + unsigned bio_start = bvec->bv_offset; 261 + unsigned bio_end = bio_start + bvec->bv_len; 262 + unsigned under_io = 0; 263 + unsigned long flags; 264 + 265 + if (!page) 266 + continue; 279 267 280 268 if (error) { 281 269 SetPageError(page); 282 270 set_bit(AS_EIO, &page->mapping->flags); 283 - head = page_buffers(page); 284 - BUG_ON(!head); 285 - 286 - io_end_offset = io_end->offset + io_end->size; 287 - 288 - offset = (sector_t) page->index << PAGE_CACHE_SHIFT; 289 - bh = head; 290 - do { 291 - if ((offset >= io_end->offset) && 292 - (offset+bh->b_size <= io_end_offset)) 293 - buffer_io_error(bh); 294 - 295 - offset += bh->b_size; 296 - bh = bh->b_this_page; 297 - } while (bh != head); 298 271 } 299 - 300 - put_io_page(io_end->pages[i]); 272 + bh = head = 
page_buffers(page); 273 + /* 274 + * We check all buffers in the page under BH_Uptodate_Lock 275 + * to avoid races with other end io clearing async_write flags 276 + */ 277 + local_irq_save(flags); 278 + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); 279 + do { 280 + if (bh_offset(bh) < bio_start || 281 + bh_offset(bh) + blocksize > bio_end) { 282 + if (buffer_async_write(bh)) 283 + under_io++; 284 + continue; 285 + } 286 + clear_buffer_async_write(bh); 287 + if (error) 288 + buffer_io_error(bh); 289 + } while ((bh = bh->b_this_page) != head); 290 + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); 291 + local_irq_restore(flags); 292 + if (!under_io) 293 + end_page_writeback(page); 301 294 } 302 - io_end->num_io_pages = 0; 303 - inode = io_end->inode; 295 + bio_put(bio); 304 296 305 297 if (error) { 306 298 io_end->flag |= EXT4_IO_END_ERROR; ··· 324 294 bi_sector >> (inode->i_blkbits - 9)); 325 295 } 326 296 327 - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 328 - ext4_free_io_end(io_end); 329 - return; 330 - } 331 - 332 - ext4_add_complete_io(io_end); 297 + ext4_put_io_end_defer(io_end); 333 298 } 334 299 335 300 void ext4_io_submit(struct ext4_io_submit *io) ··· 338 313 bio_put(io->io_bio); 339 314 } 340 315 io->io_bio = NULL; 341 - io->io_op = 0; 316 + } 317 + 318 + void ext4_io_submit_init(struct ext4_io_submit *io, 319 + struct writeback_control *wbc) 320 + { 321 + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); 322 + io->io_bio = NULL; 342 323 io->io_end = NULL; 343 324 } 344 325 345 - static int io_submit_init(struct ext4_io_submit *io, 346 - struct inode *inode, 347 - struct writeback_control *wbc, 348 - struct buffer_head *bh) 326 + static int io_submit_init_bio(struct ext4_io_submit *io, 327 + struct buffer_head *bh) 349 328 { 350 - ext4_io_end_t *io_end; 351 - struct page *page = bh->b_page; 352 329 int nvecs = bio_get_nr_vecs(bh->b_bdev); 353 330 struct bio *bio; 354 331 355 - io_end = ext4_init_io_end(inode, GFP_NOFS); 356 - if (!io_end) 357 - return -ENOMEM; 358 332 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 359 333 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 360 334 bio->bi_bdev = bh->b_bdev; 361 - bio->bi_private = io->io_end = io_end; 362 335 bio->bi_end_io = ext4_end_bio; 363 - 364 - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 365 - 336 + bio->bi_private = ext4_get_io_end(io->io_end); 337 + if (!io->io_end->size) 338 + io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) 339 + + bh_offset(bh); 366 340 io->io_bio = bio; 367 - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); 368 341 io->io_next_block = bh->b_blocknr; 369 342 return 0; 370 343 } 371 344 372 345 static int io_submit_add_bh(struct ext4_io_submit *io, 373 - struct ext4_io_page *io_page, 374 346 struct inode *inode, 375 - struct writeback_control *wbc, 376 347 struct buffer_head *bh) 377 348 { 378 349 ext4_io_end_t *io_end; 379 350 int ret; 380 - 381 - if (buffer_new(bh)) { 382 - clear_buffer_new(bh); 383 - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 384 - } 385 351 386 352 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 387 353 submit_and_retry: 388 354 ext4_io_submit(io); 389 355 } 390 356 if (io->io_bio == NULL) { 391 - ret = io_submit_init(io, inode, wbc, bh); 357 + ret = io_submit_init_bio(io, bh); 392 358 if (ret) 393 359 return ret; 394 360 } 395 - io_end = io->io_end; 396 - if ((io_end->num_io_pages >= MAX_IO_PAGES) && 397 - (io_end->pages[io_end->num_io_pages-1] != io_page)) 398 - goto submit_and_retry; 399 - if (buffer_uninit(bh)) 400 - ext4_set_io_unwritten_flag(inode, io_end); 401 - io->io_end->size += bh->b_size; 402 - io->io_next_block++; 403 361 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 404 362 if (ret != bh->b_size) 405 363 goto submit_and_retry; 406 - if ((io_end->num_io_pages == 0) || 407 - (io_end->pages[io_end->num_io_pages-1] != io_page)) { 408 - io_end->pages[io_end->num_io_pages++] = io_page; 409 - atomic_inc(&io_page->p_count); 410 - } 364 + io_end = io->io_end; 365 + if (test_clear_buffer_uninit(bh)) 366 + ext4_set_io_unwritten_flag(inode, io_end); 367 + io_end->size += bh->b_size; 368 + io->io_next_block++; 411 369 return 0; 412 370 } 413 371 ··· 400 392 struct writeback_control *wbc) 401 393 { 402 394 struct inode *inode = page->mapping->host; 403 - unsigned block_start, block_end, blocksize; 404 - struct ext4_io_page *io_page; 395 + unsigned block_start, blocksize; 405 396 struct buffer_head *bh, *head; 406 397 int ret = 0; 398 + int nr_submitted = 0; 407 399 408 400 blocksize = 
1 << inode->i_blkbits; 409 401 410 402 BUG_ON(!PageLocked(page)); 411 403 BUG_ON(PageWriteback(page)); 412 404 413 - io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); 414 - if (!io_page) { 415 - redirty_page_for_writepage(wbc, page); 416 - unlock_page(page); 417 - return -ENOMEM; 418 - } 419 - io_page->p_page = page; 420 - atomic_set(&io_page->p_count, 1); 421 - get_page(page); 422 405 set_page_writeback(page); 423 406 ClearPageError(page); 424 407 425 - for (bh = head = page_buffers(page), block_start = 0; 426 - bh != head || !block_start; 427 - block_start = block_end, bh = bh->b_this_page) { 428 - 429 - block_end = block_start + blocksize; 408 + /* 409 + * In the first loop we prepare and mark buffers to submit. We have to 410 + * mark all buffers in the page before submitting so that 411 + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO 412 + * on the first buffer finishes and we are still working on submitting 413 + * the second buffer. 414 + */ 415 + bh = head = page_buffers(page); 416 + do { 417 + block_start = bh_offset(bh); 430 418 if (block_start >= len) { 431 419 /* 432 420 * Comments copied from block_write_full_page_endio: ··· 435 431 * mapped, and writes to that region are not written 436 432 * out to the file." 
437 433 */ 438 - zero_user_segment(page, block_start, block_end); 434 + zero_user_segment(page, block_start, 435 + block_start + blocksize); 439 436 clear_buffer_dirty(bh); 440 437 set_buffer_uptodate(bh); 441 438 continue; ··· 450 445 ext4_io_submit(io); 451 446 continue; 452 447 } 453 - ret = io_submit_add_bh(io, io_page, inode, wbc, bh); 448 + if (buffer_new(bh)) { 449 + clear_buffer_new(bh); 450 + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 451 + } 452 + set_buffer_async_write(bh); 453 + } while ((bh = bh->b_this_page) != head); 454 + 455 + /* Now submit buffers to write */ 456 + bh = head = page_buffers(page); 457 + do { 458 + if (!buffer_async_write(bh)) 459 + continue; 460 + ret = io_submit_add_bh(io, inode, bh); 454 461 if (ret) { 455 462 /* 456 463 * We only get here on ENOMEM. Not much else ··· 472 455 redirty_page_for_writepage(wbc, page); 473 456 break; 474 457 } 458 + nr_submitted++; 475 459 clear_buffer_dirty(bh); 460 + } while ((bh = bh->b_this_page) != head); 461 + 462 + /* Error stopped previous loop? Clean up buffers... */ 463 + if (ret) { 464 + do { 465 + clear_buffer_async_write(bh); 466 + bh = bh->b_this_page; 467 + } while (bh != head); 476 468 } 477 469 unlock_page(page); 478 - /* 479 - * If the page was truncated before we could do the writeback, 480 - * or we had a memory allocation error while trying to write 481 - * the first buffer head, we won't have submitted any pages for 482 - * I/O. In that case we need to make sure we've cleared the 483 - * PageWriteback bit from the page to prevent the system from 484 - * wedging later on. 485 - */ 486 - put_io_page(io_page); 470 + /* Nothing submitted - we have to end page writeback */ 471 + if (!nr_submitted) 472 + end_page_writeback(page); 487 473 return ret; 488 474 }

+11 -5

fs/ext4/resize.c

··· 272 272 if (start_blk >= last_blk) 273 273 goto next_group; 274 274 group_data[bb_index].block_bitmap = start_blk++; 275 - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); 275 + group = ext4_get_group_number(sb, start_blk - 1); 276 276 group -= group_data[0].group; 277 277 group_data[group].free_blocks_count--; 278 278 if (flexbg_size > 1) ··· 284 284 if (start_blk >= last_blk) 285 285 goto next_group; 286 286 group_data[ib_index].inode_bitmap = start_blk++; 287 - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); 287 + group = ext4_get_group_number(sb, start_blk - 1); 288 288 group -= group_data[0].group; 289 289 group_data[group].free_blocks_count--; 290 290 if (flexbg_size > 1) ··· 296 296 if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) 297 297 goto next_group; 298 298 group_data[it_index].inode_table = start_blk; 299 - ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); 299 + group = ext4_get_group_number(sb, start_blk - 1); 300 300 group -= group_data[0].group; 301 301 group_data[group].free_blocks_count -= 302 302 EXT4_SB(sb)->s_itb_per_group; ··· 392 392 ext4_group_t group; 393 393 int err; 394 394 395 - ext4_get_group_no_and_offset(sb, block, &group, NULL); 395 + group = ext4_get_group_number(sb, block); 396 396 start = ext4_group_first_block_no(sb, group); 397 397 group -= flex_gd->groups[0].group; 398 398 ··· 1341 1341 1342 1342 /* Update the global fs size fields */ 1343 1343 sbi->s_groups_count += flex_gd->count; 1344 + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, 1345 + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); 1344 1346 1345 1347 /* Update the reserved block counts only once the new group is 1346 1348 * active. 
*/ ··· 1881 1879 /* Nothing need to do */ 1882 1880 return 0; 1883 1881 1884 - ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1882 + n_group = ext4_get_group_number(sb, n_blocks_count - 1); 1883 + if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { 1884 + ext4_warning(sb, "resize would cause inodes_count overflow"); 1885 + return -EINVAL; 1886 + } 1885 1887 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); 1886 1888 1887 1889 n_desc_blocks = num_desc_blocks(sb, n_group + 1);

+105 -26

fs/ext4/super.c

··· 81 81 static void ext4_destroy_lazyinit_thread(void); 82 82 static void ext4_unregister_li_request(struct super_block *sb); 83 83 static void ext4_clear_request_list(void); 84 + static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); 84 85 85 86 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 86 87 static struct file_system_type ext2_fs_type = { ··· 354 353 struct super_block *sb = journal->j_private; 355 354 struct ext4_sb_info *sbi = EXT4_SB(sb); 356 355 int error = is_journal_aborted(journal); 357 - struct ext4_journal_cb_entry *jce, *tmp; 356 + struct ext4_journal_cb_entry *jce; 358 357 358 + BUG_ON(txn->t_state == T_FINISHED); 359 359 spin_lock(&sbi->s_md_lock); 360 - list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { 360 + while (!list_empty(&txn->t_private_list)) { 361 + jce = list_entry(txn->t_private_list.next, 362 + struct ext4_journal_cb_entry, jce_list); 361 363 list_del_init(&jce->jce_list); 362 364 spin_unlock(&sbi->s_md_lock); 363 365 jce->jce_func(sb, jce, error); ··· 1952 1948 if ((sbi->s_es->s_feature_ro_compat & 1953 1949 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { 1954 1950 /* Use new metadata_csum algorithm */ 1955 - __u16 old_csum; 1951 + __le16 save_csum; 1956 1952 __u32 csum32; 1957 1953 1958 - old_csum = gdp->bg_checksum; 1954 + save_csum = gdp->bg_checksum; 1959 1955 gdp->bg_checksum = 0; 1960 1956 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, 1961 1957 sizeof(le_group)); 1962 1958 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, 1963 1959 sbi->s_desc_size); 1964 - gdp->bg_checksum = old_csum; 1960 + gdp->bg_checksum = save_csum; 1965 1961 1966 1962 crc = csum32 & 0xFFFF; 1967 1963 goto out; ··· 2383 2379 int offset; 2384 2380 }; 2385 2381 2386 - static int parse_strtoul(const char *buf, 2387 - unsigned long max, unsigned long *value) 2382 + static int parse_strtoull(const char *buf, 2383 + unsigned long long max, unsigned 
long long *value) 2388 2384 { 2389 - char *endp; 2385 + int ret; 2390 2386 2391 - *value = simple_strtoul(skip_spaces(buf), &endp, 0); 2392 - endp = skip_spaces(endp); 2393 - if (*endp || *value > max) 2394 - return -EINVAL; 2395 - 2396 - return 0; 2387 + ret = kstrtoull(skip_spaces(buf), 0, value); 2388 + if (!ret && *value > max) 2389 + ret = -EINVAL; 2390 + return ret; 2397 2391 } 2398 2392 2399 2393 static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, ··· 2433 2431 const char *buf, size_t count) 2434 2432 { 2435 2433 unsigned long t; 2434 + int ret; 2436 2435 2437 - if (parse_strtoul(buf, 0x40000000, &t)) 2438 - return -EINVAL; 2436 + ret = kstrtoul(skip_spaces(buf), 0, &t); 2437 + if (ret) 2438 + return ret; 2439 2439 2440 - if (t && !is_power_of_2(t)) 2440 + if (t && (!is_power_of_2(t) || t > 0x40000000)) 2441 2441 return -EINVAL; 2442 2442 2443 2443 sbi->s_inode_readahead_blks = t; ··· 2460 2456 { 2461 2457 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2462 2458 unsigned long t; 2459 + int ret; 2463 2460 2464 - if (parse_strtoul(buf, 0xffffffff, &t)) 2465 - return -EINVAL; 2461 + ret = kstrtoul(skip_spaces(buf), 0, &t); 2462 + if (ret) 2463 + return ret; 2466 2464 *ui = t; 2467 2465 return count; 2466 + } 2467 + 2468 + static ssize_t reserved_clusters_show(struct ext4_attr *a, 2469 + struct ext4_sb_info *sbi, char *buf) 2470 + { 2471 + return snprintf(buf, PAGE_SIZE, "%llu\n", 2472 + (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); 2473 + } 2474 + 2475 + static ssize_t reserved_clusters_store(struct ext4_attr *a, 2476 + struct ext4_sb_info *sbi, 2477 + const char *buf, size_t count) 2478 + { 2479 + unsigned long long val; 2480 + int ret; 2481 + 2482 + if (parse_strtoull(buf, -1ULL, &val)) 2483 + return -EINVAL; 2484 + ret = ext4_reserve_clusters(sbi, val); 2485 + 2486 + return ret ? 
ret : count; 2468 2487 } 2469 2488 2470 2489 static ssize_t trigger_test_error(struct ext4_attr *a, ··· 2527 2500 EXT4_RO_ATTR(delayed_allocation_blocks); 2528 2501 EXT4_RO_ATTR(session_write_kbytes); 2529 2502 EXT4_RO_ATTR(lifetime_write_kbytes); 2503 + EXT4_RW_ATTR(reserved_clusters); 2530 2504 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2531 2505 inode_readahead_blks_store, s_inode_readahead_blks); 2532 2506 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); ··· 2545 2517 ATTR_LIST(delayed_allocation_blocks), 2546 2518 ATTR_LIST(session_write_kbytes), 2547 2519 ATTR_LIST(lifetime_write_kbytes), 2520 + ATTR_LIST(reserved_clusters), 2548 2521 ATTR_LIST(inode_readahead_blks), 2549 2522 ATTR_LIST(inode_goal), 2550 2523 ATTR_LIST(mb_stats), ··· 3221 3192 return 0; 3222 3193 } 3223 3194 3195 + 3196 + static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) 3197 + { 3198 + ext4_fsblk_t resv_clusters; 3199 + 3200 + /* 3201 + * By default we reserve 2% or 4096 clusters, whichever is smaller. 3202 + * This should cover the situations where we can not afford to run 3203 + * out of space like for example punch hole, or converting 3204 + * uninitialized extents in delalloc path. In most cases such 3205 + * allocation would require 1, or 2 blocks, higher numbers are 3206 + * very rare. 
3207 + */ 3208 + resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; 3209 + 3210 + do_div(resv_clusters, 50); 3211 + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); 3212 + 3213 + return resv_clusters; 3214 + } 3215 + 3216 + 3217 + static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) 3218 + { 3219 + ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> 3220 + sbi->s_cluster_bits; 3221 + 3222 + if (count >= clusters) 3223 + return -EINVAL; 3224 + 3225 + atomic64_set(&sbi->s_resv_clusters, count); 3226 + return 0; 3227 + } 3228 + 3224 3229 static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3225 3230 { 3226 3231 char *orig_data = kstrdup(data, GFP_KERNEL); ··· 3589 3526 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 3590 3527 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 3591 3528 3529 + /* Do we have standard group size of blocksize * 8 blocks ? */ 3530 + if (sbi->s_blocks_per_group == blocksize << 3) 3531 + set_opt2(sb, STD_GROUP_SIZE); 3532 + 3592 3533 for (i = 0; i < 4; i++) 3593 3534 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 3594 3535 sbi->s_def_hash_version = es->s_def_hash_version; ··· 3765 3698 sbi->s_err_report.function = print_daily_error_info; 3766 3699 sbi->s_err_report.data = (unsigned long) sb; 3767 3700 3701 + /* Register extent status tree shrinker */ 3702 + ext4_es_register_shrinker(sb); 3703 + 3768 3704 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3769 3705 ext4_count_free_clusters(sb)); 3770 3706 if (!err) { ··· 3792 3722 sbi->s_stripe = ext4_get_stripe_size(sbi); 3793 3723 sbi->s_max_writeback_mb_bump = 128; 3794 3724 sbi->s_extent_max_zeroout_kb = 32; 3795 - 3796 - /* Register extent status tree shrinker */ 3797 - ext4_es_register_shrinker(sb); 3798 3725 3799 3726 /* 3800 3727 * set up enough so that it can read an inode ··· 3978 3911 "available"); 3979 3912 } 3980 3913 3914 + err = ext4_reserve_clusters(sbi, 
ext4_calculate_resv_clusters(sbi)); 3915 + if (err) { 3916 + ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " 3917 + "reserved pool", ext4_calculate_resv_clusters(sbi)); 3918 + goto failed_mount4a; 3919 + } 3920 + 3981 3921 err = ext4_setup_system_zone(sb); 3982 3922 if (err) { 3983 3923 ext4_msg(sb, KERN_ERR, "failed to initialize system " ··· 4084 4010 sbi->s_journal = NULL; 4085 4011 } 4086 4012 failed_mount3: 4013 + ext4_es_unregister_shrinker(sb); 4087 4014 del_timer(&sbi->s_err_report); 4088 4015 if (sbi->s_flex_groups) 4089 4016 ext4_kvfree(sbi->s_flex_groups); ··· 4252 4177 goto out_bdev; 4253 4178 } 4254 4179 journal->j_private = sb; 4255 - ll_rw_block(READ, 1, &journal->j_sb_buffer); 4180 + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); 4256 4181 wait_on_buffer(journal->j_sb_buffer); 4257 4182 if (!buffer_uptodate(journal->j_sb_buffer)) { 4258 4183 ext4_msg(sb, KERN_ERR, "I/O error on journal device"); ··· 4817 4742 struct super_block *sb = dentry->d_sb; 4818 4743 struct ext4_sb_info *sbi = EXT4_SB(sb); 4819 4744 struct ext4_super_block *es = sbi->s_es; 4820 - ext4_fsblk_t overhead = 0; 4745 + ext4_fsblk_t overhead = 0, resv_blocks; 4821 4746 u64 fsid; 4822 4747 s64 bfree; 4748 + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); 4823 4749 4824 4750 if (!test_opt(sb, MINIX_DF)) 4825 4751 overhead = sbi->s_overhead; ··· 4832 4756 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); 4833 4757 /* prevent underflow in case that few free space is available */ 4834 4758 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); 4835 - buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4836 - if (buf->f_bfree < ext4_r_blocks_count(es)) 4759 + buf->f_bavail = buf->f_bfree - 4760 + (ext4_r_blocks_count(es) + resv_blocks); 4761 + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) 4837 4762 buf->f_bavail = 0; 4838 4763 buf->f_files = le32_to_cpu(es->s_inodes_count); 4839 4764 buf->f_ffree = 
percpu_counter_sum_positive(&sbi->s_freeinodes_counter); ··· 5022 4945 return PTR_ERR(qf_inode); 5023 4946 } 5024 4947 4948 + /* Don't account quota for quota files to avoid recursion */ 4949 + qf_inode->i_flags |= S_NOQUOTA; 5025 4950 err = dquot_enable(qf_inode, type, format_id, flags); 5026 4951 iput(qf_inode); 5027 4952

+7 -6

fs/ext4/xattr.c

··· 122 122 struct ext4_xattr_header *hdr) 123 123 { 124 124 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 125 - __u32 csum, old; 125 + __u32 csum; 126 + __le32 save_csum; 127 + __le64 dsk_block_nr = cpu_to_le64(block_nr); 126 128 127 - old = hdr->h_checksum; 129 + save_csum = hdr->h_checksum; 128 130 hdr->h_checksum = 0; 129 - block_nr = cpu_to_le64(block_nr); 130 - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, 131 - sizeof(block_nr)); 131 + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, 132 + sizeof(dsk_block_nr)); 132 133 csum = ext4_chksum(sbi, csum, (__u8 *)hdr, 133 134 EXT4_BLOCK_SIZE(inode->i_sb)); 134 135 135 - hdr->h_checksum = old; 136 + hdr->h_checksum = save_csum; 136 137 return cpu_to_le32(csum); 137 138 } 138 139

+1

fs/ext4/xattr.h

··· 22 22 #define EXT4_XATTR_INDEX_LUSTRE 5 23 23 #define EXT4_XATTR_INDEX_SECURITY 6 24 24 #define EXT4_XATTR_INDEX_SYSTEM 7 25 + #define EXT4_XATTR_INDEX_RICHACL 8 25 26 26 27 struct ext4_xattr_header { 27 28 __le32 h_magic; /* magic number for identification */

+28 -22

fs/jbd2/commit.c

··· 382 382 int space_left = 0; 383 383 int first_tag = 0; 384 384 int tag_flag; 385 - int i, to_free = 0; 385 + int i; 386 386 int tag_bytes = journal_tag_bytes(journal); 387 387 struct buffer_head *cbh = NULL; /* For transactional checksums */ 388 388 __u32 crc32_sum = ~0; ··· 1134 1134 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1135 1135 spin_unlock(&journal->j_history_lock); 1136 1136 1137 - commit_transaction->t_state = T_FINISHED; 1137 + commit_transaction->t_state = T_COMMIT_CALLBACK; 1138 1138 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1139 1139 journal->j_commit_sequence = commit_transaction->t_tid; 1140 1140 journal->j_committing_transaction = NULL; ··· 1149 1149 journal->j_average_commit_time*3) / 4; 1150 1150 else 1151 1151 journal->j_average_commit_time = commit_time; 1152 + 1152 1153 write_unlock(&journal->j_state_lock); 1153 1154 1154 - if (commit_transaction->t_checkpoint_list == NULL && 1155 - commit_transaction->t_checkpoint_io_list == NULL) { 1156 - __jbd2_journal_drop_transaction(journal, commit_transaction); 1157 - to_free = 1; 1155 + if (journal->j_checkpoint_transactions == NULL) { 1156 + journal->j_checkpoint_transactions = commit_transaction; 1157 + commit_transaction->t_cpnext = commit_transaction; 1158 + commit_transaction->t_cpprev = commit_transaction; 1158 1159 } else { 1159 - if (journal->j_checkpoint_transactions == NULL) { 1160 - journal->j_checkpoint_transactions = commit_transaction; 1161 - commit_transaction->t_cpnext = commit_transaction; 1162 - commit_transaction->t_cpprev = commit_transaction; 1163 - } else { 1164 - commit_transaction->t_cpnext = 1165 - journal->j_checkpoint_transactions; 1166 - commit_transaction->t_cpprev = 1167 - commit_transaction->t_cpnext->t_cpprev; 1168 - commit_transaction->t_cpnext->t_cpprev = 1160 + commit_transaction->t_cpnext = 1161 + journal->j_checkpoint_transactions; 1162 + commit_transaction->t_cpprev = 1163 + 
commit_transaction->t_cpnext->t_cpprev; 1164 + commit_transaction->t_cpnext->t_cpprev = 1165 + commit_transaction; 1166 + commit_transaction->t_cpprev->t_cpnext = 1169 1167 commit_transaction; 1170 - commit_transaction->t_cpprev->t_cpnext = 1171 - commit_transaction; 1172 - } 1173 1168 } 1174 1169 spin_unlock(&journal->j_list_lock); 1175 - 1170 + /* Drop all spin_locks because commit_callback may block. 1171 + * __journal_remove_checkpoint() can not destroy transaction 1172 + * under us because it is not marked as T_FINISHED yet */ 1176 1173 if (journal->j_commit_callback) 1177 1174 journal->j_commit_callback(journal, commit_transaction); 1178 1175 1179 1176 trace_jbd2_end_commit(journal, commit_transaction); 1180 1177 jbd_debug(1, "JBD2: commit %d complete, head %d\n", 1181 1178 journal->j_commit_sequence, journal->j_tail_sequence); 1182 - if (to_free) 1183 - jbd2_journal_free_transaction(commit_transaction); 1184 1179 1180 + write_lock(&journal->j_state_lock); 1181 + spin_lock(&journal->j_list_lock); 1182 + commit_transaction->t_state = T_FINISHED; 1183 + /* Recheck checkpoint lists after j_list_lock was dropped */ 1184 + if (commit_transaction->t_checkpoint_list == NULL && 1185 + commit_transaction->t_checkpoint_io_list == NULL) { 1186 + __jbd2_journal_drop_transaction(journal, commit_transaction); 1187 + jbd2_journal_free_transaction(commit_transaction); 1188 + } 1189 + spin_unlock(&journal->j_list_lock); 1190 + write_unlock(&journal->j_state_lock); 1185 1191 wake_up(&journal->j_wait_done_commit); 1186 1192 }

+31

fs/jbd2/journal.c

··· 708 708 } 709 709 710 710 /* 711 + * When this function returns the transaction corresponding to tid 712 + * will be completed. If the transaction is currently running, start 713 + * committing that transaction before waiting for it to complete. If 714 + * the transaction id is stale, it is by definition already completed, 715 + * so just return SUCCESS. 716 + */ 717 + int jbd2_complete_transaction(journal_t *journal, tid_t tid) 718 + { 719 + int need_to_wait = 1; 720 + 721 + read_lock(&journal->j_state_lock); 722 + if (journal->j_running_transaction && 723 + journal->j_running_transaction->t_tid == tid) { 724 + if (journal->j_commit_request != tid) { 725 + /* transaction not yet started, so request it */ 726 + read_unlock(&journal->j_state_lock); 727 + jbd2_log_start_commit(journal, tid); 728 + goto wait_commit; 729 + } 730 + } else if (!(journal->j_committing_transaction && 731 + journal->j_committing_transaction->t_tid == tid)) 732 + need_to_wait = 0; 733 + read_unlock(&journal->j_state_lock); 734 + if (!need_to_wait) 735 + return 0; 736 + wait_commit: 737 + return jbd2_log_wait_commit(journal, tid); 738 + } 739 + EXPORT_SYMBOL(jbd2_complete_transaction); 740 + 741 + /* 711 * Log buffer allocation routines: 712 */ 713 714

+8 -1

fs/jbd2/transaction.c

··· 332 332 handle_t *handle = jbd2_alloc_handle(GFP_NOFS); 333 333 if (!handle) 334 334 return NULL; 335 - memset(handle, 0, sizeof(*handle)); 336 335 handle->h_buffer_credits = nblocks; 337 336 handle->h_ref = 1; 338 337 ··· 639 640 int error; 640 641 char *frozen_buffer = NULL; 641 642 int need_copy = 0; 643 + unsigned long start_lock, time_lock; 642 644 643 645 if (is_handle_aborted(handle)) 644 646 return -EROFS; ··· 655 655 656 656 /* @@@ Need to check for errors here at some point. */ 657 657 658 + start_lock = jiffies; 658 659 lock_buffer(bh); 659 660 jbd_lock_bh_state(bh); 661 + 662 + /* If it takes too long to lock the buffer, trace it */ 663 + time_lock = jbd2_time_diff(start_lock, jiffies); 664 + if (time_lock > HZ/10) 665 + trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, 666 + jiffies_to_msecs(time_lock)); 660 667 661 668 /* We now hold the buffer lock so it is safe to query the buffer 662 669 * state. Is the buffer dirty?

+4

include/linux/buffer_head.h

··· 34 34 BH_Write_EIO, /* I/O error on write */ 35 35 BH_Unwritten, /* Buffer is allocated on disk but not written */ 36 36 BH_Quiet, /* Buffer Error Printks to be quiet */ 37 + BH_Meta, /* Buffer contains metadata */ 38 + BH_Prio, /* Buffer should be submitted with REQ_PRIO */ 37 39 38 40 BH_PrivateStart,/* not a state bit, but the first bit available 39 41 * for private allocation by other entities ··· 126 124 BUFFER_FNS(Boundary, boundary) 127 125 BUFFER_FNS(Write_EIO, write_io_error) 128 126 BUFFER_FNS(Unwritten, unwritten) 127 + BUFFER_FNS(Meta, meta) 128 + BUFFER_FNS(Prio, prio) 129 129 130 130 #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) 131 131

+3 -1

include/linux/jbd2.h

··· 480 480 T_COMMIT, 481 481 T_COMMIT_DFLUSH, 482 482 T_COMMIT_JFLUSH, 483 + T_COMMIT_CALLBACK, 483 484 T_FINISHED 484 485 } t_state; 485 486 ··· 1145 1144 1146 1145 static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) 1147 1146 { 1148 - return kmem_cache_alloc(jbd2_handle_cache, gfp_flags); 1147 + return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags); 1149 1148 } 1150 1149 1151 1150 static inline void jbd2_free_handle(handle_t *handle) ··· 1201 1200 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); 1202 1201 int jbd2_journal_force_commit_nested(journal_t *journal); 1203 1202 int jbd2_log_wait_commit(journal_t *journal, tid_t tid); 1203 + int jbd2_complete_transaction(journal_t *journal, tid_t tid); 1204 1204 int jbd2_log_do_checkpoint(journal_t *journal); 1205 1205 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); 1206 1206

+2 -9

include/linux/journal-head.h

··· 31 31 /* 32 32 * Journalling list for this buffer [jbd_lock_bh_state()] 33 33 */ 34 - unsigned b_jlist; 34 + unsigned b_jlist:4; 35 35 36 36 /* 37 37 * This flag signals the buffer has been modified by 38 38 * the currently running transaction 39 39 * [jbd_lock_bh_state()] 40 40 */ 41 - unsigned b_modified; 42 - 43 - /* 44 - * This feild tracks the last transaction id in which this buffer 45 - * has been cowed 46 - * [jbd_lock_bh_state()] 47 - */ 48 - tid_t b_cow_tid; 41 + unsigned b_modified:1; 49 42 50 43 /* 51 44 * Copy of the buffer data frozen for writing to the log.

+4 -12

include/trace/events/ext4.h

··· 257 257 __entry->pos, __entry->len, __entry->copied) 258 258 ); 259 259 260 - DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, 261 - 262 - TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 263 - unsigned int copied), 264 - 265 - TP_ARGS(inode, pos, len, copied) 266 - ); 267 - 268 - DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end, 260 + DEFINE_EVENT(ext4__write_end, ext4_write_end, 269 261 270 262 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, 271 263 unsigned int copied), ··· 1948 1956 __entry->to = to; 1949 1957 __entry->partial = partial_cluster; 1950 1958 __entry->ee_pblk = ext4_ext_pblock(ex); 1951 - __entry->ee_lblk = cpu_to_le32(ex->ee_block); 1959 + __entry->ee_lblk = le32_to_cpu(ex->ee_block); 1952 1960 __entry->ee_len = ext4_ext_get_actual_len(ex); 1953 1961 ), 1954 1962 ··· 2052 2060 2053 2061 TRACE_EVENT(ext4_ext_remove_space_done, 2054 2062 TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, 2055 - ext4_lblk_t partial, unsigned short eh_entries), 2063 + ext4_lblk_t partial, __le16 eh_entries), 2056 2064 2057 2065 TP_ARGS(inode, start, depth, partial, eh_entries), 2058 2066 ··· 2071 2079 __entry->start = start; 2072 2080 __entry->depth = depth; 2073 2081 __entry->partial = partial; 2074 - __entry->eh_entries = eh_entries; 2082 + __entry->eh_entries = le16_to_cpu(eh_entries); 2075 2083 ), 2076 2084 2077 2085 TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "

+21

include/trace/events/jbd2.h

··· 358 358 MINOR(__entry->dev), __entry->write_op) 359 359 ); 360 360 361 + TRACE_EVENT(jbd2_lock_buffer_stall, 362 + 363 + TP_PROTO(dev_t dev, unsigned long stall_ms), 364 + 365 + TP_ARGS(dev, stall_ms), 366 + 367 + TP_STRUCT__entry( 368 + __field( dev_t, dev ) 369 + __field(unsigned long, stall_ms ) 370 + ), 371 + 372 + TP_fast_assign( 373 + __entry->dev = dev; 374 + __entry->stall_ms = stall_ms; 375 + ), 376 + 377 + TP_printk("dev %d,%d stall_ms %lu", 378 + MAJOR(__entry->dev), MINOR(__entry->dev), 379 + __entry->stall_ms) 380 + ); 381 + 361 382 #endif /* _TRACE_JBD2_H */ 362 383 363 384 /* This part must be outside protection */