Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"Major changes for 3.14 include support for the newly added ZERO_RANGE
and COLLAPSE_RANGE fallocate operations, and scalability improvements
in the jbd2 layer and in xattr handling when the extended attributes
spill over into an external block.

Other than that, the usual clean ups and minor bug fixes"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (42 commits)
ext4: fix premature freeing of partial clusters split across leaf blocks
ext4: remove unneeded test of ret variable
ext4: fix comment typo
ext4: make ext4_block_zero_page_range static
ext4: atomically set inode->i_flags in ext4_set_inode_flags()
ext4: optimize Hurd tests when reading/writing inodes
ext4: kill i_version support for Hurd-castrated file systems
ext4: each filesystem creates and uses its own mb_cache
fs/mbcache.c: decouple the locking of local from global data
fs/mbcache.c: change block and index hash chain to hlist_bl_node
ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate
ext4: refactor ext4_fallocate code
ext4: Update inode i_size after the preallocation
ext4: fix partial cluster handling for bigalloc file systems
ext4: delete path dealloc code in ext4_ext_handle_uninitialized_extents
ext4: only call sync_filesystem() when remounting read-only
fs: push sync_filesystem() down to the file system's remount_fs()
jbd2: improve error messages for inconsistent journal heads
jbd2: minimize region locked by j_list_lock in jbd2_journal_forget()
jbd2: minimize region locked by j_list_lock in journal_get_create_access()
...

+1515 -502
+1
fs/adfs/super.c
··· 212 212 213 213 static int adfs_remount(struct super_block *sb, int *flags, char *data) 214 214 { 215 + sync_filesystem(sb); 215 216 *flags |= MS_NODIRATIME; 216 217 return parse_options(sb, data); 217 218 }
+1
fs/affs/super.c
··· 530 530 531 531 pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); 532 532 533 + sync_filesystem(sb); 533 534 *flags |= MS_NODIRATIME; 534 535 535 536 memcpy(volume, sbi->s_volume, 32);
+1
fs/befs/linuxvfs.c
··· 913 913 static int 914 914 befs_remount(struct super_block *sb, int *flags, char *data) 915 915 { 916 + sync_filesystem(sb); 916 917 if (!(*flags & MS_RDONLY)) 917 918 return -EINVAL; 918 919 return 0;
+1
fs/btrfs/super.c
··· 1380 1380 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1381 1381 int ret; 1382 1382 1383 + sync_filesystem(sb); 1383 1384 btrfs_remount_prepare(fs_info); 1384 1385 1385 1386 ret = btrfs_parse_options(root, data);
+1
fs/cifs/cifsfs.c
··· 541 541 542 542 static int cifs_remount(struct super_block *sb, int *flags, char *data) 543 543 { 544 + sync_filesystem(sb); 544 545 *flags |= MS_NODIRATIME; 545 546 return 0; 546 547 }
+1
fs/coda/inode.c
··· 96 96 97 97 static int coda_remount(struct super_block *sb, int *flags, char *data) 98 98 { 99 + sync_filesystem(sb); 99 100 *flags |= MS_NOATIME; 100 101 return 0; 101 102 }
+1
fs/cramfs/inode.c
··· 243 243 244 244 static int cramfs_remount(struct super_block *sb, int *flags, char *data) 245 245 { 246 + sync_filesystem(sb); 246 247 *flags |= MS_RDONLY; 247 248 return 0; 248 249 }
+1
fs/debugfs/inode.c
··· 218 218 int err; 219 219 struct debugfs_fs_info *fsi = sb->s_fs_info; 220 220 221 + sync_filesystem(sb); 221 222 err = debugfs_parse_options(data, &fsi->mount_opts); 222 223 if (err) 223 224 goto fail;
+1
fs/devpts/inode.c
··· 313 313 struct pts_fs_info *fsi = DEVPTS_SB(sb); 314 314 struct pts_mount_opts *opts = &fsi->mount_opts; 315 315 316 + sync_filesystem(sb); 316 317 err = parse_mount_options(data, PARSE_REMOUNT, opts); 317 318 318 319 /*
+1
fs/efs/super.c
··· 114 114 115 115 static int efs_remount(struct super_block *sb, int *flags, char *data) 116 116 { 117 + sync_filesystem(sb); 117 118 *flags |= MS_RDONLY; 118 119 return 0; 119 120 }
+1
fs/ext2/super.c
··· 1254 1254 unsigned long old_sb_flags; 1255 1255 int err; 1256 1256 1257 + sync_filesystem(sb); 1257 1258 spin_lock(&sbi->s_lock); 1258 1259 1259 1260 /* Store the old options */
+2
fs/ext3/super.c
··· 2649 2649 int i; 2650 2650 #endif 2651 2651 2652 + sync_filesystem(sb); 2653 + 2652 2654 /* Store the original options */ 2653 2655 old_sb_flags = sb->s_flags; 2654 2656 old_opts.s_mount_opt = sbi->s_mount_opt;
+9 -2
fs/ext4/ext4.h
··· 31 31 #include <linux/percpu_counter.h> 32 32 #include <linux/ratelimit.h> 33 33 #include <crypto/hash.h> 34 + #include <linux/falloc.h> 34 35 #ifdef __KERNEL__ 35 36 #include <linux/compat.h> 36 37 #endif ··· 568 567 #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 569 568 /* Do not put hole in extent cache */ 570 569 #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 570 + /* Convert written extents to unwritten */ 571 + #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 571 572 572 573 /* 573 574 * The bit position of these flags must not overlap with any of the ··· 1001 998 #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group 1002 999 size of blocksize * 8 1003 1000 blocks */ 1001 + #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated 1002 + file systems */ 1004 1003 1005 1004 #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 1006 1005 ~EXT4_MOUNT_##opt ··· 1331 1326 struct list_head s_es_lru; 1332 1327 unsigned long s_es_last_sorted; 1333 1328 struct percpu_counter s_extent_cache_cnt; 1329 + struct mb_cache *s_mb_cache; 1334 1330 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1335 1331 1336 1332 /* Ratelimit ext4 messages. 
*/ ··· 2139 2133 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2140 2134 extern int ext4_block_truncate_page(handle_t *handle, 2141 2135 struct address_space *mapping, loff_t from); 2142 - extern int ext4_block_zero_page_range(handle_t *handle, 2143 - struct address_space *mapping, loff_t from, loff_t length); 2144 2136 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2145 2137 loff_t lstart, loff_t lend); 2146 2138 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); ··· 2761 2757 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2762 2758 __u64 start, __u64 len); 2763 2759 extern int ext4_ext_precache(struct inode *inode); 2760 + extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); 2764 2761 2765 2762 /* move_extent.c */ 2766 2763 extern void ext4_double_down_write_data_sem(struct inode *first, ··· 2771 2766 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2772 2767 __u64 start_orig, __u64 start_donor, 2773 2768 __u64 len, __u64 *moved_len); 2769 + extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 2770 + struct ext4_extent **extent); 2774 2771 2775 2772 /* page-io.c */ 2776 2773 extern int __init ext4_init_pageio(void);
+10
fs/ext4/ext4_jbd2.c
··· 259 259 if (WARN_ON_ONCE(err)) { 260 260 ext4_journal_abort_handle(where, line, __func__, bh, 261 261 handle, err); 262 + if (inode == NULL) { 263 + pr_err("EXT4: jbd2_journal_dirty_metadata " 264 + "failed: handle type %u started at " 265 + "line %u, credits %u/%u, errcode %d", 266 + handle->h_type, 267 + handle->h_line_no, 268 + handle->h_requested_credits, 269 + handle->h_buffer_credits, err); 270 + return err; 271 + } 262 272 ext4_error_inode(inode, where, line, 263 273 bh->b_blocknr, 264 274 "journal_dirty_metadata failed: "
+707 -113
fs/ext4/extents.c
··· 37 37 #include <linux/quotaops.h> 38 38 #include <linux/string.h> 39 39 #include <linux/slab.h> 40 - #include <linux/falloc.h> 41 40 #include <asm/uaccess.h> 42 41 #include <linux/fiemap.h> 43 42 #include "ext4_jbd2.h" ··· 1690 1691 * the extent that was written properly split out and conversion to 1691 1692 * initialized is trivial. 1692 1693 */ 1693 - if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) 1694 + if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2)) 1694 1695 return 0; 1695 1696 1696 1697 ext1_ee_len = ext4_ext_get_actual_len(ex1); ··· 1706 1707 * this can result in the top bit of ee_len being set. 1707 1708 */ 1708 1709 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1710 + return 0; 1711 + if (ext4_ext_is_uninitialized(ex1) && 1712 + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || 1713 + atomic_read(&EXT4_I(inode)->i_unwritten) || 1714 + (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN))) 1709 1715 return 0; 1710 1716 #ifdef AGGRESSIVE_TEST 1711 1717 if (ext1_ee_len >= 4) ··· 1735 1731 { 1736 1732 struct ext4_extent_header *eh; 1737 1733 unsigned int depth, len; 1738 - int merge_done = 0; 1734 + int merge_done = 0, uninit; 1739 1735 1740 1736 depth = ext_depth(inode); 1741 1737 BUG_ON(path[depth].p_hdr == NULL); ··· 1745 1741 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1746 1742 break; 1747 1743 /* merge with next extent! 
*/ 1744 + uninit = ext4_ext_is_uninitialized(ex); 1748 1745 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1749 1746 + ext4_ext_get_actual_len(ex + 1)); 1747 + if (uninit) 1748 + ext4_ext_mark_uninitialized(ex); 1750 1749 1751 1750 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1752 1751 len = (EXT_LAST_EXTENT(eh) - ex - 1) ··· 1903 1896 struct ext4_ext_path *npath = NULL; 1904 1897 int depth, len, err; 1905 1898 ext4_lblk_t next; 1906 - int mb_flags = 0; 1899 + int mb_flags = 0, uninit; 1907 1900 1908 1901 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1909 1902 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); ··· 1953 1946 path + depth); 1954 1947 if (err) 1955 1948 return err; 1956 - 1949 + uninit = ext4_ext_is_uninitialized(ex); 1957 1950 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1958 1951 + ext4_ext_get_actual_len(newext)); 1952 + if (uninit) 1953 + ext4_ext_mark_uninitialized(ex); 1959 1954 eh = path[depth].p_hdr; 1960 1955 nearex = ex; 1961 1956 goto merge; ··· 1980 1971 if (err) 1981 1972 return err; 1982 1973 1974 + uninit = ext4_ext_is_uninitialized(ex); 1983 1975 ex->ee_block = newext->ee_block; 1984 1976 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 1985 1977 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1986 1978 + ext4_ext_get_actual_len(newext)); 1979 + if (uninit) 1980 + ext4_ext_mark_uninitialized(ex); 1987 1981 eh = path[depth].p_hdr; 1988 1982 nearex = ex; 1989 1983 goto merge; ··· 2597 2585 ex_ee_block = le32_to_cpu(ex->ee_block); 2598 2586 ex_ee_len = ext4_ext_get_actual_len(ex); 2599 2587 2588 + /* 2589 + * If we're starting with an extent other than the last one in the 2590 + * node, we need to see if it shares a cluster with the extent to 2591 + * the right (towards the end of the file). If its leftmost cluster 2592 + * is this extent's rightmost cluster and it is not cluster aligned, 2593 + * we'll mark it as a partial that is not to be deallocated. 
2594 + */ 2595 + 2596 + if (ex != EXT_LAST_EXTENT(eh)) { 2597 + ext4_fsblk_t current_pblk, right_pblk; 2598 + long long current_cluster, right_cluster; 2599 + 2600 + current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; 2601 + current_cluster = (long long)EXT4_B2C(sbi, current_pblk); 2602 + right_pblk = ext4_ext_pblock(ex + 1); 2603 + right_cluster = (long long)EXT4_B2C(sbi, right_pblk); 2604 + if (current_cluster == right_cluster && 2605 + EXT4_PBLK_COFF(sbi, right_pblk)) 2606 + *partial_cluster = -right_cluster; 2607 + } 2608 + 2600 2609 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2601 2610 2602 2611 while (ex >= EXT_FIRST_EXTENT(eh) && ··· 2743 2710 err = ext4_ext_correct_indexes(handle, inode, path); 2744 2711 2745 2712 /* 2746 - * Free the partial cluster only if the current extent does not 2747 - * reference it. Otherwise we might free used cluster. 2713 + * If there's a partial cluster and at least one extent remains in 2714 + * the leaf, free the partial cluster if it isn't shared with the 2715 + * current extent. If there's a partial cluster and no extents 2716 + * remain in the leaf, it can't be freed here. It can only be 2717 + * freed when it's possible to determine if it's not shared with 2718 + * any other extent - when the next leaf is processed or when space 2719 + * removal is complete. 2748 2720 */ 2749 - if (*partial_cluster > 0 && 2721 + if (*partial_cluster > 0 && eh->eh_entries && 2750 2722 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2751 2723 *partial_cluster)) { 2752 2724 int flags = get_default_free_blocks_flags(inode); ··· 3607 3569 * b> Splits in two extents: Write is happening at either end of the extent 3608 3570 * c> Splits in three extents: Somone is writing in middle of the extent 3609 3571 * 3572 + * This works the same way in the case of initialized -> unwritten conversion. 3573 + * 3610 3574 * One of more index blocks maybe needed if the extent tree grow after 3611 3575 * the uninitialized extent split. 
To prevent ENOSPC occur at the IO 3612 3576 * complete, we need to split the uninitialized extent before DIO submit ··· 3619 3579 * 3620 3580 * Returns the size of uninitialized extent to be written on success. 3621 3581 */ 3622 - static int ext4_split_unwritten_extents(handle_t *handle, 3582 + static int ext4_split_convert_extents(handle_t *handle, 3623 3583 struct inode *inode, 3624 3584 struct ext4_map_blocks *map, 3625 3585 struct ext4_ext_path *path, ··· 3631 3591 unsigned int ee_len; 3632 3592 int split_flag = 0, depth; 3633 3593 3634 - ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3635 - "block %llu, max_blocks %u\n", inode->i_ino, 3636 - (unsigned long long)map->m_lblk, map->m_len); 3594 + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", 3595 + __func__, inode->i_ino, 3596 + (unsigned long long)map->m_lblk, map->m_len); 3637 3597 3638 3598 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3639 3599 inode->i_sb->s_blocksize_bits; ··· 3648 3608 ee_block = le32_to_cpu(ex->ee_block); 3649 3609 ee_len = ext4_ext_get_actual_len(ex); 3650 3610 3651 - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3652 - split_flag |= EXT4_EXT_MARK_UNINIT2; 3653 - if (flags & EXT4_GET_BLOCKS_CONVERT) 3654 - split_flag |= EXT4_EXT_DATA_VALID2; 3611 + /* Convert to unwritten */ 3612 + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { 3613 + split_flag |= EXT4_EXT_DATA_VALID1; 3614 + /* Convert to initialized */ 3615 + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { 3616 + split_flag |= ee_block + ee_len <= eof_block ? 
3617 + EXT4_EXT_MAY_ZEROOUT : 0; 3618 + split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2); 3619 + } 3655 3620 flags |= EXT4_GET_BLOCKS_PRE_IO; 3656 3621 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3657 3622 } 3623 + 3624 + static int ext4_convert_initialized_extents(handle_t *handle, 3625 + struct inode *inode, 3626 + struct ext4_map_blocks *map, 3627 + struct ext4_ext_path *path) 3628 + { 3629 + struct ext4_extent *ex; 3630 + ext4_lblk_t ee_block; 3631 + unsigned int ee_len; 3632 + int depth; 3633 + int err = 0; 3634 + 3635 + depth = ext_depth(inode); 3636 + ex = path[depth].p_ext; 3637 + ee_block = le32_to_cpu(ex->ee_block); 3638 + ee_len = ext4_ext_get_actual_len(ex); 3639 + 3640 + ext_debug("%s: inode %lu, logical" 3641 + "block %llu, max_blocks %u\n", __func__, inode->i_ino, 3642 + (unsigned long long)ee_block, ee_len); 3643 + 3644 + if (ee_block != map->m_lblk || ee_len > map->m_len) { 3645 + err = ext4_split_convert_extents(handle, inode, map, path, 3646 + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); 3647 + if (err < 0) 3648 + goto out; 3649 + ext4_ext_drop_refs(path); 3650 + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); 3651 + if (IS_ERR(path)) { 3652 + err = PTR_ERR(path); 3653 + goto out; 3654 + } 3655 + depth = ext_depth(inode); 3656 + ex = path[depth].p_ext; 3657 + } 3658 + 3659 + err = ext4_ext_get_access(handle, inode, path + depth); 3660 + if (err) 3661 + goto out; 3662 + /* first mark the extent as uninitialized */ 3663 + ext4_ext_mark_uninitialized(ex); 3664 + 3665 + /* note: ext4_ext_correct_indexes() isn't needed here because 3666 + * borders are not changed 3667 + */ 3668 + ext4_ext_try_to_merge(handle, inode, path, ex); 3669 + 3670 + /* Mark modified extent as dirty */ 3671 + err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3672 + out: 3673 + ext4_ext_show_leaf(inode, path); 3674 + return err; 3675 + } 3676 + 3658 3677 3659 3678 static int ext4_convert_unwritten_extents_endio(handle_t *handle, 
3660 3679 struct inode *inode, ··· 3748 3649 inode->i_ino, (unsigned long long)ee_block, ee_len, 3749 3650 (unsigned long long)map->m_lblk, map->m_len); 3750 3651 #endif 3751 - err = ext4_split_unwritten_extents(handle, inode, map, path, 3752 - EXT4_GET_BLOCKS_CONVERT); 3652 + err = ext4_split_convert_extents(handle, inode, map, path, 3653 + EXT4_GET_BLOCKS_CONVERT); 3753 3654 if (err < 0) 3754 3655 goto out; 3755 3656 ext4_ext_drop_refs(path); ··· 3950 3851 } 3951 3852 3952 3853 static int 3854 + ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, 3855 + struct ext4_map_blocks *map, 3856 + struct ext4_ext_path *path, int flags, 3857 + unsigned int allocated, ext4_fsblk_t newblock) 3858 + { 3859 + int ret = 0; 3860 + int err = 0; 3861 + 3862 + /* 3863 + * Make sure that the extent is no bigger than we support with 3864 + * uninitialized extent 3865 + */ 3866 + if (map->m_len > EXT_UNINIT_MAX_LEN) 3867 + map->m_len = EXT_UNINIT_MAX_LEN / 2; 3868 + 3869 + ret = ext4_convert_initialized_extents(handle, inode, map, 3870 + path); 3871 + if (ret >= 0) { 3872 + ext4_update_inode_fsync_trans(handle, inode, 1); 3873 + err = check_eofblocks_fl(handle, inode, map->m_lblk, 3874 + path, map->m_len); 3875 + } else 3876 + err = ret; 3877 + map->m_flags |= EXT4_MAP_UNWRITTEN; 3878 + if (allocated > map->m_len) 3879 + allocated = map->m_len; 3880 + map->m_len = allocated; 3881 + 3882 + return err ? 
err : allocated; 3883 + } 3884 + 3885 + static int 3953 3886 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3954 3887 struct ext4_map_blocks *map, 3955 3888 struct ext4_ext_path *path, int flags, ··· 4008 3877 4009 3878 /* get_block() before submit the IO, split the extent */ 4010 3879 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4011 - ret = ext4_split_unwritten_extents(handle, inode, map, 4012 - path, flags); 3880 + ret = ext4_split_convert_extents(handle, inode, map, 3881 + path, flags | EXT4_GET_BLOCKS_CONVERT); 4013 3882 if (ret <= 0) 4014 3883 goto out; 4015 3884 /* ··· 4124 3993 map->m_pblk = newblock; 4125 3994 map->m_len = allocated; 4126 3995 out2: 4127 - if (path) { 4128 - ext4_ext_drop_refs(path); 4129 - kfree(path); 4130 - } 4131 3996 return err ? err : allocated; 4132 3997 } 4133 3998 ··· 4255 4128 struct ext4_extent newex, *ex, *ex2; 4256 4129 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4257 4130 ext4_fsblk_t newblock = 0; 4258 - int free_on_err = 0, err = 0, depth; 4131 + int free_on_err = 0, err = 0, depth, ret; 4259 4132 unsigned int allocated = 0, offset = 0; 4260 4133 unsigned int allocated_clusters = 0; 4261 4134 struct ext4_allocation_request ar; ··· 4297 4170 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4298 4171 unsigned short ee_len; 4299 4172 4173 + 4300 4174 /* 4301 4175 * Uninitialized extents are treated as holes, except that 4302 4176 * we split out initialized portions during a write. ··· 4314 4186 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 4315 4187 ee_block, ee_len, newblock); 4316 4188 4317 - if (!ext4_ext_is_uninitialized(ex)) 4189 + /* 4190 + * If the extent is initialized check whether the 4191 + * caller wants to convert it to unwritten. 
4192 + */ 4193 + if ((!ext4_ext_is_uninitialized(ex)) && 4194 + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { 4195 + allocated = ext4_ext_convert_initialized_extent( 4196 + handle, inode, map, path, flags, 4197 + allocated, newblock); 4198 + goto out2; 4199 + } else if (!ext4_ext_is_uninitialized(ex)) 4318 4200 goto out; 4319 4201 4320 - allocated = ext4_ext_handle_uninitialized_extents( 4202 + ret = ext4_ext_handle_uninitialized_extents( 4321 4203 handle, inode, map, path, flags, 4322 4204 allocated, newblock); 4323 - goto out3; 4205 + if (ret < 0) 4206 + err = ret; 4207 + else 4208 + allocated = ret; 4209 + goto out2; 4324 4210 } 4325 4211 } 4326 4212 ··· 4615 4473 kfree(path); 4616 4474 } 4617 4475 4618 - out3: 4619 4476 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4620 4477 err ? err : allocated); 4621 4478 ext4_es_lru_add(inode); ··· 4655 4514 ext4_std_error(inode->i_sb, err); 4656 4515 } 4657 4516 4658 - static void ext4_falloc_update_inode(struct inode *inode, 4659 - int mode, loff_t new_size, int update_ctime) 4517 + static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, 4518 + ext4_lblk_t len, int flags, int mode) 4660 4519 { 4661 - struct timespec now; 4520 + struct inode *inode = file_inode(file); 4521 + handle_t *handle; 4522 + int ret = 0; 4523 + int ret2 = 0; 4524 + int retries = 0; 4525 + struct ext4_map_blocks map; 4526 + unsigned int credits; 4662 4527 4663 - if (update_ctime) { 4664 - now = current_fs_time(inode->i_sb); 4665 - if (!timespec_equal(&inode->i_ctime, &now)) 4666 - inode->i_ctime = now; 4667 - } 4528 + map.m_lblk = offset; 4668 4529 /* 4669 - * Update only when preallocation was requested beyond 4670 - * the file size. 4530 + * Don't normalize the request if it can fit in one extent so 4531 + * that it doesn't get unnecessarily split into multiple 4532 + * extents. 
4671 4533 */ 4672 - if (!(mode & FALLOC_FL_KEEP_SIZE)) { 4534 + if (len <= EXT_UNINIT_MAX_LEN) 4535 + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 4536 + 4537 + /* 4538 + * credits to insert 1 extent into extent tree 4539 + */ 4540 + credits = ext4_chunk_trans_blocks(inode, len); 4541 + 4542 + retry: 4543 + while (ret >= 0 && ret < len) { 4544 + map.m_lblk = map.m_lblk + ret; 4545 + map.m_len = len = len - ret; 4546 + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4547 + credits); 4548 + if (IS_ERR(handle)) { 4549 + ret = PTR_ERR(handle); 4550 + break; 4551 + } 4552 + ret = ext4_map_blocks(handle, inode, &map, flags); 4553 + if (ret <= 0) { 4554 + ext4_debug("inode #%lu: block %u: len %u: " 4555 + "ext4_ext_map_blocks returned %d", 4556 + inode->i_ino, map.m_lblk, 4557 + map.m_len, ret); 4558 + ext4_mark_inode_dirty(handle, inode); 4559 + ret2 = ext4_journal_stop(handle); 4560 + break; 4561 + } 4562 + ret2 = ext4_journal_stop(handle); 4563 + if (ret2) 4564 + break; 4565 + } 4566 + if (ret == -ENOSPC && 4567 + ext4_should_retry_alloc(inode->i_sb, &retries)) { 4568 + ret = 0; 4569 + goto retry; 4570 + } 4571 + 4572 + return ret > 0 ? ret2 : ret; 4573 + } 4574 + 4575 + static long ext4_zero_range(struct file *file, loff_t offset, 4576 + loff_t len, int mode) 4577 + { 4578 + struct inode *inode = file_inode(file); 4579 + handle_t *handle = NULL; 4580 + unsigned int max_blocks; 4581 + loff_t new_size = 0; 4582 + int ret = 0; 4583 + int flags; 4584 + int partial; 4585 + loff_t start, end; 4586 + ext4_lblk_t lblk; 4587 + struct address_space *mapping = inode->i_mapping; 4588 + unsigned int blkbits = inode->i_blkbits; 4589 + 4590 + trace_ext4_zero_range(inode, offset, len, mode); 4591 + 4592 + /* 4593 + * Write out all dirty pages to avoid race conditions 4594 + * Then release them. 
4595 + */ 4596 + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4597 + ret = filemap_write_and_wait_range(mapping, offset, 4598 + offset + len - 1); 4599 + if (ret) 4600 + return ret; 4601 + } 4602 + 4603 + /* 4604 + * Round up offset. This is not fallocate, we neet to zero out 4605 + * blocks, so convert interior block aligned part of the range to 4606 + * unwritten and possibly manually zero out unaligned parts of the 4607 + * range. 4608 + */ 4609 + start = round_up(offset, 1 << blkbits); 4610 + end = round_down((offset + len), 1 << blkbits); 4611 + 4612 + if (start < offset || end > offset + len) 4613 + return -EINVAL; 4614 + partial = (offset + len) & ((1 << blkbits) - 1); 4615 + 4616 + lblk = start >> blkbits; 4617 + max_blocks = (end >> blkbits); 4618 + if (max_blocks < lblk) 4619 + max_blocks = 0; 4620 + else 4621 + max_blocks -= lblk; 4622 + 4623 + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | 4624 + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; 4625 + if (mode & FALLOC_FL_KEEP_SIZE) 4626 + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4627 + 4628 + mutex_lock(&inode->i_mutex); 4629 + 4630 + /* 4631 + * Indirect files do not support unwritten extnets 4632 + */ 4633 + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4634 + ret = -EOPNOTSUPP; 4635 + goto out_mutex; 4636 + } 4637 + 4638 + if (!(mode & FALLOC_FL_KEEP_SIZE) && 4639 + offset + len > i_size_read(inode)) { 4640 + new_size = offset + len; 4641 + ret = inode_newsize_ok(inode, new_size); 4642 + if (ret) 4643 + goto out_mutex; 4644 + /* 4645 + * If we have a partial block after EOF we have to allocate 4646 + * the entire block. 
4647 + */ 4648 + if (partial) 4649 + max_blocks += 1; 4650 + } 4651 + 4652 + if (max_blocks > 0) { 4653 + 4654 + /* Now release the pages and zero block aligned part of pages*/ 4655 + truncate_pagecache_range(inode, start, end - 1); 4656 + 4657 + /* Wait all existing dio workers, newcomers will block on i_mutex */ 4658 + ext4_inode_block_unlocked_dio(inode); 4659 + inode_dio_wait(inode); 4660 + 4661 + /* 4662 + * Remove entire range from the extent status tree. 4663 + */ 4664 + ret = ext4_es_remove_extent(inode, lblk, max_blocks); 4665 + if (ret) 4666 + goto out_dio; 4667 + 4668 + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, 4669 + mode); 4670 + if (ret) 4671 + goto out_dio; 4672 + } 4673 + 4674 + handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); 4675 + if (IS_ERR(handle)) { 4676 + ret = PTR_ERR(handle); 4677 + ext4_std_error(inode->i_sb, ret); 4678 + goto out_dio; 4679 + } 4680 + 4681 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4682 + 4683 + if (new_size) { 4673 4684 if (new_size > i_size_read(inode)) 4674 4685 i_size_write(inode, new_size); 4675 4686 if (new_size > EXT4_I(inode)->i_disksize) 4676 4687 ext4_update_i_disksize(inode, new_size); 4677 4688 } else { 4678 4689 /* 4679 - * Mark that we allocate beyond EOF so the subsequent truncate 4680 - * can proceed even if the new size is the same as i_size. 4681 - */ 4682 - if (new_size > i_size_read(inode)) 4690 + * Mark that we allocate beyond EOF so the subsequent truncate 4691 + * can proceed even if the new size is the same as i_size. 
4692 + */ 4693 + if ((offset + len) > i_size_read(inode)) 4683 4694 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4684 4695 } 4685 4696 4697 + ext4_mark_inode_dirty(handle, inode); 4698 + 4699 + /* Zero out partial block at the edges of the range */ 4700 + ret = ext4_zero_partial_blocks(handle, inode, offset, len); 4701 + 4702 + if (file->f_flags & O_SYNC) 4703 + ext4_handle_sync(handle); 4704 + 4705 + ext4_journal_stop(handle); 4706 + out_dio: 4707 + ext4_inode_resume_unlocked_dio(inode); 4708 + out_mutex: 4709 + mutex_unlock(&inode->i_mutex); 4710 + return ret; 4686 4711 } 4687 4712 4688 4713 /* ··· 4862 4555 { 4863 4556 struct inode *inode = file_inode(file); 4864 4557 handle_t *handle; 4865 - loff_t new_size; 4558 + loff_t new_size = 0; 4866 4559 unsigned int max_blocks; 4867 4560 int ret = 0; 4868 - int ret2 = 0; 4869 - int retries = 0; 4870 4561 int flags; 4871 - struct ext4_map_blocks map; 4872 - unsigned int credits, blkbits = inode->i_blkbits; 4562 + ext4_lblk_t lblk; 4563 + struct timespec tv; 4564 + unsigned int blkbits = inode->i_blkbits; 4873 4565 4874 4566 /* Return error if mode is not supported */ 4875 - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 4567 + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 4568 + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) 4876 4569 return -EOPNOTSUPP; 4877 4570 4878 4571 if (mode & FALLOC_FL_PUNCH_HOLE) 4879 4572 return ext4_punch_hole(inode, offset, len); 4573 + 4574 + if (mode & FALLOC_FL_COLLAPSE_RANGE) 4575 + return ext4_collapse_range(inode, offset, len); 4880 4576 4881 4577 ret = ext4_convert_inline_data(inode); 4882 4578 if (ret) ··· 4892 4582 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4893 4583 return -EOPNOTSUPP; 4894 4584 4585 + if (mode & FALLOC_FL_ZERO_RANGE) 4586 + return ext4_zero_range(file, offset, len, mode); 4587 + 4895 4588 trace_ext4_fallocate_enter(inode, offset, len, mode); 4896 - map.m_lblk = offset >> blkbits; 4589 + lblk = offset >> blkbits; 4897 
4590 /* 4898 4591 * We can't just convert len to max_blocks because 4899 4592 * If blocksize = 4096 offset = 3072 and len = 2048 4900 4593 */ 4901 4594 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 4902 - - map.m_lblk; 4903 - /* 4904 - * credits to insert 1 extent into extent tree 4905 - */ 4906 - credits = ext4_chunk_trans_blocks(inode, max_blocks); 4907 - mutex_lock(&inode->i_mutex); 4908 - ret = inode_newsize_ok(inode, (len + offset)); 4909 - if (ret) { 4910 - mutex_unlock(&inode->i_mutex); 4911 - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4912 - return ret; 4913 - } 4595 + - lblk; 4596 + 4914 4597 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; 4915 4598 if (mode & FALLOC_FL_KEEP_SIZE) 4916 4599 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4917 - /* 4918 - * Don't normalize the request if it can fit in one extent so 4919 - * that it doesn't get unnecessarily split into multiple 4920 - * extents. 4921 - */ 4922 - if (len <= EXT_UNINIT_MAX_LEN << blkbits) 4923 - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 4924 4600 4925 - retry: 4926 - while (ret >= 0 && ret < max_blocks) { 4927 - map.m_lblk = map.m_lblk + ret; 4928 - map.m_len = max_blocks = max_blocks - ret; 4929 - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4930 - credits); 4931 - if (IS_ERR(handle)) { 4932 - ret = PTR_ERR(handle); 4933 - break; 4934 - } 4935 - ret = ext4_map_blocks(handle, inode, &map, flags); 4936 - if (ret <= 0) { 4937 - #ifdef EXT4FS_DEBUG 4938 - ext4_warning(inode->i_sb, 4939 - "inode #%lu: block %u: len %u: " 4940 - "ext4_ext_map_blocks returned %d", 4941 - inode->i_ino, map.m_lblk, 4942 - map.m_len, ret); 4943 - #endif 4944 - ext4_mark_inode_dirty(handle, inode); 4945 - ret2 = ext4_journal_stop(handle); 4946 - break; 4947 - } 4948 - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 4949 - blkbits) >> blkbits)) 4950 - new_size = offset + len; 4951 - else 4952 - new_size = ((loff_t) map.m_lblk + ret) << blkbits; 4601 + mutex_lock(&inode->i_mutex); 4953 
4602 4954 - ext4_falloc_update_inode(inode, mode, new_size, 4955 - (map.m_flags & EXT4_MAP_NEW)); 4956 - ext4_mark_inode_dirty(handle, inode); 4957 - if ((file->f_flags & O_SYNC) && ret >= max_blocks) 4958 - ext4_handle_sync(handle); 4959 - ret2 = ext4_journal_stop(handle); 4960 - if (ret2) 4961 - break; 4603 + if (!(mode & FALLOC_FL_KEEP_SIZE) && 4604 + offset + len > i_size_read(inode)) { 4605 + new_size = offset + len; 4606 + ret = inode_newsize_ok(inode, new_size); 4607 + if (ret) 4608 + goto out; 4962 4609 } 4963 - if (ret == -ENOSPC && 4964 - ext4_should_retry_alloc(inode->i_sb, &retries)) { 4965 - ret = 0; 4966 - goto retry; 4610 + 4611 + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); 4612 + if (ret) 4613 + goto out; 4614 + 4615 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4616 + if (IS_ERR(handle)) 4617 + goto out; 4618 + 4619 + tv = inode->i_ctime = ext4_current_time(inode); 4620 + 4621 + if (new_size) { 4622 + if (new_size > i_size_read(inode)) { 4623 + i_size_write(inode, new_size); 4624 + inode->i_mtime = tv; 4625 + } 4626 + if (new_size > EXT4_I(inode)->i_disksize) 4627 + ext4_update_i_disksize(inode, new_size); 4628 + } else { 4629 + /* 4630 + * Mark that we allocate beyond EOF so the subsequent truncate 4631 + * can proceed even if the new size is the same as i_size. 4632 + */ 4633 + if ((offset + len) > i_size_read(inode)) 4634 + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4967 4635 } 4636 + ext4_mark_inode_dirty(handle, inode); 4637 + if (file->f_flags & O_SYNC) 4638 + ext4_handle_sync(handle); 4639 + 4640 + ext4_journal_stop(handle); 4641 + out: 4968 4642 mutex_unlock(&inode->i_mutex); 4969 - trace_ext4_fallocate_exit(inode, offset, max_blocks, 4970 - ret > 0 ? ret2 : ret); 4971 - return ret > 0 ? 
ret2 : ret; 4643 + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4644 + return ret; 4972 4645 } 4973 4646 4974 4647 /* ··· 5161 4868 } 5162 4869 ext4_es_lru_add(inode); 5163 4870 return error; 4871 + } 4872 + 4873 + /* 4874 + * ext4_access_path: 4875 + * Function to access the path buffer for marking it dirty. 4876 + * It also checks if there are sufficient credits left in the journal handle 4877 + * to update path. 4878 + */ 4879 + static int 4880 + ext4_access_path(handle_t *handle, struct inode *inode, 4881 + struct ext4_ext_path *path) 4882 + { 4883 + int credits, err; 4884 + 4885 + if (!ext4_handle_valid(handle)) 4886 + return 0; 4887 + 4888 + /* 4889 + * Check if need to extend journal credits 4890 + * 3 for leaf, sb, and inode plus 2 (bmap and group 4891 + * descriptor) for each block group; assume two block 4892 + * groups 4893 + */ 4894 + if (handle->h_buffer_credits < 7) { 4895 + credits = ext4_writepage_trans_blocks(inode); 4896 + err = ext4_ext_truncate_extend_restart(handle, inode, credits); 4897 + /* EAGAIN is success */ 4898 + if (err && err != -EAGAIN) 4899 + return err; 4900 + } 4901 + 4902 + err = ext4_ext_get_access(handle, inode, path); 4903 + return err; 4904 + } 4905 + 4906 + /* 4907 + * ext4_ext_shift_path_extents: 4908 + * Shift the extents of a path structure lying between path[depth].p_ext 4909 + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift 4910 + * from starting block for each extent. 
4911 + */ 4912 + static int 4913 + ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, 4914 + struct inode *inode, handle_t *handle, 4915 + ext4_lblk_t *start) 4916 + { 4917 + int depth, err = 0; 4918 + struct ext4_extent *ex_start, *ex_last; 4919 + bool update = 0; 4920 + depth = path->p_depth; 4921 + 4922 + while (depth >= 0) { 4923 + if (depth == path->p_depth) { 4924 + ex_start = path[depth].p_ext; 4925 + if (!ex_start) 4926 + return -EIO; 4927 + 4928 + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); 4929 + if (!ex_last) 4930 + return -EIO; 4931 + 4932 + err = ext4_access_path(handle, inode, path + depth); 4933 + if (err) 4934 + goto out; 4935 + 4936 + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) 4937 + update = 1; 4938 + 4939 + *start = ex_last->ee_block + 4940 + ext4_ext_get_actual_len(ex_last); 4941 + 4942 + while (ex_start <= ex_last) { 4943 + ex_start->ee_block -= shift; 4944 + if (ex_start > 4945 + EXT_FIRST_EXTENT(path[depth].p_hdr)) { 4946 + if (ext4_ext_try_to_merge_right(inode, 4947 + path, ex_start - 1)) 4948 + ex_last--; 4949 + } 4950 + ex_start++; 4951 + } 4952 + err = ext4_ext_dirty(handle, inode, path + depth); 4953 + if (err) 4954 + goto out; 4955 + 4956 + if (--depth < 0 || !update) 4957 + break; 4958 + } 4959 + 4960 + /* Update index too */ 4961 + err = ext4_access_path(handle, inode, path + depth); 4962 + if (err) 4963 + goto out; 4964 + 4965 + path[depth].p_idx->ei_block -= shift; 4966 + err = ext4_ext_dirty(handle, inode, path + depth); 4967 + if (err) 4968 + goto out; 4969 + 4970 + /* we are done if current index is not a starting index */ 4971 + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) 4972 + break; 4973 + 4974 + depth--; 4975 + } 4976 + 4977 + out: 4978 + return err; 4979 + } 4980 + 4981 + /* 4982 + * ext4_ext_shift_extents: 4983 + * All the extents which lies in the range from start to the last allocated 4984 + * block for the file are shifted downwards by shift blocks. 
4985 + * On success, 0 is returned, error otherwise. 4986 + */ 4987 + static int 4988 + ext4_ext_shift_extents(struct inode *inode, handle_t *handle, 4989 + ext4_lblk_t start, ext4_lblk_t shift) 4990 + { 4991 + struct ext4_ext_path *path; 4992 + int ret = 0, depth; 4993 + struct ext4_extent *extent; 4994 + ext4_lblk_t stop_block, current_block; 4995 + ext4_lblk_t ex_start, ex_end; 4996 + 4997 + /* Let path point to the last extent */ 4998 + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); 4999 + if (IS_ERR(path)) 5000 + return PTR_ERR(path); 5001 + 5002 + depth = path->p_depth; 5003 + extent = path[depth].p_ext; 5004 + if (!extent) { 5005 + ext4_ext_drop_refs(path); 5006 + kfree(path); 5007 + return ret; 5008 + } 5009 + 5010 + stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); 5011 + ext4_ext_drop_refs(path); 5012 + kfree(path); 5013 + 5014 + /* Nothing to shift, if hole is at the end of file */ 5015 + if (start >= stop_block) 5016 + return ret; 5017 + 5018 + /* 5019 + * Don't start shifting extents until we make sure the hole is big 5020 + * enough to accomodate the shift. 
5021 + */ 5022 + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); 5023 + depth = path->p_depth; 5024 + extent = path[depth].p_ext; 5025 + ex_start = extent->ee_block; 5026 + ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); 5027 + ext4_ext_drop_refs(path); 5028 + kfree(path); 5029 + 5030 + if ((start == ex_start && shift > ex_start) || 5031 + (shift > start - ex_end)) 5032 + return -EINVAL; 5033 + 5034 + /* Its safe to start updating extents */ 5035 + while (start < stop_block) { 5036 + path = ext4_ext_find_extent(inode, start, NULL, 0); 5037 + if (IS_ERR(path)) 5038 + return PTR_ERR(path); 5039 + depth = path->p_depth; 5040 + extent = path[depth].p_ext; 5041 + current_block = extent->ee_block; 5042 + if (start > current_block) { 5043 + /* Hole, move to the next extent */ 5044 + ret = mext_next_extent(inode, path, &extent); 5045 + if (ret != 0) { 5046 + ext4_ext_drop_refs(path); 5047 + kfree(path); 5048 + if (ret == 1) 5049 + ret = 0; 5050 + break; 5051 + } 5052 + } 5053 + ret = ext4_ext_shift_path_extents(path, shift, inode, 5054 + handle, &start); 5055 + ext4_ext_drop_refs(path); 5056 + kfree(path); 5057 + if (ret) 5058 + break; 5059 + } 5060 + 5061 + return ret; 5062 + } 5063 + 5064 + /* 5065 + * ext4_collapse_range: 5066 + * This implements the fallocate's collapse range functionality for ext4 5067 + * Returns: 0 and non-zero on error. 5068 + */ 5069 + int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) 5070 + { 5071 + struct super_block *sb = inode->i_sb; 5072 + ext4_lblk_t punch_start, punch_stop; 5073 + handle_t *handle; 5074 + unsigned int credits; 5075 + loff_t new_size; 5076 + int ret; 5077 + 5078 + BUG_ON(offset + len > i_size_read(inode)); 5079 + 5080 + /* Collapse range works only on fs block size aligned offsets. 
*/ 5081 + if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || 5082 + len & (EXT4_BLOCK_SIZE(sb) - 1)) 5083 + return -EINVAL; 5084 + 5085 + if (!S_ISREG(inode->i_mode)) 5086 + return -EOPNOTSUPP; 5087 + 5088 + trace_ext4_collapse_range(inode, offset, len); 5089 + 5090 + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5091 + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); 5092 + 5093 + /* Write out all dirty pages */ 5094 + ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); 5095 + if (ret) 5096 + return ret; 5097 + 5098 + /* Take mutex lock */ 5099 + mutex_lock(&inode->i_mutex); 5100 + 5101 + /* It's not possible punch hole on append only file */ 5102 + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { 5103 + ret = -EPERM; 5104 + goto out_mutex; 5105 + } 5106 + 5107 + if (IS_SWAPFILE(inode)) { 5108 + ret = -ETXTBSY; 5109 + goto out_mutex; 5110 + } 5111 + 5112 + /* Currently just for extent based files */ 5113 + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5114 + ret = -EOPNOTSUPP; 5115 + goto out_mutex; 5116 + } 5117 + 5118 + truncate_pagecache_range(inode, offset, -1); 5119 + 5120 + /* Wait for existing dio to complete */ 5121 + ext4_inode_block_unlocked_dio(inode); 5122 + inode_dio_wait(inode); 5123 + 5124 + credits = ext4_writepage_trans_blocks(inode); 5125 + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5126 + if (IS_ERR(handle)) { 5127 + ret = PTR_ERR(handle); 5128 + goto out_dio; 5129 + } 5130 + 5131 + down_write(&EXT4_I(inode)->i_data_sem); 5132 + ext4_discard_preallocations(inode); 5133 + 5134 + ret = ext4_es_remove_extent(inode, punch_start, 5135 + EXT_MAX_BLOCKS - punch_start - 1); 5136 + if (ret) { 5137 + up_write(&EXT4_I(inode)->i_data_sem); 5138 + goto out_stop; 5139 + } 5140 + 5141 + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); 5142 + if (ret) { 5143 + up_write(&EXT4_I(inode)->i_data_sem); 5144 + goto out_stop; 5145 + } 5146 + 5147 + ret = ext4_ext_shift_extents(inode, handle, punch_stop, 5148 
+ punch_stop - punch_start); 5149 + if (ret) { 5150 + up_write(&EXT4_I(inode)->i_data_sem); 5151 + goto out_stop; 5152 + } 5153 + 5154 + new_size = i_size_read(inode) - len; 5155 + truncate_setsize(inode, new_size); 5156 + EXT4_I(inode)->i_disksize = new_size; 5157 + 5158 + ext4_discard_preallocations(inode); 5159 + up_write(&EXT4_I(inode)->i_data_sem); 5160 + if (IS_SYNC(inode)) 5161 + ext4_handle_sync(handle); 5162 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 5163 + ext4_mark_inode_dirty(handle, inode); 5164 + 5165 + out_stop: 5166 + ext4_journal_stop(handle); 5167 + out_dio: 5168 + ext4_inode_resume_unlocked_dio(inode); 5169 + out_mutex: 5170 + mutex_unlock(&inode->i_mutex); 5171 + return ret; 5164 5172 }
+13 -15
fs/ext4/extents_status.c
··· 184 184 while (node) { 185 185 struct extent_status *es; 186 186 es = rb_entry(node, struct extent_status, rb_node); 187 - printk(KERN_DEBUG " [%u/%u) %llu %llx", 187 + printk(KERN_DEBUG " [%u/%u) %llu %x", 188 188 es->es_lblk, es->es_len, 189 189 ext4_es_pblock(es), ext4_es_status(es)); 190 190 node = rb_next(node); ··· 445 445 pr_warn("ES insert assertion failed for " 446 446 "inode: %lu we can find an extent " 447 447 "at block [%d/%d/%llu/%c], but we " 448 - "want to add an delayed/hole extent " 449 - "[%d/%d/%llu/%llx]\n", 448 + "want to add a delayed/hole extent " 449 + "[%d/%d/%llu/%x]\n", 450 450 inode->i_ino, ee_block, ee_len, 451 451 ee_start, ee_status ? 'u' : 'w', 452 452 es->es_lblk, es->es_len, ··· 486 486 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { 487 487 pr_warn("ES insert assertion failed for inode: %lu " 488 488 "can't find an extent at block %d but we want " 489 - "to add an written/unwritten extent " 490 - "[%d/%d/%llu/%llx]\n", inode->i_ino, 489 + "to add a written/unwritten extent " 490 + "[%d/%d/%llu/%x]\n", inode->i_ino, 491 491 es->es_lblk, es->es_lblk, es->es_len, 492 492 ext4_es_pblock(es), ext4_es_status(es)); 493 493 } ··· 524 524 */ 525 525 pr_warn("ES insert assertion failed for inode: %lu " 526 526 "We can find blocks but we want to add a " 527 - "delayed/hole extent [%d/%d/%llu/%llx]\n", 527 + "delayed/hole extent [%d/%d/%llu/%x]\n", 528 528 inode->i_ino, es->es_lblk, es->es_len, 529 529 ext4_es_pblock(es), ext4_es_status(es)); 530 530 return; ··· 554 554 if (ext4_es_is_written(es)) { 555 555 pr_warn("ES insert assertion failed for inode: %lu " 556 556 "We can't find the block but we want to add " 557 - "an written extent [%d/%d/%llu/%llx]\n", 557 + "a written extent [%d/%d/%llu/%x]\n", 558 558 inode->i_ino, es->es_lblk, es->es_len, 559 559 ext4_es_pblock(es), ext4_es_status(es)); 560 560 return; ··· 658 658 659 659 newes.es_lblk = lblk; 660 660 newes.es_len = len; 661 - ext4_es_store_pblock(&newes, pblk); 662 - 
ext4_es_store_status(&newes, status); 661 + ext4_es_store_pblock_status(&newes, pblk, status); 663 662 trace_ext4_es_insert_extent(inode, &newes); 664 663 665 664 ext4_es_insert_extent_check(inode, &newes); ··· 698 699 699 700 newes.es_lblk = lblk; 700 701 newes.es_len = len; 701 - ext4_es_store_pblock(&newes, pblk); 702 - ext4_es_store_status(&newes, status); 702 + ext4_es_store_pblock_status(&newes, pblk, status); 703 703 trace_ext4_es_cache_extent(inode, &newes); 704 704 705 705 if (!len) ··· 810 812 811 813 newes.es_lblk = end + 1; 812 814 newes.es_len = len2; 815 + block = 0x7FDEADBEEF; 813 816 if (ext4_es_is_written(&orig_es) || 814 - ext4_es_is_unwritten(&orig_es)) { 817 + ext4_es_is_unwritten(&orig_es)) 815 818 block = ext4_es_pblock(&orig_es) + 816 819 orig_es.es_len - len2; 817 - ext4_es_store_pblock(&newes, block); 818 - } 819 - ext4_es_store_status(&newes, ext4_es_status(&orig_es)); 820 + ext4_es_store_pblock_status(&newes, block, 821 + ext4_es_status(&orig_es)); 820 822 err = __es_insert_extent(inode, &newes); 821 823 if (err) { 822 824 es->es_lblk = orig_es.es_lblk;
+9
fs/ext4/extents_status.h
··· 129 129 (es->es_pblk & ~ES_MASK)); 130 130 } 131 131 132 + static inline void ext4_es_store_pblock_status(struct extent_status *es, 133 + ext4_fsblk_t pb, 134 + unsigned int status) 135 + { 136 + es->es_pblk = (((ext4_fsblk_t) 137 + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | 138 + (pb & ~ES_MASK)); 139 + } 140 + 132 141 extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); 133 142 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 134 143 extern void ext4_es_lru_add(struct inode *inode);
+75 -45
fs/ext4/inode.c
··· 504 504 { 505 505 struct extent_status es; 506 506 int retval; 507 + int ret = 0; 507 508 #ifdef ES_AGGRESSIVE_TEST 508 509 struct ext4_map_blocks orig_map; 509 510 ··· 515 514 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," 516 515 "logical block %lu\n", inode->i_ino, flags, map->m_len, 517 516 (unsigned long) map->m_lblk); 517 + 518 + /* 519 + * ext4_map_blocks returns an int, and m_len is an unsigned int 520 + */ 521 + if (unlikely(map->m_len > INT_MAX)) 522 + map->m_len = INT_MAX; 518 523 519 524 /* Lookup extent status tree firstly */ 520 525 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ··· 560 553 EXT4_GET_BLOCKS_KEEP_SIZE); 561 554 } 562 555 if (retval > 0) { 563 - int ret; 564 556 unsigned int status; 565 557 566 558 if (unlikely(retval != map->m_len)) { ··· 586 580 587 581 found: 588 582 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 589 - int ret = check_block_validity(inode, map); 583 + ret = check_block_validity(inode, map); 590 584 if (ret != 0) 591 585 return ret; 592 586 } ··· 603 597 * with buffer head unmapped. 
604 598 */ 605 599 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 606 - return retval; 600 + /* 601 + * If we need to convert extent to unwritten 602 + * we continue and do the actual work in 603 + * ext4_ext_map_blocks() 604 + */ 605 + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) 606 + return retval; 607 607 608 608 /* 609 609 * Here we clear m_flags because after allocating an new extent, ··· 665 653 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 666 654 667 655 if (retval > 0) { 668 - int ret; 669 656 unsigned int status; 670 657 671 658 if (unlikely(retval != map->m_len)) { ··· 699 688 has_zeroout: 700 689 up_write((&EXT4_I(inode)->i_data_sem)); 701 690 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 702 - int ret = check_block_validity(inode, map); 691 + ret = check_block_validity(inode, map); 703 692 if (ret != 0) 704 693 return ret; 705 694 } ··· 3324 3313 } 3325 3314 3326 3315 /* 3327 - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3328 - * up to the end of the block which corresponds to `from'. 3329 - * This required during truncate. We need to physically zero the tail end 3330 - * of that block so it doesn't yield old data if the file is later grown. 3331 - */ 3332 - int ext4_block_truncate_page(handle_t *handle, 3333 - struct address_space *mapping, loff_t from) 3334 - { 3335 - unsigned offset = from & (PAGE_CACHE_SIZE-1); 3336 - unsigned length; 3337 - unsigned blocksize; 3338 - struct inode *inode = mapping->host; 3339 - 3340 - blocksize = inode->i_sb->s_blocksize; 3341 - length = blocksize - (offset & (blocksize - 1)); 3342 - 3343 - return ext4_block_zero_page_range(handle, mapping, from, length); 3344 - } 3345 - 3346 - /* 3347 3316 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3348 3317 * starting from file offset 'from'. The range to be zero'd must 3349 3318 * be contained with in one block. 
If the specified range exceeds 3350 3319 * the end of the block it will be shortened to end of the block 3351 3320 * that cooresponds to 'from' 3352 3321 */ 3353 - int ext4_block_zero_page_range(handle_t *handle, 3322 + static int ext4_block_zero_page_range(handle_t *handle, 3354 3323 struct address_space *mapping, loff_t from, loff_t length) 3355 3324 { 3356 3325 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; ··· 3420 3429 return err; 3421 3430 } 3422 3431 3432 + /* 3433 + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3434 + * up to the end of the block which corresponds to `from'. 3435 + * This required during truncate. We need to physically zero the tail end 3436 + * of that block so it doesn't yield old data if the file is later grown. 3437 + */ 3438 + int ext4_block_truncate_page(handle_t *handle, 3439 + struct address_space *mapping, loff_t from) 3440 + { 3441 + unsigned offset = from & (PAGE_CACHE_SIZE-1); 3442 + unsigned length; 3443 + unsigned blocksize; 3444 + struct inode *inode = mapping->host; 3445 + 3446 + blocksize = inode->i_sb->s_blocksize; 3447 + length = blocksize - (offset & (blocksize - 1)); 3448 + 3449 + return ext4_block_zero_page_range(handle, mapping, from, length); 3450 + } 3451 + 3423 3452 int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 3424 3453 loff_t lstart, loff_t length) 3425 3454 { ··· 3513 3502 if (!S_ISREG(inode->i_mode)) 3514 3503 return -EOPNOTSUPP; 3515 3504 3516 - trace_ext4_punch_hole(inode, offset, length); 3505 + trace_ext4_punch_hole(inode, offset, length, 0); 3517 3506 3518 3507 /* 3519 3508 * Write out all dirty pages to avoid race conditions ··· 3620 3609 up_write(&EXT4_I(inode)->i_data_sem); 3621 3610 if (IS_SYNC(inode)) 3622 3611 ext4_handle_sync(handle); 3612 + 3613 + /* Now release the pages again to reduce race window */ 3614 + if (last_block_offset > first_block_offset) 3615 + truncate_pagecache_range(inode, first_block_offset, 3616 + last_block_offset); 3617 + 3623 
3618 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3624 3619 ext4_mark_inode_dirty(handle, inode); 3625 3620 out_stop: ··· 3699 3682 3700 3683 /* 3701 3684 * There is a possibility that we're either freeing the inode 3702 - * or it completely new indode. In those cases we might not 3685 + * or it's a completely new inode. In those cases we might not 3703 3686 * have i_mutex locked because it's not necessary. 3704 3687 */ 3705 3688 if (!(inode->i_state & (I_NEW|I_FREEING))) ··· 3951 3934 new_fl |= S_NOATIME; 3952 3935 if (flags & EXT4_DIRSYNC_FL) 3953 3936 new_fl |= S_DIRSYNC; 3954 - set_mask_bits(&inode->i_flags, 3955 - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); 3937 + inode_set_flags(inode, new_fl, 3938 + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 3956 3939 } 3957 3940 3958 3941 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ ··· 4171 4154 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4172 4155 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4173 4156 4174 - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4175 - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4176 - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4177 - inode->i_version |= 4178 - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4157 + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 4158 + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4159 + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4160 + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4161 + inode->i_version |= 4162 + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4163 + } 4179 4164 } 4180 4165 4181 4166 ret = 0; ··· 4347 4328 goto out_brelse; 4348 4329 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4349 4330 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 4350 - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4351 - cpu_to_le32(EXT4_OS_HURD)) 4331 + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) 
4352 4332 raw_inode->i_file_acl_high = 4353 4333 cpu_to_le16(ei->i_file_acl >> 32); 4354 4334 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ··· 4392 4374 raw_inode->i_block[block] = ei->i_data[block]; 4393 4375 } 4394 4376 4395 - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4396 - if (ei->i_extra_isize) { 4397 - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4398 - raw_inode->i_version_hi = 4399 - cpu_to_le32(inode->i_version >> 32); 4400 - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4377 + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 4378 + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4379 + if (ei->i_extra_isize) { 4380 + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4381 + raw_inode->i_version_hi = 4382 + cpu_to_le32(inode->i_version >> 32); 4383 + raw_inode->i_extra_isize = 4384 + cpu_to_le16(ei->i_extra_isize); 4385 + } 4401 4386 } 4402 4387 4403 4388 ext4_inode_csum_set(inode, raw_inode, ei); ··· 4467 4446 return -EIO; 4468 4447 } 4469 4448 4470 - if (wbc->sync_mode != WB_SYNC_ALL) 4449 + /* 4450 + * No need to force transaction in WB_SYNC_NONE mode. Also 4451 + * ext4_sync_fs() will force the commit after everything is 4452 + * written. 4453 + */ 4454 + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) 4471 4455 return 0; 4472 4456 4473 4457 err = ext4_force_commit(inode->i_sb); ··· 4482 4456 err = __ext4_get_inode_loc(inode, &iloc, 0); 4483 4457 if (err) 4484 4458 return err; 4485 - if (wbc->sync_mode == WB_SYNC_ALL) 4459 + /* 4460 + * sync(2) will flush the whole buffer cache. No need to do 4461 + * it here separately for each inode. 4462 + */ 4463 + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) 4486 4464 sync_dirty_buffer(iloc.bh); 4487 4465 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 4488 4466 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
+6 -18
fs/ext4/ioctl.c
··· 104 104 struct ext4_inode_info *ei_bl; 105 105 struct ext4_sb_info *sbi = EXT4_SB(sb); 106 106 107 - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { 108 - err = -EINVAL; 109 - goto swap_boot_out; 110 - } 107 + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) 108 + return -EINVAL; 111 109 112 - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { 113 - err = -EPERM; 114 - goto swap_boot_out; 115 - } 110 + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) 111 + return -EPERM; 116 112 117 113 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); 118 - if (IS_ERR(inode_bl)) { 119 - err = PTR_ERR(inode_bl); 120 - goto swap_boot_out; 121 - } 114 + if (IS_ERR(inode_bl)) 115 + return PTR_ERR(inode_bl); 122 116 ei_bl = EXT4_I(inode_bl); 123 117 124 118 filemap_flush(inode->i_mapping); ··· 187 193 ext4_mark_inode_dirty(handle, inode); 188 194 } 189 195 } 190 - 191 196 ext4_journal_stop(handle); 192 - 193 197 ext4_double_up_write_data_sem(inode, inode_bl); 194 198 195 199 journal_err_out: 196 200 ext4_inode_resume_unlocked_dio(inode); 197 201 ext4_inode_resume_unlocked_dio(inode_bl); 198 - 199 202 unlock_two_nondirectories(inode, inode_bl); 200 - 201 203 iput(inode_bl); 202 - 203 - swap_boot_out: 204 204 return err; 205 205 } 206 206
+4 -3
fs/ext4/mballoc.c
··· 1808 1808 ext4_lock_group(ac->ac_sb, group); 1809 1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 1810 1810 ac->ac_g_ex.fe_len, &ex); 1811 + ex.fe_logical = 0xDEADFA11; /* debug value */ 1811 1812 1812 1813 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1813 1814 ext4_fsblk_t start; ··· 1937 1936 */ 1938 1937 break; 1939 1938 } 1940 - 1939 + ex.fe_logical = 0xDEADC0DE; /* debug value */ 1941 1940 ext4_mb_measure_extent(ac, &ex, e4b); 1942 1941 1943 1942 i += ex.fe_len; ··· 1978 1977 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 1979 1978 if (max >= sbi->s_stripe) { 1980 1979 ac->ac_found++; 1980 + ex.fe_logical = 0xDEADF00D; /* debug value */ 1981 1981 ac->ac_b_ex = ex; 1982 1982 ext4_mb_use_best_found(ac, e4b); 1983 1983 break; ··· 4008 4006 (unsigned long)ac->ac_b_ex.fe_len, 4009 4007 (unsigned long)ac->ac_b_ex.fe_logical, 4010 4008 (int)ac->ac_criteria); 4011 - ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", 4012 - ac->ac_ex_scanned, ac->ac_found); 4009 + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); 4013 4010 ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); 4014 4011 ngroups = ext4_get_groups_count(sb); 4015 4012 for (i = 0; i < ngroups; i++) {
+1 -3
fs/ext4/mballoc.h
··· 48 48 } \ 49 49 } while (0) 50 50 #else 51 - #define mb_debug(n, fmt, a...) 51 + #define mb_debug(n, fmt, a...) no_printk(fmt, ## a) 52 52 #endif 53 53 54 54 #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ ··· 175 175 /* copy of the best found extent taken before preallocation efforts */ 176 176 struct ext4_free_extent ac_f_ex; 177 177 178 - /* number of iterations done. we have to track to limit searching */ 179 - unsigned long ac_ex_scanned; 180 178 __u16 ac_groups_scanned; 181 179 __u16 ac_found; 182 180 __u16 ac_tail;
+2 -3
fs/ext4/move_extent.c
··· 76 76 * ext4_ext_path structure refers to the last extent, or a negative error 77 77 * value on failure. 78 78 */ 79 - static int 79 + int 80 80 mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 81 81 struct ext4_extent **extent) 82 82 { ··· 861 861 } 862 862 if (!buffer_mapped(bh)) { 863 863 zero_user(page, block_start, blocksize); 864 - if (!err) 865 - set_buffer_uptodate(bh); 864 + set_buffer_uptodate(bh); 866 865 continue; 867 866 } 868 867 }
+31 -9
fs/ext4/super.c
··· 59 59 static struct ext4_lazy_init *ext4_li_info; 60 60 static struct mutex ext4_li_mtx; 61 61 static struct ext4_features *ext4_feat; 62 + static int ext4_mballoc_ready; 62 63 63 64 static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 64 65 unsigned long journal_devnum); ··· 846 845 invalidate_bdev(sbi->journal_bdev); 847 846 ext4_blkdev_remove(sbi); 848 847 } 848 + if (sbi->s_mb_cache) { 849 + ext4_xattr_destroy_cache(sbi->s_mb_cache); 850 + sbi->s_mb_cache = NULL; 851 + } 849 852 if (sbi->s_mmp_tsk) 850 853 kthread_stop(sbi->s_mmp_tsk); 851 854 sb->s_fs_info = NULL; ··· 945 940 inode_init_once(&ei->vfs_inode); 946 941 } 947 942 948 - static int init_inodecache(void) 943 + static int __init init_inodecache(void) 949 944 { 950 945 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", 951 946 sizeof(struct ext4_inode_info), ··· 3580 3575 "feature flags set on rev 0 fs, " 3581 3576 "running e2fsck is recommended"); 3582 3577 3578 + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { 3579 + set_opt2(sb, HURD_COMPAT); 3580 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, 3581 + EXT4_FEATURE_INCOMPAT_64BIT)) { 3582 + ext4_msg(sb, KERN_ERR, 3583 + "The Hurd can't support 64-bit file systems"); 3584 + goto failed_mount; 3585 + } 3586 + } 3587 + 3583 3588 if (IS_EXT2_SB(sb)) { 3584 3589 if (ext2_feature_set_ok(sb)) 3585 3590 ext4_msg(sb, KERN_INFO, "mounting ext2 file system " ··· 4025 4010 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); 4026 4011 4027 4012 no_journal: 4013 + if (ext4_mballoc_ready) { 4014 + sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); 4015 + if (!sbi->s_mb_cache) { 4016 + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); 4017 + goto failed_mount_wq; 4018 + } 4019 + } 4020 + 4028 4021 /* 4029 4022 * Get the # of file system overhead blocks from the 4030 4023 * superblock if present. 
··· 4858 4835 } 4859 4836 4860 4837 if (*flags & MS_RDONLY) { 4838 + err = sync_filesystem(sb); 4839 + if (err < 0) 4840 + goto restore_opts; 4861 4841 err = dquot_suspend(sb, -1); 4862 4842 if (err < 0) 4863 4843 goto restore_opts; ··· 5542 5516 5543 5517 err = ext4_init_mballoc(); 5544 5518 if (err) 5545 - goto out3; 5546 - 5547 - err = ext4_init_xattr(); 5548 - if (err) 5549 5519 goto out2; 5520 + else 5521 + ext4_mballoc_ready = 1; 5550 5522 err = init_inodecache(); 5551 5523 if (err) 5552 5524 goto out1; ··· 5560 5536 unregister_as_ext3(); 5561 5537 destroy_inodecache(); 5562 5538 out1: 5563 - ext4_exit_xattr(); 5564 - out2: 5539 + ext4_mballoc_ready = 0; 5565 5540 ext4_exit_mballoc(); 5566 - out3: 5541 + out2: 5567 5542 ext4_exit_feat_adverts(); 5568 5543 out4: 5569 5544 if (ext4_proc_root) ··· 5585 5562 unregister_as_ext3(); 5586 5563 unregister_filesystem(&ext4_fs_type); 5587 5564 destroy_inodecache(); 5588 - ext4_exit_xattr(); 5589 5565 ext4_exit_mballoc(); 5590 5566 ext4_exit_feat_adverts(); 5591 5567 remove_proc_entry("fs/ext4", NULL);
+32 -27
fs/ext4/xattr.c
··· 81 81 # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) 82 82 #endif 83 83 84 - static void ext4_xattr_cache_insert(struct buffer_head *); 84 + static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); 85 85 static struct buffer_head *ext4_xattr_cache_find(struct inode *, 86 86 struct ext4_xattr_header *, 87 87 struct mb_cache_entry **); ··· 89 89 struct ext4_xattr_entry *); 90 90 static int ext4_xattr_list(struct dentry *dentry, char *buffer, 91 91 size_t buffer_size); 92 - 93 - static struct mb_cache *ext4_xattr_cache; 94 92 95 93 static const struct xattr_handler *ext4_xattr_handler_map[] = { 96 94 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, ··· 114 116 #endif 115 117 NULL 116 118 }; 119 + 120 + #define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ 121 + inode->i_sb->s_fs_info)->s_mb_cache) 117 122 118 123 static __le32 ext4_xattr_block_csum(struct inode *inode, 119 124 sector_t block_nr, ··· 266 265 struct ext4_xattr_entry *entry; 267 266 size_t size; 268 267 int error; 268 + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); 269 269 270 270 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", 271 271 name_index, name, buffer, (long)buffer_size); ··· 288 286 error = -EIO; 289 287 goto cleanup; 290 288 } 291 - ext4_xattr_cache_insert(bh); 289 + ext4_xattr_cache_insert(ext4_mb_cache, bh); 292 290 entry = BFIRST(bh); 293 291 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); 294 292 if (error == -EIO) ··· 411 409 struct inode *inode = dentry->d_inode; 412 410 struct buffer_head *bh = NULL; 413 411 int error; 412 + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); 414 413 415 414 ea_idebug(inode, "buffer=%p, buffer_size=%ld", 416 415 buffer, (long)buffer_size); ··· 433 430 error = -EIO; 434 431 goto cleanup; 435 432 } 436 - ext4_xattr_cache_insert(bh); 433 + ext4_xattr_cache_insert(ext4_mb_cache, bh); 437 434 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, 
buffer_size); 438 435 439 436 cleanup: ··· 529 526 { 530 527 struct mb_cache_entry *ce = NULL; 531 528 int error = 0; 529 + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); 532 530 533 - ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); 531 + ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); 534 532 error = ext4_journal_get_write_access(handle, bh); 535 533 if (error) 536 534 goto out; ··· 571 567 size_t *min_offs, void *base, int *total) 572 568 { 573 569 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { 574 - *total += EXT4_XATTR_LEN(last->e_name_len); 575 570 if (!last->e_value_block && last->e_value_size) { 576 571 size_t offs = le16_to_cpu(last->e_value_offs); 577 572 if (offs < *min_offs) 578 573 *min_offs = offs; 579 574 } 575 + if (total) 576 + *total += EXT4_XATTR_LEN(last->e_name_len); 580 577 } 581 578 return (*min_offs - ((void *)last - base) - sizeof(__u32)); 582 579 } ··· 750 745 struct ext4_xattr_search *s = &bs->s; 751 746 struct mb_cache_entry *ce = NULL; 752 747 int error = 0; 748 + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); 753 749 754 750 #define header(x) ((struct ext4_xattr_header *)(x)) 755 751 756 752 if (i->value && i->value_len > sb->s_blocksize) 757 753 return -ENOSPC; 758 754 if (s->base) { 759 - ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, 755 + ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, 760 756 bs->bh->b_blocknr); 761 757 error = ext4_journal_get_write_access(handle, bs->bh); 762 758 if (error) ··· 775 769 if (!IS_LAST_ENTRY(s->first)) 776 770 ext4_xattr_rehash(header(s->base), 777 771 s->here); 778 - ext4_xattr_cache_insert(bs->bh); 772 + ext4_xattr_cache_insert(ext4_mb_cache, 773 + bs->bh); 779 774 } 780 775 unlock_buffer(bs->bh); 781 776 if (error == -EIO) ··· 912 905 memcpy(new_bh->b_data, s->base, new_bh->b_size); 913 906 set_buffer_uptodate(new_bh); 914 907 unlock_buffer(new_bh); 915 - ext4_xattr_cache_insert(new_bh); 908 + 
ext4_xattr_cache_insert(ext4_mb_cache, new_bh); 916 909 error = ext4_handle_dirty_xattr_block(handle, 917 910 inode, new_bh); 918 911 if (error) ··· 1235 1228 struct ext4_xattr_block_find *bs = NULL; 1236 1229 char *buffer = NULL, *b_entry_name = NULL; 1237 1230 size_t min_offs, free; 1238 - int total_ino, total_blk; 1231 + int total_ino; 1239 1232 void *base, *start, *end; 1240 1233 int extra_isize = 0, error = 0, tried_min_extra_isize = 0; 1241 1234 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); ··· 1293 1286 first = BFIRST(bh); 1294 1287 end = bh->b_data + bh->b_size; 1295 1288 min_offs = end - base; 1296 - free = ext4_xattr_free_space(first, &min_offs, base, 1297 - &total_blk); 1289 + free = ext4_xattr_free_space(first, &min_offs, base, NULL); 1298 1290 if (free < new_extra_isize) { 1299 1291 if (!tried_min_extra_isize && s_min_extra_isize) { 1300 1292 tried_min_extra_isize++; ··· 1501 1495 * Returns 0, or a negative error number on failure. 1502 1496 */ 1503 1497 static void 1504 - ext4_xattr_cache_insert(struct buffer_head *bh) 1498 + ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) 1505 1499 { 1506 1500 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); 1507 1501 struct mb_cache_entry *ce; 1508 1502 int error; 1509 1503 1510 - ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); 1504 + ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); 1511 1505 if (!ce) { 1512 1506 ea_bdebug(bh, "out of memory"); 1513 1507 return; ··· 1579 1573 { 1580 1574 __u32 hash = le32_to_cpu(header->h_hash); 1581 1575 struct mb_cache_entry *ce; 1576 + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); 1582 1577 1583 1578 if (!header->h_hash) 1584 1579 return NULL; /* never share */ 1585 1580 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1586 1581 again: 1587 - ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, 1582 + ce = mb_cache_entry_find_first(ext4_mb_cache, 
inode->i_sb->s_bdev, 1588 1583 hash); 1589 1584 while (ce) { 1590 1585 struct buffer_head *bh; ··· 1683 1676 1684 1677 #undef BLOCK_HASH_SHIFT 1685 1678 1686 - int __init 1687 - ext4_init_xattr(void) 1679 + #define HASH_BUCKET_BITS 10 1680 + 1681 + struct mb_cache * 1682 + ext4_xattr_create_cache(char *name) 1688 1683 { 1689 - ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1690 - if (!ext4_xattr_cache) 1691 - return -ENOMEM; 1692 - return 0; 1684 + return mb_cache_create(name, HASH_BUCKET_BITS); 1693 1685 } 1694 1686 1695 - void 1696 - ext4_exit_xattr(void) 1687 + void ext4_xattr_destroy_cache(struct mb_cache *cache) 1697 1688 { 1698 - if (ext4_xattr_cache) 1699 - mb_cache_destroy(ext4_xattr_cache); 1700 - ext4_xattr_cache = NULL; 1689 + if (cache) 1690 + mb_cache_destroy(cache); 1701 1691 } 1692 +
+3 -3
fs/ext4/xattr.h
··· 110 110 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 111 111 struct ext4_inode *raw_inode, handle_t *handle); 112 112 113 - extern int __init ext4_init_xattr(void); 114 - extern void ext4_exit_xattr(void); 115 - 116 113 extern const struct xattr_handler *ext4_xattr_handlers[]; 117 114 118 115 extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, ··· 120 123 extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, 121 124 struct ext4_xattr_info *i, 122 125 struct ext4_xattr_ibody_find *is); 126 + 127 + extern struct mb_cache *ext4_xattr_create_cache(char *name); 128 + extern void ext4_xattr_destroy_cache(struct mb_cache *); 123 129 124 130 #ifdef CONFIG_EXT4_FS_SECURITY 125 131 extern int ext4_init_security(handle_t *handle, struct inode *inode,
+2
fs/f2fs/super.c
··· 568 568 struct f2fs_mount_info org_mount_opt; 569 569 int err, active_logs; 570 570 571 + sync_filesystem(sb); 572 + 571 573 /* 572 574 * Save the old mount options in case we 573 575 * need to restore them.
+2
fs/fat/inode.c
··· 635 635 struct msdos_sb_info *sbi = MSDOS_SB(sb); 636 636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); 637 637 638 + sync_filesystem(sb); 639 + 638 640 /* make sure we update state on remount. */ 639 641 new_rdonly = *flags & MS_RDONLY; 640 642 if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
+1
fs/freevxfs/vxfs_super.c
··· 124 124 125 125 static int vxfs_remount(struct super_block *sb, int *flags, char *data) 126 126 { 127 + sync_filesystem(sb); 127 128 *flags |= MS_RDONLY; 128 129 return 0; 129 130 }
+1
fs/fuse/inode.c
··· 135 135 136 136 static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) 137 137 { 138 + sync_filesystem(sb); 138 139 if (*flags & MS_MANDLOCK) 139 140 return -EINVAL; 140 141
+2
fs/gfs2/super.c
··· 1167 1167 struct gfs2_tune *gt = &sdp->sd_tune; 1168 1168 int error; 1169 1169 1170 + sync_filesystem(sb); 1171 + 1170 1172 spin_lock(&gt->gt_spin); 1171 1173 args.ar_commit = gt->gt_logd_secs; 1172 1174 args.ar_quota_quantum = gt->gt_quota_quantum;
+1
fs/hfs/super.c
··· 112 112 113 113 static int hfs_remount(struct super_block *sb, int *flags, char *data) 114 114 { 115 + sync_filesystem(sb); 115 116 *flags |= MS_NODIRATIME; 116 117 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 117 118 return 0;
+1
fs/hfsplus/super.c
··· 323 323 324 324 static int hfsplus_remount(struct super_block *sb, int *flags, char *data) 325 325 { 326 + sync_filesystem(sb); 326 327 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 327 328 return 0; 328 329 if (!(*flags & MS_RDONLY)) {
+2
fs/hpfs/super.c
··· 421 421 struct hpfs_sb_info *sbi = hpfs_sb(s); 422 422 char *new_opts = kstrdup(data, GFP_KERNEL); 423 423 424 + sync_filesystem(s); 425 + 424 426 *flags |= MS_NOATIME; 425 427 426 428 hpfs_lock(s);
+31
fs/inode.c
··· 1898 1898 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); 1899 1899 } 1900 1900 EXPORT_SYMBOL(inode_dio_done); 1901 + 1902 + /* 1903 + * inode_set_flags - atomically set some inode flags 1904 + * 1905 + * Note: the caller should be holding i_mutex, or else be sure that 1906 + * they have exclusive access to the inode structure (i.e., while the 1907 + * inode is being instantiated). The reason for the cmpxchg() loop 1908 + * --- which wouldn't be necessary if all code paths which modify 1909 + * i_flags actually followed this rule, is that there is at least one 1910 + * code path which doesn't today --- for example, 1911 + * __generic_file_aio_write() calls file_remove_suid() without holding 1912 + * i_mutex --- so we use cmpxchg() out of an abundance of caution. 1913 + * 1914 + * In the long run, i_mutex is overkill, and we should probably look 1915 + * at using the i_lock spinlock to protect i_flags, and then make sure 1916 + * it is so documented in include/linux/fs.h and that all code follows 1917 + * the locking convention!! 1918 + */ 1919 + void inode_set_flags(struct inode *inode, unsigned int flags, 1920 + unsigned int mask) 1921 + { 1922 + unsigned int old_flags, new_flags; 1923 + 1924 + WARN_ON_ONCE(flags & ~mask); 1925 + do { 1926 + old_flags = ACCESS_ONCE(inode->i_flags); 1927 + new_flags = (old_flags & ~mask) | flags; 1928 + } while (unlikely(cmpxchg(&inode->i_flags, old_flags, 1929 + new_flags) != old_flags)); 1930 + } 1931 + EXPORT_SYMBOL(inode_set_flags);
+1
fs/isofs/inode.c
··· 117 117 118 118 static int isofs_remount(struct super_block *sb, int *flags, char *data) 119 119 { 120 + sync_filesystem(sb); 120 121 if (!(*flags & MS_RDONLY)) 121 122 return -EROFS; 122 123 return 0;
+38 -39
fs/jbd2/commit.c
··· 555 555 blk_start_plug(&plug); 556 556 jbd2_journal_write_revoke_records(journal, commit_transaction, 557 557 &log_bufs, WRITE_SYNC); 558 - blk_finish_plug(&plug); 559 558 560 559 jbd_debug(3, "JBD2: commit phase 2b\n"); 561 560 ··· 581 582 err = 0; 582 583 bufs = 0; 583 584 descriptor = NULL; 584 - blk_start_plug(&plug); 585 585 while (commit_transaction->t_buffers) { 586 586 587 587 /* Find the next buffer to be journaled... */ ··· 1065 1067 goto restart_loop; 1066 1068 } 1067 1069 1070 + /* Add the transaction to the checkpoint list 1071 + * __journal_remove_checkpoint() can not destroy transaction 1072 + * under us because it is not marked as T_FINISHED yet */ 1073 + if (journal->j_checkpoint_transactions == NULL) { 1074 + journal->j_checkpoint_transactions = commit_transaction; 1075 + commit_transaction->t_cpnext = commit_transaction; 1076 + commit_transaction->t_cpprev = commit_transaction; 1077 + } else { 1078 + commit_transaction->t_cpnext = 1079 + journal->j_checkpoint_transactions; 1080 + commit_transaction->t_cpprev = 1081 + commit_transaction->t_cpnext->t_cpprev; 1082 + commit_transaction->t_cpnext->t_cpprev = 1083 + commit_transaction; 1084 + commit_transaction->t_cpprev->t_cpnext = 1085 + commit_transaction; 1086 + } 1087 + spin_unlock(&journal->j_list_lock); 1088 + 1068 1089 /* Done with this transaction! 
*/ 1069 1090 1070 1091 jbd_debug(3, "JBD2: commit phase 7\n"); ··· 1102 1085 atomic_read(&commit_transaction->t_handle_count); 1103 1086 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1104 1087 commit_transaction->t_tid, &stats.run); 1105 - 1106 - /* 1107 - * Calculate overall stats 1108 - */ 1109 - spin_lock(&journal->j_history_lock); 1110 - journal->j_stats.ts_tid++; 1111 - if (commit_transaction->t_requested) 1112 - journal->j_stats.ts_requested++; 1113 - journal->j_stats.run.rs_wait += stats.run.rs_wait; 1114 - journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; 1115 - journal->j_stats.run.rs_running += stats.run.rs_running; 1116 - journal->j_stats.run.rs_locked += stats.run.rs_locked; 1117 - journal->j_stats.run.rs_flushing += stats.run.rs_flushing; 1118 - journal->j_stats.run.rs_logging += stats.run.rs_logging; 1119 - journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; 1120 - journal->j_stats.run.rs_blocks += stats.run.rs_blocks; 1121 - journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1122 - spin_unlock(&journal->j_history_lock); 1088 + stats.ts_requested = (commit_transaction->t_requested) ? 
1 : 0; 1123 1089 1124 1090 commit_transaction->t_state = T_COMMIT_CALLBACK; 1125 1091 J_ASSERT(commit_transaction == journal->j_committing_transaction); ··· 1122 1122 1123 1123 write_unlock(&journal->j_state_lock); 1124 1124 1125 - if (journal->j_checkpoint_transactions == NULL) { 1126 - journal->j_checkpoint_transactions = commit_transaction; 1127 - commit_transaction->t_cpnext = commit_transaction; 1128 - commit_transaction->t_cpprev = commit_transaction; 1129 - } else { 1130 - commit_transaction->t_cpnext = 1131 - journal->j_checkpoint_transactions; 1132 - commit_transaction->t_cpprev = 1133 - commit_transaction->t_cpnext->t_cpprev; 1134 - commit_transaction->t_cpnext->t_cpprev = 1135 - commit_transaction; 1136 - commit_transaction->t_cpprev->t_cpnext = 1137 - commit_transaction; 1138 - } 1139 - spin_unlock(&journal->j_list_lock); 1140 - /* Drop all spin_locks because commit_callback may be block. 1141 - * __journal_remove_checkpoint() can not destroy transaction 1142 - * under us because it is not marked as T_FINISHED yet */ 1143 1125 if (journal->j_commit_callback) 1144 1126 journal->j_commit_callback(journal, commit_transaction); 1145 1127 ··· 1132 1150 write_lock(&journal->j_state_lock); 1133 1151 spin_lock(&journal->j_list_lock); 1134 1152 commit_transaction->t_state = T_FINISHED; 1135 - /* Recheck checkpoint lists after j_list_lock was dropped */ 1153 + /* Check if the transaction can be dropped now that we are finished */ 1136 1154 if (commit_transaction->t_checkpoint_list == NULL && 1137 1155 commit_transaction->t_checkpoint_io_list == NULL) { 1138 1156 __jbd2_journal_drop_transaction(journal, commit_transaction); ··· 1141 1159 spin_unlock(&journal->j_list_lock); 1142 1160 write_unlock(&journal->j_state_lock); 1143 1161 wake_up(&journal->j_wait_done_commit); 1162 + 1163 + /* 1164 + * Calculate overall stats 1165 + */ 1166 + spin_lock(&journal->j_history_lock); 1167 + journal->j_stats.ts_tid++; 1168 + journal->j_stats.ts_requested += stats.ts_requested; 
1169 + journal->j_stats.run.rs_wait += stats.run.rs_wait; 1170 + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; 1171 + journal->j_stats.run.rs_running += stats.run.rs_running; 1172 + journal->j_stats.run.rs_locked += stats.run.rs_locked; 1173 + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; 1174 + journal->j_stats.run.rs_logging += stats.run.rs_logging; 1175 + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; 1176 + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; 1177 + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1178 + spin_unlock(&journal->j_history_lock); 1144 1179 }
+5 -5
fs/jbd2/journal.c
··· 122 122 #endif 123 123 124 124 /* Checksumming functions */ 125 - int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125 + static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 126 126 { 127 127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 128 128 return 1; ··· 143 143 return cpu_to_be32(csum); 144 144 } 145 145 146 - int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146 + static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 147 147 { 148 148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 149 149 return 1; ··· 151 151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 152 152 } 153 153 154 - void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154 + static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 155 155 { 156 156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 157 157 return; ··· 302 302 journal->j_flags |= JBD2_UNMOUNT; 303 303 304 304 while (journal->j_task) { 305 - wake_up(&journal->j_wait_commit); 306 305 write_unlock(&journal->j_state_lock); 306 + wake_up(&journal->j_wait_commit); 307 307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 308 308 write_lock(&journal->j_state_lock); 309 309 } ··· 710 710 while (tid_gt(tid, journal->j_commit_sequence)) { 711 711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", 712 712 tid, journal->j_commit_sequence); 713 - wake_up(&journal->j_wait_commit); 714 713 read_unlock(&journal->j_state_lock); 714 + wake_up(&journal->j_wait_commit); 715 715 wait_event(journal->j_wait_done_commit, 716 716 !tid_gt(tid, journal->j_commit_sequence)); 717 717 read_lock(&journal->j_state_lock);
+22 -24
fs/jbd2/transaction.c
··· 1073 1073 * reused here. 1074 1074 */ 1075 1075 jbd_lock_bh_state(bh); 1076 - spin_lock(&journal->j_list_lock); 1077 1076 J_ASSERT_JH(jh, (jh->b_transaction == transaction || 1078 1077 jh->b_transaction == NULL || 1079 1078 (jh->b_transaction == journal->j_committing_transaction && ··· 1095 1096 jh->b_modified = 0; 1096 1097 1097 1098 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 1099 + spin_lock(&journal->j_list_lock); 1098 1100 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 1099 1101 } else if (jh->b_transaction == journal->j_committing_transaction) { 1100 1102 /* first access by this transaction */ 1101 1103 jh->b_modified = 0; 1102 1104 1103 1105 JBUFFER_TRACE(jh, "set next transaction"); 1106 + spin_lock(&journal->j_list_lock); 1104 1107 jh->b_next_transaction = transaction; 1105 1108 } 1106 1109 spin_unlock(&journal->j_list_lock); ··· 1313 1312 journal->j_running_transaction)) { 1314 1313 printk(KERN_ERR "JBD2: %s: " 1315 1314 "jh->b_transaction (%llu, %p, %u) != " 1316 - "journal->j_running_transaction (%p, %u)", 1315 + "journal->j_running_transaction (%p, %u)\n", 1317 1316 journal->j_devname, 1318 1317 (unsigned long long) bh->b_blocknr, 1319 1318 jh->b_transaction, ··· 1336 1335 */ 1337 1336 if (jh->b_transaction != transaction) { 1338 1337 JBUFFER_TRACE(jh, "already on other transaction"); 1339 - if (unlikely(jh->b_transaction != 1340 - journal->j_committing_transaction)) { 1341 - printk(KERN_ERR "JBD2: %s: " 1342 - "jh->b_transaction (%llu, %p, %u) != " 1343 - "journal->j_committing_transaction (%p, %u)", 1338 + if (unlikely(((jh->b_transaction != 1339 + journal->j_committing_transaction)) || 1340 + (jh->b_next_transaction != transaction))) { 1341 + printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " 1342 + "bad jh for block %llu: " 1343 + "transaction (%p, %u), " 1344 + "jh->b_transaction (%p, %u), " 1345 + "jh->b_next_transaction (%p, %u), jlist %u\n", 1344 1346 journal->j_devname, 1345 1347 (unsigned long long) bh->b_blocknr, 1348 + 
transaction, transaction->t_tid, 1346 1349 jh->b_transaction, 1347 - jh->b_transaction ? jh->b_transaction->t_tid : 0, 1348 - journal->j_committing_transaction, 1349 - journal->j_committing_transaction ? 1350 - journal->j_committing_transaction->t_tid : 0); 1351 - ret = -EINVAL; 1352 - } 1353 - if (unlikely(jh->b_next_transaction != transaction)) { 1354 - printk(KERN_ERR "JBD2: %s: " 1355 - "jh->b_next_transaction (%llu, %p, %u) != " 1356 - "transaction (%p, %u)", 1357 - journal->j_devname, 1358 - (unsigned long long) bh->b_blocknr, 1350 + jh->b_transaction ? 1351 + jh->b_transaction->t_tid : 0, 1359 1352 jh->b_next_transaction, 1360 1353 jh->b_next_transaction ? 1361 1354 jh->b_next_transaction->t_tid : 0, 1362 - transaction, transaction->t_tid); 1355 + jh->b_jlist); 1356 + WARN_ON(1); 1363 1357 ret = -EINVAL; 1364 1358 } 1365 1359 /* And this case is illegal: we can't reuse another ··· 1411 1415 BUFFER_TRACE(bh, "entry"); 1412 1416 1413 1417 jbd_lock_bh_state(bh); 1414 - spin_lock(&journal->j_list_lock); 1415 1418 1416 1419 if (!buffer_jbd(bh)) 1417 1420 goto not_jbd; ··· 1463 1468 * we know to remove the checkpoint after we commit. 
1464 1469 */ 1465 1470 1471 + spin_lock(&journal->j_list_lock); 1466 1472 if (jh->b_cp_transaction) { 1467 1473 __jbd2_journal_temp_unlink_buffer(jh); 1468 1474 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); ··· 1476 1480 goto drop; 1477 1481 } 1478 1482 } 1483 + spin_unlock(&journal->j_list_lock); 1479 1484 } else if (jh->b_transaction) { 1480 1485 J_ASSERT_JH(jh, (jh->b_transaction == 1481 1486 journal->j_committing_transaction)); ··· 1488 1491 1489 1492 if (jh->b_next_transaction) { 1490 1493 J_ASSERT(jh->b_next_transaction == transaction); 1494 + spin_lock(&journal->j_list_lock); 1491 1495 jh->b_next_transaction = NULL; 1496 + spin_unlock(&journal->j_list_lock); 1492 1497 1493 1498 /* 1494 1499 * only drop a reference if this transaction modified ··· 1502 1503 } 1503 1504 1504 1505 not_jbd: 1505 - spin_unlock(&journal->j_list_lock); 1506 1506 jbd_unlock_bh_state(bh); 1507 1507 __brelse(bh); 1508 1508 drop: ··· 1819 1821 if (buffer_locked(bh) || buffer_dirty(bh)) 1820 1822 goto out; 1821 1823 1822 - if (jh->b_next_transaction != NULL) 1824 + if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) 1823 1825 goto out; 1824 1826 1825 1827 spin_lock(&journal->j_list_lock); 1826 - if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1828 + if (jh->b_cp_transaction != NULL) { 1827 1829 /* written-back checkpointed metadata buffer */ 1828 1830 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1829 1831 __jbd2_journal_remove_checkpoint(jh);
+1
fs/jffs2/super.c
··· 243 243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 244 244 int err; 245 245 246 + sync_filesystem(sb); 246 247 err = jffs2_parse_options(c, data); 247 248 if (err) 248 249 return -EINVAL;
+1
fs/jfs/super.c
··· 418 418 int flag = JFS_SBI(sb)->flag; 419 419 int ret; 420 420 421 + sync_filesystem(sb); 421 422 if (!parse_options(data, sb, &newLVSize, &flag)) { 422 423 return -EINVAL; 423 424 }
+386 -154
fs/mbcache.c
··· 26 26 * back on the lru list. 27 27 */ 28 28 29 + /* 30 + * Lock descriptions and usage: 31 + * 32 + * Each hash chain of both the block and index hash tables now contains 33 + * a built-in lock used to serialize accesses to the hash chain. 34 + * 35 + * Accesses to global data structures mb_cache_list and mb_cache_lru_list 36 + * are serialized via the global spinlock mb_cache_spinlock. 37 + * 38 + * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize 39 + * accesses to its local data, such as e_used and e_queued. 40 + * 41 + * Lock ordering: 42 + * 43 + * Each block hash chain's lock has the highest lock order, followed by an 44 + * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's 45 + * lock), and mb_cach_spinlock, with the lowest order. While holding 46 + * either a block or index hash chain lock, a thread can acquire an 47 + * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. 48 + * 49 + * Synchronization: 50 + * 51 + * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and 52 + * index hash chian, it needs to lock the corresponding hash chain. For each 53 + * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to 54 + * prevent either any simultaneous release or free on the entry and also 55 + * to serialize accesses to either the e_used or e_queued member of the entry. 56 + * 57 + * To avoid having a dangling reference to an already freed 58 + * mb_cache_entry, an mb_cache_entry is only freed when it is not on a 59 + * block hash chain and also no longer being referenced, both e_used, 60 + * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is 61 + * first removed from a block hash chain. 
62 + */ 63 + 29 64 #include <linux/kernel.h> 30 65 #include <linux/module.h> 31 66 ··· 69 34 #include <linux/mm.h> 70 35 #include <linux/slab.h> 71 36 #include <linux/sched.h> 72 - #include <linux/init.h> 37 + #include <linux/list_bl.h> 73 38 #include <linux/mbcache.h> 74 - 39 + #include <linux/init.h> 40 + #include <linux/blockgroup_lock.h> 75 41 76 42 #ifdef MB_CACHE_DEBUG 77 43 # define mb_debug(f...) do { \ ··· 93 57 94 58 #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) 95 59 60 + #define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) 61 + #define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ 62 + (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) 63 + 96 64 static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); 97 - 65 + static struct blockgroup_lock *mb_cache_bg_lock; 66 + static struct kmem_cache *mb_cache_kmem_cache; 67 + 98 68 MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); 99 69 MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); 100 70 MODULE_LICENSE("GPL"); ··· 128 86 static LIST_HEAD(mb_cache_lru_list); 129 87 static DEFINE_SPINLOCK(mb_cache_spinlock); 130 88 89 + static inline void 90 + __spin_lock_mb_cache_entry(struct mb_cache_entry *ce) 91 + { 92 + spin_lock(bgl_lock_ptr(mb_cache_bg_lock, 93 + MB_CACHE_ENTRY_LOCK_INDEX(ce))); 94 + } 95 + 96 + static inline void 97 + __spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) 98 + { 99 + spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, 100 + MB_CACHE_ENTRY_LOCK_INDEX(ce))); 101 + } 102 + 131 103 static inline int 132 - __mb_cache_entry_is_hashed(struct mb_cache_entry *ce) 104 + __mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) 133 105 { 134 - return !list_empty(&ce->e_block_list); 106 + return !hlist_bl_unhashed(&ce->e_block_list); 135 107 } 136 108 137 109 138 - static void 139 - __mb_cache_entry_unhash(struct mb_cache_entry *ce) 110 + static inline void 111 + __mb_cache_entry_unhash_block(struct mb_cache_entry *ce) 140 112 { 141 - if (__mb_cache_entry_is_hashed(ce)) { 
142 - list_del_init(&ce->e_block_list); 143 - list_del(&ce->e_index.o_list); 144 - } 113 + if (__mb_cache_entry_is_block_hashed(ce)) 114 + hlist_bl_del_init(&ce->e_block_list); 145 115 } 146 116 117 + static inline int 118 + __mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) 119 + { 120 + return !hlist_bl_unhashed(&ce->e_index.o_list); 121 + } 122 + 123 + static inline void 124 + __mb_cache_entry_unhash_index(struct mb_cache_entry *ce) 125 + { 126 + if (__mb_cache_entry_is_index_hashed(ce)) 127 + hlist_bl_del_init(&ce->e_index.o_list); 128 + } 129 + 130 + /* 131 + * __mb_cache_entry_unhash_unlock() 132 + * 133 + * This function is called to unhash both the block and index hash 134 + * chain. 135 + * It assumes both the block and index hash chain is locked upon entry. 136 + * It also unlock both hash chains both exit 137 + */ 138 + static inline void 139 + __mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) 140 + { 141 + __mb_cache_entry_unhash_index(ce); 142 + hlist_bl_unlock(ce->e_index_hash_p); 143 + __mb_cache_entry_unhash_block(ce); 144 + hlist_bl_unlock(ce->e_block_hash_p); 145 + } 147 146 148 147 static void 149 148 __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) 150 149 { 151 150 struct mb_cache *cache = ce->e_cache; 152 151 153 - mb_assert(!(ce->e_used || ce->e_queued)); 152 + mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); 154 153 kmem_cache_free(cache->c_entry_cache, ce); 155 154 atomic_dec(&cache->c_entry_count); 156 155 } 157 156 158 - 159 157 static void 160 - __mb_cache_entry_release_unlock(struct mb_cache_entry *ce) 161 - __releases(mb_cache_spinlock) 158 + __mb_cache_entry_release(struct mb_cache_entry *ce) 162 159 { 160 + /* First lock the entry to serialize access to its local data. */ 161 + __spin_lock_mb_cache_entry(ce); 163 162 /* Wake up all processes queuing for this cache entry. 
*/ 164 163 if (ce->e_queued) 165 164 wake_up_all(&mb_cache_queue); 166 165 if (ce->e_used >= MB_CACHE_WRITER) 167 166 ce->e_used -= MB_CACHE_WRITER; 167 + /* 168 + * Make sure that all cache entries on lru_list have 169 + * both e_used and e_qued of 0s. 170 + */ 168 171 ce->e_used--; 169 - if (!(ce->e_used || ce->e_queued)) { 170 - if (!__mb_cache_entry_is_hashed(ce)) 172 + if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { 173 + if (!__mb_cache_entry_is_block_hashed(ce)) { 174 + __spin_unlock_mb_cache_entry(ce); 171 175 goto forget; 172 - mb_assert(list_empty(&ce->e_lru_list)); 173 - list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); 176 + } 177 + /* 178 + * Need access to lru list, first drop entry lock, 179 + * then reacquire the lock in the proper order. 180 + */ 181 + spin_lock(&mb_cache_spinlock); 182 + if (list_empty(&ce->e_lru_list)) 183 + list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); 184 + spin_unlock(&mb_cache_spinlock); 174 185 } 175 - spin_unlock(&mb_cache_spinlock); 186 + __spin_unlock_mb_cache_entry(ce); 176 187 return; 177 188 forget: 178 - spin_unlock(&mb_cache_spinlock); 189 + mb_assert(list_empty(&ce->e_lru_list)); 179 190 __mb_cache_entry_forget(ce, GFP_KERNEL); 180 191 } 181 - 182 192 183 193 /* 184 194 * mb_cache_shrink_scan() memory pressure callback ··· 254 160 255 161 mb_debug("trying to free %d entries", nr_to_scan); 256 162 spin_lock(&mb_cache_spinlock); 257 - while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { 163 + while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { 258 164 struct mb_cache_entry *ce = 259 165 list_entry(mb_cache_lru_list.next, 260 - struct mb_cache_entry, e_lru_list); 261 - list_move_tail(&ce->e_lru_list, &free_list); 262 - __mb_cache_entry_unhash(ce); 263 - freed++; 166 + struct mb_cache_entry, e_lru_list); 167 + list_del_init(&ce->e_lru_list); 168 + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) 169 + continue; 170 + spin_unlock(&mb_cache_spinlock); 171 + /* 
Prevent any find or get operation on the entry */ 172 + hlist_bl_lock(ce->e_block_hash_p); 173 + hlist_bl_lock(ce->e_index_hash_p); 174 + /* Ignore if it is touched by a find/get */ 175 + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || 176 + !list_empty(&ce->e_lru_list)) { 177 + hlist_bl_unlock(ce->e_index_hash_p); 178 + hlist_bl_unlock(ce->e_block_hash_p); 179 + spin_lock(&mb_cache_spinlock); 180 + continue; 181 + } 182 + __mb_cache_entry_unhash_unlock(ce); 183 + list_add_tail(&ce->e_lru_list, &free_list); 184 + spin_lock(&mb_cache_spinlock); 264 185 } 265 186 spin_unlock(&mb_cache_spinlock); 187 + 266 188 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { 267 189 __mb_cache_entry_forget(entry, gfp_mask); 190 + freed++; 268 191 } 269 192 return freed; 270 193 } ··· 326 215 int n, bucket_count = 1 << bucket_bits; 327 216 struct mb_cache *cache = NULL; 328 217 218 + if (!mb_cache_bg_lock) { 219 + mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), 220 + GFP_KERNEL); 221 + if (!mb_cache_bg_lock) 222 + return NULL; 223 + bgl_lock_init(mb_cache_bg_lock); 224 + } 225 + 329 226 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); 330 227 if (!cache) 331 228 return NULL; 332 229 cache->c_name = name; 333 230 atomic_set(&cache->c_entry_count, 0); 334 231 cache->c_bucket_bits = bucket_bits; 335 - cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 336 - GFP_KERNEL); 232 + cache->c_block_hash = kmalloc(bucket_count * 233 + sizeof(struct hlist_bl_head), GFP_KERNEL); 337 234 if (!cache->c_block_hash) 338 235 goto fail; 339 236 for (n=0; n<bucket_count; n++) 340 - INIT_LIST_HEAD(&cache->c_block_hash[n]); 341 - cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), 342 - GFP_KERNEL); 237 + INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]); 238 + cache->c_index_hash = kmalloc(bucket_count * 239 + sizeof(struct hlist_bl_head), GFP_KERNEL); 343 240 if (!cache->c_index_hash) 344 241 goto fail; 345 242 for (n=0; 
n<bucket_count; n++) 346 - INIT_LIST_HEAD(&cache->c_index_hash[n]); 347 - cache->c_entry_cache = kmem_cache_create(name, 348 - sizeof(struct mb_cache_entry), 0, 349 - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 350 - if (!cache->c_entry_cache) 351 - goto fail2; 243 + INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); 244 + if (!mb_cache_kmem_cache) { 245 + mb_cache_kmem_cache = kmem_cache_create(name, 246 + sizeof(struct mb_cache_entry), 0, 247 + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 248 + if (!mb_cache_kmem_cache) 249 + goto fail2; 250 + } 251 + cache->c_entry_cache = mb_cache_kmem_cache; 352 252 353 253 /* 354 254 * Set an upper limit on the number of cache entries so that the hash ··· 395 273 mb_cache_shrink(struct block_device *bdev) 396 274 { 397 275 LIST_HEAD(free_list); 398 - struct list_head *l, *ltmp; 276 + struct list_head *l; 277 + struct mb_cache_entry *ce, *tmp; 399 278 279 + l = &mb_cache_lru_list; 400 280 spin_lock(&mb_cache_spinlock); 401 - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 402 - struct mb_cache_entry *ce = 403 - list_entry(l, struct mb_cache_entry, e_lru_list); 281 + while (!list_is_last(l, &mb_cache_lru_list)) { 282 + l = l->next; 283 + ce = list_entry(l, struct mb_cache_entry, e_lru_list); 404 284 if (ce->e_bdev == bdev) { 405 - list_move_tail(&ce->e_lru_list, &free_list); 406 - __mb_cache_entry_unhash(ce); 285 + list_del_init(&ce->e_lru_list); 286 + if (ce->e_used || ce->e_queued || 287 + atomic_read(&ce->e_refcnt)) 288 + continue; 289 + spin_unlock(&mb_cache_spinlock); 290 + /* 291 + * Prevent any find or get operation on the entry. 
292 + */ 293 + hlist_bl_lock(ce->e_block_hash_p); 294 + hlist_bl_lock(ce->e_index_hash_p); 295 + /* Ignore if it is touched by a find/get */ 296 + if (ce->e_used || ce->e_queued || 297 + atomic_read(&ce->e_refcnt) || 298 + !list_empty(&ce->e_lru_list)) { 299 + hlist_bl_unlock(ce->e_index_hash_p); 300 + hlist_bl_unlock(ce->e_block_hash_p); 301 + l = &mb_cache_lru_list; 302 + spin_lock(&mb_cache_spinlock); 303 + continue; 304 + } 305 + __mb_cache_entry_unhash_unlock(ce); 306 + mb_assert(!(ce->e_used || ce->e_queued || 307 + atomic_read(&ce->e_refcnt))); 308 + list_add_tail(&ce->e_lru_list, &free_list); 309 + l = &mb_cache_lru_list; 310 + spin_lock(&mb_cache_spinlock); 407 311 } 408 312 } 409 313 spin_unlock(&mb_cache_spinlock); 410 - list_for_each_safe(l, ltmp, &free_list) { 411 - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 412 - e_lru_list), GFP_KERNEL); 314 + 315 + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { 316 + __mb_cache_entry_forget(ce, GFP_KERNEL); 413 317 } 414 318 } 415 319 ··· 451 303 mb_cache_destroy(struct mb_cache *cache) 452 304 { 453 305 LIST_HEAD(free_list); 454 - struct list_head *l, *ltmp; 306 + struct mb_cache_entry *ce, *tmp; 455 307 456 308 spin_lock(&mb_cache_spinlock); 457 - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 458 - struct mb_cache_entry *ce = 459 - list_entry(l, struct mb_cache_entry, e_lru_list); 460 - if (ce->e_cache == cache) { 309 + list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { 310 + if (ce->e_cache == cache) 461 311 list_move_tail(&ce->e_lru_list, &free_list); 462 - __mb_cache_entry_unhash(ce); 463 - } 464 312 } 465 313 list_del(&cache->c_cache_list); 466 314 spin_unlock(&mb_cache_spinlock); 467 315 468 - list_for_each_safe(l, ltmp, &free_list) { 469 - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 470 - e_lru_list), GFP_KERNEL); 316 + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { 317 + list_del_init(&ce->e_lru_list); 318 + /* 319 + * Prevent 
any find or get operation on the entry. 320 + */ 321 + hlist_bl_lock(ce->e_block_hash_p); 322 + hlist_bl_lock(ce->e_index_hash_p); 323 + mb_assert(!(ce->e_used || ce->e_queued || 324 + atomic_read(&ce->e_refcnt))); 325 + __mb_cache_entry_unhash_unlock(ce); 326 + __mb_cache_entry_forget(ce, GFP_KERNEL); 471 327 } 472 328 473 329 if (atomic_read(&cache->c_entry_count) > 0) { ··· 480 328 atomic_read(&cache->c_entry_count)); 481 329 } 482 330 483 - kmem_cache_destroy(cache->c_entry_cache); 484 - 331 + if (list_empty(&mb_cache_list)) { 332 + kmem_cache_destroy(mb_cache_kmem_cache); 333 + mb_cache_kmem_cache = NULL; 334 + } 485 335 kfree(cache->c_index_hash); 486 336 kfree(cache->c_block_hash); 487 337 kfree(cache); ··· 500 346 struct mb_cache_entry * 501 347 mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 502 348 { 503 - struct mb_cache_entry *ce = NULL; 349 + struct mb_cache_entry *ce; 504 350 505 351 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { 352 + struct list_head *l; 353 + 354 + l = &mb_cache_lru_list; 506 355 spin_lock(&mb_cache_spinlock); 507 - if (!list_empty(&mb_cache_lru_list)) { 508 - ce = list_entry(mb_cache_lru_list.next, 509 - struct mb_cache_entry, e_lru_list); 510 - list_del_init(&ce->e_lru_list); 511 - __mb_cache_entry_unhash(ce); 356 + while (!list_is_last(l, &mb_cache_lru_list)) { 357 + l = l->next; 358 + ce = list_entry(l, struct mb_cache_entry, e_lru_list); 359 + if (ce->e_cache == cache) { 360 + list_del_init(&ce->e_lru_list); 361 + if (ce->e_used || ce->e_queued || 362 + atomic_read(&ce->e_refcnt)) 363 + continue; 364 + spin_unlock(&mb_cache_spinlock); 365 + /* 366 + * Prevent any find or get operation on the 367 + * entry. 
368 + */ 369 + hlist_bl_lock(ce->e_block_hash_p); 370 + hlist_bl_lock(ce->e_index_hash_p); 371 + /* Ignore if it is touched by a find/get */ 372 + if (ce->e_used || ce->e_queued || 373 + atomic_read(&ce->e_refcnt) || 374 + !list_empty(&ce->e_lru_list)) { 375 + hlist_bl_unlock(ce->e_index_hash_p); 376 + hlist_bl_unlock(ce->e_block_hash_p); 377 + l = &mb_cache_lru_list; 378 + spin_lock(&mb_cache_spinlock); 379 + continue; 380 + } 381 + mb_assert(list_empty(&ce->e_lru_list)); 382 + mb_assert(!(ce->e_used || ce->e_queued || 383 + atomic_read(&ce->e_refcnt))); 384 + __mb_cache_entry_unhash_unlock(ce); 385 + goto found; 386 + } 512 387 } 513 388 spin_unlock(&mb_cache_spinlock); 514 389 } 515 - if (!ce) { 516 - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 517 - if (!ce) 518 - return NULL; 519 - atomic_inc(&cache->c_entry_count); 520 - INIT_LIST_HEAD(&ce->e_lru_list); 521 - INIT_LIST_HEAD(&ce->e_block_list); 522 - ce->e_cache = cache; 523 - ce->e_queued = 0; 524 - } 390 + 391 + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 392 + if (!ce) 393 + return NULL; 394 + atomic_inc(&cache->c_entry_count); 395 + INIT_LIST_HEAD(&ce->e_lru_list); 396 + INIT_HLIST_BL_NODE(&ce->e_block_list); 397 + INIT_HLIST_BL_NODE(&ce->e_index.o_list); 398 + ce->e_cache = cache; 399 + ce->e_queued = 0; 400 + atomic_set(&ce->e_refcnt, 0); 401 + found: 402 + ce->e_block_hash_p = &cache->c_block_hash[0]; 403 + ce->e_index_hash_p = &cache->c_index_hash[0]; 525 404 ce->e_used = 1 + MB_CACHE_WRITER; 526 405 return ce; 527 406 } ··· 580 393 { 581 394 struct mb_cache *cache = ce->e_cache; 582 395 unsigned int bucket; 583 - struct list_head *l; 584 - int error = -EBUSY; 396 + struct hlist_bl_node *l; 397 + struct hlist_bl_head *block_hash_p; 398 + struct hlist_bl_head *index_hash_p; 399 + struct mb_cache_entry *lce; 585 400 401 + mb_assert(ce); 586 402 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 587 403 cache->c_bucket_bits); 588 - spin_lock(&mb_cache_spinlock); 589 - 
list_for_each_prev(l, &cache->c_block_hash[bucket]) { 590 - struct mb_cache_entry *ce = 591 - list_entry(l, struct mb_cache_entry, e_block_list); 592 - if (ce->e_bdev == bdev && ce->e_block == block) 593 - goto out; 404 + block_hash_p = &cache->c_block_hash[bucket]; 405 + hlist_bl_lock(block_hash_p); 406 + hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { 407 + if (lce->e_bdev == bdev && lce->e_block == block) { 408 + hlist_bl_unlock(block_hash_p); 409 + return -EBUSY; 410 + } 594 411 } 595 - __mb_cache_entry_unhash(ce); 412 + mb_assert(!__mb_cache_entry_is_block_hashed(ce)); 413 + __mb_cache_entry_unhash_block(ce); 414 + __mb_cache_entry_unhash_index(ce); 596 415 ce->e_bdev = bdev; 597 416 ce->e_block = block; 598 - list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); 417 + ce->e_block_hash_p = block_hash_p; 599 418 ce->e_index.o_key = key; 419 + hlist_bl_add_head(&ce->e_block_list, block_hash_p); 420 + hlist_bl_unlock(block_hash_p); 600 421 bucket = hash_long(key, cache->c_bucket_bits); 601 - list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); 602 - error = 0; 603 - out: 604 - spin_unlock(&mb_cache_spinlock); 605 - return error; 422 + index_hash_p = &cache->c_index_hash[bucket]; 423 + hlist_bl_lock(index_hash_p); 424 + ce->e_index_hash_p = index_hash_p; 425 + hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); 426 + hlist_bl_unlock(index_hash_p); 427 + return 0; 606 428 } 607 429 608 430 ··· 625 429 void 626 430 mb_cache_entry_release(struct mb_cache_entry *ce) 627 431 { 628 - spin_lock(&mb_cache_spinlock); 629 - __mb_cache_entry_release_unlock(ce); 432 + __mb_cache_entry_release(ce); 630 433 } 631 434 632 435 633 436 /* 634 437 * mb_cache_entry_free() 635 438 * 636 - * This is equivalent to the sequence mb_cache_entry_takeout() -- 637 - * mb_cache_entry_release(). 
638 439 */ 639 440 void 640 441 mb_cache_entry_free(struct mb_cache_entry *ce) 641 442 { 642 - spin_lock(&mb_cache_spinlock); 443 + mb_assert(ce); 643 444 mb_assert(list_empty(&ce->e_lru_list)); 644 - __mb_cache_entry_unhash(ce); 645 - __mb_cache_entry_release_unlock(ce); 445 + hlist_bl_lock(ce->e_index_hash_p); 446 + __mb_cache_entry_unhash_index(ce); 447 + hlist_bl_unlock(ce->e_index_hash_p); 448 + hlist_bl_lock(ce->e_block_hash_p); 449 + __mb_cache_entry_unhash_block(ce); 450 + hlist_bl_unlock(ce->e_block_hash_p); 451 + __mb_cache_entry_release(ce); 646 452 } 647 453 648 454 ··· 661 463 sector_t block) 662 464 { 663 465 unsigned int bucket; 664 - struct list_head *l; 466 + struct hlist_bl_node *l; 665 467 struct mb_cache_entry *ce; 468 + struct hlist_bl_head *block_hash_p; 666 469 667 470 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 668 471 cache->c_bucket_bits); 669 - spin_lock(&mb_cache_spinlock); 670 - list_for_each(l, &cache->c_block_hash[bucket]) { 671 - ce = list_entry(l, struct mb_cache_entry, e_block_list); 472 + block_hash_p = &cache->c_block_hash[bucket]; 473 + /* First serialize access to the block corresponding hash chain. */ 474 + hlist_bl_lock(block_hash_p); 475 + hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { 476 + mb_assert(ce->e_block_hash_p == block_hash_p); 672 477 if (ce->e_bdev == bdev && ce->e_block == block) { 673 - DEFINE_WAIT(wait); 674 - 675 - if (!list_empty(&ce->e_lru_list)) 676 - list_del_init(&ce->e_lru_list); 677 - 678 - while (ce->e_used > 0) { 679 - ce->e_queued++; 680 - prepare_to_wait(&mb_cache_queue, &wait, 681 - TASK_UNINTERRUPTIBLE); 682 - spin_unlock(&mb_cache_spinlock); 683 - schedule(); 684 - spin_lock(&mb_cache_spinlock); 685 - ce->e_queued--; 478 + /* 479 + * Prevent a free from removing the entry. 
480 + */ 481 + atomic_inc(&ce->e_refcnt); 482 + hlist_bl_unlock(block_hash_p); 483 + __spin_lock_mb_cache_entry(ce); 484 + atomic_dec(&ce->e_refcnt); 485 + if (ce->e_used > 0) { 486 + DEFINE_WAIT(wait); 487 + while (ce->e_used > 0) { 488 + ce->e_queued++; 489 + prepare_to_wait(&mb_cache_queue, &wait, 490 + TASK_UNINTERRUPTIBLE); 491 + __spin_unlock_mb_cache_entry(ce); 492 + schedule(); 493 + __spin_lock_mb_cache_entry(ce); 494 + ce->e_queued--; 495 + } 496 + finish_wait(&mb_cache_queue, &wait); 686 497 } 687 - finish_wait(&mb_cache_queue, &wait); 688 498 ce->e_used += 1 + MB_CACHE_WRITER; 499 + __spin_unlock_mb_cache_entry(ce); 689 500 690 - if (!__mb_cache_entry_is_hashed(ce)) { 691 - __mb_cache_entry_release_unlock(ce); 501 + if (!list_empty(&ce->e_lru_list)) { 502 + spin_lock(&mb_cache_spinlock); 503 + list_del_init(&ce->e_lru_list); 504 + spin_unlock(&mb_cache_spinlock); 505 + } 506 + if (!__mb_cache_entry_is_block_hashed(ce)) { 507 + __mb_cache_entry_release(ce); 692 508 return NULL; 693 509 } 694 - goto cleanup; 510 + return ce; 695 511 } 696 512 } 697 - ce = NULL; 698 - 699 - cleanup: 700 - spin_unlock(&mb_cache_spinlock); 701 - return ce; 513 + hlist_bl_unlock(block_hash_p); 514 + return NULL; 702 515 } 703 516 704 517 #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 705 518 706 519 static struct mb_cache_entry * 707 - __mb_cache_entry_find(struct list_head *l, struct list_head *head, 520 + __mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, 708 521 struct block_device *bdev, unsigned int key) 709 522 { 710 - while (l != head) { 523 + 524 + /* The index hash chain is alredy acquire by caller. 
*/ 525 + while (l != NULL) { 711 526 struct mb_cache_entry *ce = 712 - list_entry(l, struct mb_cache_entry, e_index.o_list); 527 + hlist_bl_entry(l, struct mb_cache_entry, 528 + e_index.o_list); 529 + mb_assert(ce->e_index_hash_p == head); 713 530 if (ce->e_bdev == bdev && ce->e_index.o_key == key) { 714 - DEFINE_WAIT(wait); 715 - 716 - if (!list_empty(&ce->e_lru_list)) 717 - list_del_init(&ce->e_lru_list); 718 - 531 + /* 532 + * Prevent a free from removing the entry. 533 + */ 534 + atomic_inc(&ce->e_refcnt); 535 + hlist_bl_unlock(head); 536 + __spin_lock_mb_cache_entry(ce); 537 + atomic_dec(&ce->e_refcnt); 538 + ce->e_used++; 719 539 /* Incrementing before holding the lock gives readers 720 540 priority over writers. */ 721 - ce->e_used++; 722 - while (ce->e_used >= MB_CACHE_WRITER) { 723 - ce->e_queued++; 724 - prepare_to_wait(&mb_cache_queue, &wait, 725 - TASK_UNINTERRUPTIBLE); 726 - spin_unlock(&mb_cache_spinlock); 727 - schedule(); 728 - spin_lock(&mb_cache_spinlock); 729 - ce->e_queued--; 730 - } 731 - finish_wait(&mb_cache_queue, &wait); 541 + if (ce->e_used >= MB_CACHE_WRITER) { 542 + DEFINE_WAIT(wait); 732 543 733 - if (!__mb_cache_entry_is_hashed(ce)) { 734 - __mb_cache_entry_release_unlock(ce); 544 + while (ce->e_used >= MB_CACHE_WRITER) { 545 + ce->e_queued++; 546 + prepare_to_wait(&mb_cache_queue, &wait, 547 + TASK_UNINTERRUPTIBLE); 548 + __spin_unlock_mb_cache_entry(ce); 549 + schedule(); 550 + __spin_lock_mb_cache_entry(ce); 551 + ce->e_queued--; 552 + } 553 + finish_wait(&mb_cache_queue, &wait); 554 + } 555 + __spin_unlock_mb_cache_entry(ce); 556 + if (!list_empty(&ce->e_lru_list)) { 735 557 spin_lock(&mb_cache_spinlock); 558 + list_del_init(&ce->e_lru_list); 559 + spin_unlock(&mb_cache_spinlock); 560 + } 561 + if (!__mb_cache_entry_is_block_hashed(ce)) { 562 + __mb_cache_entry_release(ce); 736 563 return ERR_PTR(-EAGAIN); 737 564 } 738 565 return ce; 739 566 } 740 567 l = l->next; 741 568 } 569 + hlist_bl_unlock(head); 742 570 return NULL; 743 571 
} 744 572 ··· 786 562 unsigned int key) 787 563 { 788 564 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 789 - struct list_head *l; 790 - struct mb_cache_entry *ce; 565 + struct hlist_bl_node *l; 566 + struct mb_cache_entry *ce = NULL; 567 + struct hlist_bl_head *index_hash_p; 791 568 792 - spin_lock(&mb_cache_spinlock); 793 - l = cache->c_index_hash[bucket].next; 794 - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 795 - spin_unlock(&mb_cache_spinlock); 569 + index_hash_p = &cache->c_index_hash[bucket]; 570 + hlist_bl_lock(index_hash_p); 571 + if (!hlist_bl_empty(index_hash_p)) { 572 + l = hlist_bl_first(index_hash_p); 573 + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); 574 + } else 575 + hlist_bl_unlock(index_hash_p); 796 576 return ce; 797 577 } 798 578 ··· 825 597 { 826 598 struct mb_cache *cache = prev->e_cache; 827 599 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 828 - struct list_head *l; 600 + struct hlist_bl_node *l; 829 601 struct mb_cache_entry *ce; 602 + struct hlist_bl_head *index_hash_p; 830 603 831 - spin_lock(&mb_cache_spinlock); 604 + index_hash_p = &cache->c_index_hash[bucket]; 605 + mb_assert(prev->e_index_hash_p == index_hash_p); 606 + hlist_bl_lock(index_hash_p); 607 + mb_assert(!hlist_bl_empty(index_hash_p)); 832 608 l = prev->e_index.o_list.next; 833 - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 834 - __mb_cache_entry_release_unlock(prev); 609 + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); 610 + __mb_cache_entry_release(prev); 835 611 return ce; 836 612 } 837 613
+1
fs/minix/inode.c
··· 123 123 struct minix_sb_info * sbi = minix_sb(sb); 124 124 struct minix_super_block * ms; 125 125 126 + sync_filesystem(sb); 126 127 ms = sbi->s_ms; 127 128 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 128 129 return 0;
+1
fs/ncpfs/inode.c
··· 99 99 100 100 static int ncp_remount(struct super_block *sb, int *flags, char* data) 101 101 { 102 + sync_filesystem(sb); 102 103 *flags |= MS_NODIRATIME; 103 104 return 0; 104 105 }
+2
fs/nfs/super.c
··· 2215 2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; 2216 2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version; 2217 2217 2218 + sync_filesystem(sb); 2219 + 2218 2220 /* 2219 2221 * Userspace mount programs that send binary options generally send 2220 2222 * them populated with default values. We have no way to know which
+1
fs/nilfs2/super.c
··· 1129 1129 unsigned long old_mount_opt; 1130 1130 int err; 1131 1131 1132 + sync_filesystem(sb); 1132 1133 old_sb_flags = sb->s_flags; 1133 1134 old_mount_opt = nilfs->ns_mount_opt; 1134 1135
+2
fs/ntfs/super.c
··· 468 468 469 469 ntfs_debug("Entering with remount options string: %s", opt); 470 470 471 + sync_filesystem(sb); 472 + 471 473 #ifndef NTFS_RW 472 474 /* For read-only compiled driver, enforce read-only flag. */ 473 475 *flags |= MS_RDONLY;
+2
fs/ocfs2/super.c
··· 634 634 struct ocfs2_super *osb = OCFS2_SB(sb); 635 635 u32 tmp; 636 636 637 + sync_filesystem(sb); 638 + 637 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 638 640 !ocfs2_check_set_options(sb, &parsed_options)) { 639 641 ret = -EINVAL;
+1
fs/openpromfs/inode.c
··· 368 368 369 369 static int openprom_remount(struct super_block *sb, int *flags, char *data) 370 370 { 371 + sync_filesystem(sb); 371 372 *flags |= MS_NOATIME; 372 373 return 0; 373 374 }
+2
fs/proc/root.c
··· 92 92 int proc_remount(struct super_block *sb, int *flags, char *data) 93 93 { 94 94 struct pid_namespace *pid = sb->s_fs_info; 95 + 96 + sync_filesystem(sb); 95 97 return !proc_parse_options(data, pid); 96 98 } 97 99
+1
fs/pstore/inode.c
··· 249 249 250 250 static int pstore_remount(struct super_block *sb, int *flags, char *data) 251 251 { 252 + sync_filesystem(sb); 252 253 parse_options(data); 253 254 254 255 return 0;
+1
fs/qnx4/inode.c
··· 44 44 { 45 45 struct qnx4_sb_info *qs; 46 46 47 + sync_filesystem(sb); 47 48 qs = qnx4_sb(sb); 48 49 qs->Version = QNX4_VERSION; 49 50 *flags |= MS_RDONLY;
+1
fs/qnx6/inode.c
··· 55 55 56 56 static int qnx6_remount(struct super_block *sb, int *flags, char *data) 57 57 { 58 + sync_filesystem(sb); 58 59 *flags |= MS_RDONLY; 59 60 return 0; 60 61 }
+1
fs/reiserfs/super.c
··· 1318 1318 int i; 1319 1319 #endif 1320 1320 1321 + sync_filesystem(s); 1321 1322 reiserfs_write_lock(s); 1322 1323 1323 1324 #ifdef CONFIG_QUOTA
+1
fs/romfs/super.c
··· 432 432 */ 433 433 static int romfs_remount(struct super_block *sb, int *flags, char *data) 434 434 { 435 + sync_filesystem(sb); 435 436 *flags |= MS_RDONLY; 436 437 return 0; 437 438 }
+1
fs/squashfs/super.c
··· 371 371 372 372 static int squashfs_remount(struct super_block *sb, int *flags, char *data) 373 373 { 374 + sync_filesystem(sb); 374 375 *flags |= MS_RDONLY; 375 376 return 0; 376 377 }
-2
fs/super.c
··· 719 719 } 720 720 } 721 721 722 - sync_filesystem(sb); 723 - 724 722 if (sb->s_op->remount_fs) { 725 723 retval = sb->s_op->remount_fs(sb, &flags, data); 726 724 if (retval) {
+1
fs/sysv/inode.c
··· 60 60 { 61 61 struct sysv_sb_info *sbi = SYSV_SB(sb); 62 62 63 + sync_filesystem(sb); 63 64 if (sbi->s_forced_ro) 64 65 *flags |= MS_RDONLY; 65 66 return 0;
+1
fs/ubifs/super.c
··· 1827 1827 int err; 1828 1828 struct ubifs_info *c = sb->s_fs_info; 1829 1829 1830 + sync_filesystem(sb); 1830 1831 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); 1831 1832 1832 1833 err = ubifs_parse_options(c, data, 1);
+1
fs/udf/super.c
··· 646 646 int error = 0; 647 647 struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); 648 648 649 + sync_filesystem(sb); 649 650 if (lvidiu) { 650 651 int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); 651 652 if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
+1
fs/ufs/super.c
··· 1280 1280 unsigned new_mount_opt, ufstype; 1281 1281 unsigned flags; 1282 1282 1283 + sync_filesystem(sb); 1283 1284 lock_ufs(sb); 1284 1285 mutex_lock(&UFS_SB(sb)->s_lock); 1285 1286 uspi = UFS_SB(sb)->s_uspi;
+1
fs/xfs/xfs_super.c
··· 1197 1197 char *p; 1198 1198 int error; 1199 1199 1200 + sync_filesystem(sb); 1200 1201 while ((p = strsep(&options, ",")) != NULL) { 1201 1202 int token; 1202 1203
+3
include/linux/fs.h
··· 2572 2572 void inode_dio_wait(struct inode *inode); 2573 2573 void inode_dio_done(struct inode *inode); 2574 2574 2575 + extern void inode_set_flags(struct inode *inode, unsigned int flags, 2576 + unsigned int mask); 2577 + 2575 2578 extern const struct file_operations generic_ro_fops; 2576 2579 2577 2580 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
+7 -5
include/linux/mbcache.h
··· 3 3 4 4 (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 5 */ 6 - 7 6 struct mb_cache_entry { 8 7 struct list_head e_lru_list; 9 8 struct mb_cache *e_cache; 10 9 unsigned short e_used; 11 10 unsigned short e_queued; 11 + atomic_t e_refcnt; 12 12 struct block_device *e_bdev; 13 13 sector_t e_block; 14 - struct list_head e_block_list; 14 + struct hlist_bl_node e_block_list; 15 15 struct { 16 - struct list_head o_list; 16 + struct hlist_bl_node o_list; 17 17 unsigned int o_key; 18 18 } e_index; 19 + struct hlist_bl_head *e_block_hash_p; 20 + struct hlist_bl_head *e_index_hash_p; 19 21 }; 20 22 21 23 struct mb_cache { ··· 27 25 int c_max_entries; 28 26 int c_bucket_bits; 29 27 struct kmem_cache *c_entry_cache; 30 - struct list_head *c_block_hash; 31 - struct list_head *c_index_hash; 28 + struct hlist_bl_head *c_block_hash; 29 + struct hlist_bl_head *c_index_hash; 32 30 }; 33 31 34 32 /* Functions on caches */
+70 -32
include/trace/events/ext4.h
··· 16 16 struct ext4_map_blocks; 17 17 struct extent_status; 18 18 19 + /* shim until we merge in the xfs_collapse_range branch */ 20 + #ifndef FALLOC_FL_COLLAPSE_RANGE 21 + #define FALLOC_FL_COLLAPSE_RANGE 0x08 22 + #endif 23 + 24 + #ifndef FALLOC_FL_ZERO_RANGE 25 + #define FALLOC_FL_ZERO_RANGE 0x10 26 + #endif 27 + 19 28 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) 20 29 21 30 #define show_mballoc_flags(flags) __print_flags(flags, "|", \ ··· 76 67 { EXTENT_STATUS_UNWRITTEN, "U" }, \ 77 68 { EXTENT_STATUS_DELAYED, "D" }, \ 78 69 { EXTENT_STATUS_HOLE, "H" }) 70 + 71 + #define show_falloc_mode(mode) __print_flags(mode, "|", \ 72 + { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ 73 + { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ 74 + { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \ 75 + { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ 76 + { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) 79 77 80 78 81 79 TRACE_EVENT(ext4_free_inode, ··· 1344 1328 __entry->rw, __entry->ret) 1345 1329 ); 1346 1330 1347 - TRACE_EVENT(ext4_fallocate_enter, 1331 + DECLARE_EVENT_CLASS(ext4__fallocate_mode, 1348 1332 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1349 1333 1350 1334 TP_ARGS(inode, offset, len, mode), ··· 1352 1336 TP_STRUCT__entry( 1353 1337 __field( dev_t, dev ) 1354 1338 __field( ino_t, ino ) 1355 - __field( loff_t, pos ) 1356 - __field( loff_t, len ) 1339 + __field( loff_t, offset ) 1340 + __field( loff_t, len ) 1357 1341 __field( int, mode ) 1358 1342 ), 1359 1343 1360 1344 TP_fast_assign( 1361 1345 __entry->dev = inode->i_sb->s_dev; 1362 1346 __entry->ino = inode->i_ino; 1363 - __entry->pos = offset; 1347 + __entry->offset = offset; 1364 1348 __entry->len = len; 1365 1349 __entry->mode = mode; 1366 1350 ), 1367 1351 1368 - TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", 1352 + TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s", 1369 1353 MAJOR(__entry->dev), MINOR(__entry->dev), 1370 - (unsigned long) __entry->ino, 
__entry->pos, 1371 - __entry->len, __entry->mode) 1354 + (unsigned long) __entry->ino, 1355 + __entry->offset, __entry->len, 1356 + show_falloc_mode(__entry->mode)) 1357 + ); 1358 + 1359 + DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter, 1360 + 1361 + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1362 + 1363 + TP_ARGS(inode, offset, len, mode) 1364 + ); 1365 + 1366 + DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole, 1367 + 1368 + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1369 + 1370 + TP_ARGS(inode, offset, len, mode) 1371 + ); 1372 + 1373 + DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range, 1374 + 1375 + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), 1376 + 1377 + TP_ARGS(inode, offset, len, mode) 1372 1378 ); 1373 1379 1374 1380 TRACE_EVENT(ext4_fallocate_exit, ··· 1420 1382 (unsigned long) __entry->ino, 1421 1383 __entry->pos, __entry->blocks, 1422 1384 __entry->ret) 1423 - ); 1424 - 1425 - TRACE_EVENT(ext4_punch_hole, 1426 - TP_PROTO(struct inode *inode, loff_t offset, loff_t len), 1427 - 1428 - TP_ARGS(inode, offset, len), 1429 - 1430 - TP_STRUCT__entry( 1431 - __field( dev_t, dev ) 1432 - __field( ino_t, ino ) 1433 - __field( loff_t, offset ) 1434 - __field( loff_t, len ) 1435 - ), 1436 - 1437 - TP_fast_assign( 1438 - __entry->dev = inode->i_sb->s_dev; 1439 - __entry->ino = inode->i_ino; 1440 - __entry->offset = offset; 1441 - __entry->len = len; 1442 - ), 1443 - 1444 - TP_printk("dev %d,%d ino %lu offset %lld len %lld", 1445 - MAJOR(__entry->dev), MINOR(__entry->dev), 1446 - (unsigned long) __entry->ino, 1447 - __entry->offset, __entry->len) 1448 1385 ); 1449 1386 1450 1387 TRACE_EVENT(ext4_unlink_enter, ··· 2421 2408 TP_printk("dev %d,%d shrunk_nr %d cache_cnt %d", 2422 2409 MAJOR(__entry->dev), MINOR(__entry->dev), 2423 2410 __entry->shrunk_nr, __entry->cache_cnt) 2411 + ); 2412 + 2413 + TRACE_EVENT(ext4_collapse_range, 2414 + TP_PROTO(struct inode *inode, loff_t offset, loff_t 
len), 2415 + 2416 + TP_ARGS(inode, offset, len), 2417 + 2418 + TP_STRUCT__entry( 2419 + __field(dev_t, dev) 2420 + __field(ino_t, ino) 2421 + __field(loff_t, offset) 2422 + __field(loff_t, len) 2423 + ), 2424 + 2425 + TP_fast_assign( 2426 + __entry->dev = inode->i_sb->s_dev; 2427 + __entry->ino = inode->i_ino; 2428 + __entry->offset = offset; 2429 + __entry->len = len; 2430 + ), 2431 + 2432 + TP_printk("dev %d,%d ino %lu offset %lld len %lld", 2433 + MAJOR(__entry->dev), MINOR(__entry->dev), 2434 + (unsigned long) __entry->ino, 2435 + __entry->offset, __entry->len) 2424 2436 ); 2425 2437 2426 2438 #endif /* _TRACE_EXT4_H */