Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
Btrfs: always pin metadata in discard mode
Btrfs: enable discard support
Btrfs: add -o discard option
Btrfs: properly wait log writers during log sync
Btrfs: fix possible ENOSPC problems with truncate
Btrfs: fix btrfs acl #ifdef checks
Btrfs: streamline tree-log btree block writeout
Btrfs: avoid tree log commit when there are no changes
Btrfs: only write one super copy during fsync

+179 -42
+3 -3
fs/btrfs/acl.c
··· 27 27 #include "btrfs_inode.h" 28 28 #include "xattr.h" 29 29 30 - #ifdef CONFIG_BTRFS_POSIX_ACL 30 + #ifdef CONFIG_BTRFS_FS_POSIX_ACL 31 31 32 32 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 33 33 { ··· 313 313 .set = btrfs_xattr_acl_access_set, 314 314 }; 315 315 316 - #else /* CONFIG_BTRFS_POSIX_ACL */ 316 + #else /* CONFIG_BTRFS_FS_POSIX_ACL */ 317 317 318 318 int btrfs_acl_chmod(struct inode *inode) 319 319 { ··· 325 325 return 0; 326 326 } 327 327 328 - #endif /* CONFIG_BTRFS_POSIX_ACL */ 328 + #endif /* CONFIG_BTRFS_FS_POSIX_ACL */
+6
fs/btrfs/btrfs_inode.h
··· 86 86 * transid of the trans_handle that last modified this inode 87 87 */ 88 88 u64 last_trans; 89 + 90 + /* 91 + * log transid when this inode was last modified 92 + */ 93 + u64 last_sub_trans; 94 + 89 95 /* 90 96 * transid that last logged this inode 91 97 */
+3 -1
fs/btrfs/ctree.h
··· 1009 1009 atomic_t log_writers; 1010 1010 atomic_t log_commit[2]; 1011 1011 unsigned long log_transid; 1012 + unsigned long last_log_commit; 1012 1013 unsigned long log_batch; 1013 1014 pid_t log_start_pid; 1014 1015 bool log_multiple_pids; ··· 1153 1152 #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) 1154 1153 #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) 1155 1154 #define BTRFS_MOUNT_NOSSD (1 << 9) 1155 + #define BTRFS_MOUNT_DISCARD (1 << 10) 1156 1156 1157 1157 #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1158 1158 #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) ··· 2375 2373 int btrfs_sync_fs(struct super_block *sb, int wait); 2376 2374 2377 2375 /* acl.c */ 2378 - #ifdef CONFIG_BTRFS_POSIX_ACL 2376 + #ifdef CONFIG_BTRFS_FS_POSIX_ACL 2379 2377 int btrfs_check_acl(struct inode *inode, int mask); 2380 2378 #else 2381 2379 #define btrfs_check_acl NULL
+2
fs/btrfs/disk-io.c
··· 917 917 atomic_set(&root->log_writers, 0); 918 918 root->log_batch = 0; 919 919 root->log_transid = 0; 920 + root->last_log_commit = 0; 920 921 extent_io_tree_init(&root->dirty_log_pages, 921 922 fs_info->btree_inode->i_mapping, GFP_NOFS); 922 923 ··· 1088 1087 WARN_ON(root->log_root); 1089 1088 root->log_root = log_root; 1090 1089 root->log_transid = 0; 1090 + root->last_log_commit = 0; 1091 1091 return 0; 1092 1092 } 1093 1093
+11 -6
fs/btrfs/extent-tree.c
··· 1568 1568 return ret; 1569 1569 } 1570 1570 1571 - #ifdef BIO_RW_DISCARD 1572 1571 static void btrfs_issue_discard(struct block_device *bdev, 1573 1572 u64 start, u64 len) 1574 1573 { 1575 1574 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1576 1575 DISCARD_FL_BARRIER); 1577 1576 } 1578 - #endif 1579 1577 1580 1578 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1581 1579 u64 num_bytes) 1582 1580 { 1583 - #ifdef BIO_RW_DISCARD 1584 1581 int ret; 1585 1582 u64 map_length = num_bytes; 1586 1583 struct btrfs_multi_bio *multi = NULL; 1584 + 1585 + if (!btrfs_test_opt(root, DISCARD)) 1586 + return 0; 1587 1587 1588 1588 /* Tell the block device(s) that the sectors can be discarded */ 1589 1589 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, ··· 1604 1604 } 1605 1605 1606 1606 return ret; 1607 - #else 1608 - return 0; 1609 - #endif 1610 1607 } 1611 1608 1612 1609 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ··· 3685 3688 struct extent_buffer *buf; 3686 3689 3687 3690 if (is_data) 3691 + goto pinit; 3692 + 3693 + /* 3694 + * discard is sloooow, and so triggering discards on 3695 + * individual btree blocks isn't a good plan. Just 3696 + * pin everything in discard mode. 3697 + */ 3698 + if (btrfs_test_opt(root, DISCARD)) 3688 3699 goto pinit; 3689 3700 3690 3701 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
+26 -15
fs/btrfs/file.c
··· 1086 1086 btrfs_end_transaction(trans, root); 1087 1087 else 1088 1088 btrfs_commit_transaction(trans, root); 1089 - } else { 1089 + } else if (ret != BTRFS_NO_LOG_SYNC) { 1090 1090 btrfs_commit_transaction(trans, root); 1091 + } else { 1092 + btrfs_end_transaction(trans, root); 1091 1093 } 1092 1094 } 1093 1095 if (file->f_flags & O_DIRECT) { ··· 1139 1137 int ret = 0; 1140 1138 struct btrfs_trans_handle *trans; 1141 1139 1140 + 1141 + /* we wait first, since the writeback may change the inode */ 1142 + root->log_batch++; 1143 + /* the VFS called filemap_fdatawrite for us */ 1144 + btrfs_wait_ordered_range(inode, 0, (u64)-1); 1145 + root->log_batch++; 1146 + 1142 1147 /* 1143 1148 * check the transaction that last modified this inode 1144 1149 * and see if its already been committed ··· 1153 1144 if (!BTRFS_I(inode)->last_trans) 1154 1145 goto out; 1155 1146 1147 + /* 1148 + * if the last transaction that changed this file was before 1149 + * the current transaction, we can bail out now without any 1150 + * syncing 1151 + */ 1156 1152 mutex_lock(&root->fs_info->trans_mutex); 1157 1153 if (BTRFS_I(inode)->last_trans <= 1158 1154 root->fs_info->last_trans_committed) { ··· 1167 1153 } 1168 1154 mutex_unlock(&root->fs_info->trans_mutex); 1169 1155 1170 - root->log_batch++; 1171 - filemap_fdatawrite(inode->i_mapping); 1172 - btrfs_wait_ordered_range(inode, 0, (u64)-1); 1173 - root->log_batch++; 1174 - 1175 - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) 1176 - goto out; 1177 1156 /* 1178 1157 * ok we haven't committed the transaction yet, lets do a commit 1179 1158 */ ··· 1195 1188 */ 1196 1189 mutex_unlock(&dentry->d_inode->i_mutex); 1197 1190 1198 - if (ret > 0) { 1199 - ret = btrfs_commit_transaction(trans, root); 1200 - } else { 1201 - ret = btrfs_sync_log(trans, root); 1202 - if (ret == 0) 1203 - ret = btrfs_end_transaction(trans, root); 1204 - else 1191 + if (ret != BTRFS_NO_LOG_SYNC) { 1192 + if (ret > 0) { 1205 1193 ret = btrfs_commit_transaction(trans, root); 1194 + } else { 1195 + ret = btrfs_sync_log(trans, root); 1196 + if (ret == 0) 1197 + ret = btrfs_end_transaction(trans, root); 1198 + else 1199 + ret = btrfs_commit_transaction(trans, root); 1200 + } 1201 + } else { 1202 + ret = btrfs_end_transaction(trans, root); 1206 1203 } 1207 1204 mutex_lock(&dentry->d_inode->i_mutex); 1208 1205 out:
+29 -4
fs/btrfs/inode.c
··· 3032 3032 3033 3033 if ((offset & (blocksize - 1)) == 0) 3034 3034 goto out; 3035 + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3036 + if (ret) 3037 + goto out; 3038 + 3039 + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); 3040 + if (ret) 3041 + goto out; 3035 3042 3036 3043 ret = -ENOMEM; 3037 3044 again: 3038 3045 page = grab_cache_page(mapping, index); 3039 - if (!page) 3046 + if (!page) { 3047 + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3048 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 3040 3049 goto out; 3050 + } 3041 3051 3042 3052 page_start = page_offset(page); 3043 3053 page_end = page_start + PAGE_CACHE_SIZE - 1; ··· 3080 3070 goto again; 3081 3071 } 3082 3072 3073 + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 3074 + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3075 + GFP_NOFS); 3076 + 3083 3077 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 3084 3078 if (ret) { 3085 3079 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); ··· 3102 3088 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3103 3089 3104 3090 out_unlock: 3091 + if (ret) 3092 + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3093 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 3105 3094 unlock_page(page); 3106 3095 page_cache_release(page); 3107 3096 out: ··· 3128 3111 if (size <= hole_start) 3129 3112 return 0; 3130 3113 3131 - btrfs_truncate_page(inode->i_mapping, inode->i_size); 3114 + err = btrfs_truncate_page(inode->i_mapping, inode->i_size); 3115 + if (err) 3116 + return err; 3132 3117 3133 3118 while (1) { 3134 3119 struct btrfs_ordered_extent *ordered; ··· 3499 3480 bi->generation = 0; 3500 3481 bi->sequence = 0; 3501 3482 bi->last_trans = 0; 3483 + bi->last_sub_trans = 0; 3502 3484 bi->logged_trans = 0; 3503 3485 bi->delalloc_bytes = 0; 3504 3486 bi->reserved_bytes = 0; ··· 5000 4980 set_page_dirty(page); 5001 4981 SetPageUptodate(page); 
5002 4982 5003 - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 4983 + BTRFS_I(inode)->last_trans = root->fs_info->generation; 4984 + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 4985 + 5004 4986 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5005 4987 5006 4988 out_unlock: ··· 5027 5005 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 5028 5006 return; 5029 5007 5030 - btrfs_truncate_page(inode->i_mapping, inode->i_size); 5008 + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 5009 + if (ret) 5010 + return; 5031 5011 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 5032 5012 5033 5013 trans = btrfs_start_transaction(root, 1); ··· 5124 5100 if (!ei) 5125 5101 return NULL; 5126 5102 ei->last_trans = 0; 5103 + ei->last_sub_trans = 0; 5127 5104 ei->logged_trans = 0; 5128 5105 ei->outstanding_extents = 0; 5129 5106 ei->reserved_extents = 0;
+7 -2
fs/btrfs/super.c
··· 66 66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 67 67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 68 68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 69 - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, 69 + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 70 + Opt_discard, Opt_err, 70 71 }; 71 72 72 73 static match_table_t tokens = { ··· 89 88 {Opt_notreelog, "notreelog"}, 90 89 {Opt_flushoncommit, "flushoncommit"}, 91 90 {Opt_ratio, "metadata_ratio=%d"}, 91 + {Opt_discard, "discard"}, 92 92 {Opt_err, NULL}, 93 93 }; 94 94 ··· 259 257 info->metadata_ratio); 260 258 } 261 259 break; 260 + case Opt_discard: 261 + btrfs_set_opt(info->mount_opt, DISCARD); 262 + break; 262 263 default: 263 264 break; 264 265 } ··· 349 344 sb->s_export_op = &btrfs_export_ops; 350 345 sb->s_xattr = btrfs_xattr_handlers; 351 346 sb->s_time_gran = 1; 352 - #ifdef CONFIG_BTRFS_POSIX_ACL 347 + #ifdef CONFIG_BTRFS_FS_POSIX_ACL 353 348 sb->s_flags |= MS_POSIXACL; 354 349 #endif 355 350
+42 -3
fs/btrfs/transaction.c
··· 344 344 /* 345 345 * when btree blocks are allocated, they have some corresponding bits set for 346 346 * them in one of two extent_io trees. This is used to make sure all of 347 - * those extents are on disk for transaction or log commit 347 + * those extents are sent to disk but does not wait on them 348 348 */ 349 - int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 350 - struct extent_io_tree *dirty_pages) 349 + int btrfs_write_marked_extents(struct btrfs_root *root, 350 + struct extent_io_tree *dirty_pages) 351 351 { 352 352 int ret; 353 353 int err = 0; ··· 394 394 page_cache_release(page); 395 395 } 396 396 } 397 + if (err) 398 + werr = err; 399 + return werr; 400 + } 401 + 402 + /* 403 + * when btree blocks are allocated, they have some corresponding bits set for 404 + * them in one of two extent_io trees. This is used to make sure all of 405 + * those extents are on disk for transaction or log commit. We wait 406 + * on all the pages and clear them from the dirty pages state tree 407 + */ 408 + int btrfs_wait_marked_extents(struct btrfs_root *root, 409 + struct extent_io_tree *dirty_pages) 410 + { 411 + int ret; 412 + int err = 0; 413 + int werr = 0; 414 + struct page *page; 415 + struct inode *btree_inode = root->fs_info->btree_inode; 416 + u64 start = 0; 417 + u64 end; 418 + unsigned long index; 419 + 397 420 while (1) { 398 421 ret = find_first_extent_bit(dirty_pages, 0, &start, &end, EXTENT_DIRTY); ··· 445 422 if (err) 446 423 werr = err; 447 424 return werr; 425 + } 426 + 427 + /* 428 + * when btree blocks are allocated, they have some corresponding bits set for 429 + * them in one of two extent_io trees. This is used to make sure all of 430 + * those extents are on disk for transaction or log commit 431 + */ 432 + int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 433 + struct extent_io_tree *dirty_pages) 434 + { 435 + int ret; 436 + int ret2; 437 + 438 + ret = btrfs_write_marked_extents(root, dirty_pages); 439 + ret2 = btrfs_wait_marked_extents(root, dirty_pages); 440 + return ret || ret2; 448 441 } 449 442 450 443 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+5
fs/btrfs/transaction.h
··· 79 79 struct inode *inode) 80 80 { 81 81 BTRFS_I(inode)->last_trans = trans->transaction->transid; 82 + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 82 83 } 83 84 84 85 int btrfs_end_transaction(struct btrfs_trans_handle *trans, ··· 107 106 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 108 107 struct btrfs_root *root); 109 108 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 109 + struct extent_io_tree *dirty_pages); 110 + int btrfs_write_marked_extents(struct btrfs_root *root, 111 + struct extent_io_tree *dirty_pages); 112 + int btrfs_wait_marked_extents(struct btrfs_root *root, 110 113 struct extent_io_tree *dirty_pages); 111 114 int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 112 115 #endif
+41 -7
fs/btrfs/tree-log.c
··· 1980 1980 int ret; 1981 1981 struct btrfs_root *log = root->log_root; 1982 1982 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1983 + u64 log_transid = 0; 1983 1984 1984 1985 mutex_lock(&root->log_mutex); 1985 1986 index1 = root->log_transid % 2; ··· 1995 1994 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 1996 1995 wait_log_commit(trans, root, root->log_transid - 1); 1997 1996 1998 - while (root->log_multiple_pids) { 1997 + while (1) { 1999 1998 unsigned long batch = root->log_batch; 2000 - mutex_unlock(&root->log_mutex); 2001 - schedule_timeout_uninterruptible(1); 2002 - mutex_lock(&root->log_mutex); 2003 - 1999 + if (root->log_multiple_pids) { 2000 + mutex_unlock(&root->log_mutex); 2001 + schedule_timeout_uninterruptible(1); 2002 + mutex_lock(&root->log_mutex); 2003 + } 2004 2004 wait_for_writer(trans, root); 2005 2005 if (batch == root->log_batch) 2006 2006 break; ··· 2014 2012 goto out; 2015 2013 } 2016 2014 2017 - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2015 + /* we start IO on all the marked extents here, but we don't actually 2016 + * wait for them until later. 
2017 + */ 2018 + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); 2018 2019 BUG_ON(ret); 2019 2020 2020 2021 btrfs_set_root_node(&log->root_item, log->node); 2021 2022 2022 2023 root->log_batch = 0; 2024 + log_transid = root->log_transid; 2023 2025 root->log_transid++; 2024 2026 log->log_transid = root->log_transid; 2025 2027 root->log_start_pid = 0; ··· 2052 2046 2053 2047 index2 = log_root_tree->log_transid % 2; 2054 2048 if (atomic_read(&log_root_tree->log_commit[index2])) { 2049 + btrfs_wait_marked_extents(log, &log->dirty_log_pages); 2055 2050 wait_log_commit(trans, log_root_tree, 2056 2051 log_root_tree->log_transid); 2057 2052 mutex_unlock(&log_root_tree->log_mutex); ··· 2072 2065 * check the full commit flag again 2073 2066 */ 2074 2067 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2068 + btrfs_wait_marked_extents(log, &log->dirty_log_pages); 2075 2069 mutex_unlock(&log_root_tree->log_mutex); 2076 2070 ret = -EAGAIN; 2077 2071 goto out_wake_log_root; ··· 2081 2073 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2082 2074 &log_root_tree->dirty_log_pages); 2083 2075 BUG_ON(ret); 2076 + btrfs_wait_marked_extents(log, &log->dirty_log_pages); 2084 2077 2085 2078 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2086 2079 log_root_tree->node->start); ··· 2101 2092 * the running transaction open, so a full commit can't hop 2102 2093 * in and cause problems either. 
2103 2094 */ 2104 - write_ctree_super(trans, root->fs_info->tree_root, 2); 2095 + write_ctree_super(trans, root->fs_info->tree_root, 1); 2105 2096 ret = 0; 2097 + 2098 + mutex_lock(&root->log_mutex); 2099 + if (root->last_log_commit < log_transid) 2100 + root->last_log_commit = log_transid; 2101 + mutex_unlock(&root->log_mutex); 2106 2102 2107 2103 out_wake_log_root: 2108 2104 atomic_set(&log_root_tree->log_commit[index2], 0); ··· 2876 2862 return ret; 2877 2863 } 2878 2864 2865 + static int inode_in_log(struct btrfs_trans_handle *trans, 2866 + struct inode *inode) 2867 + { 2868 + struct btrfs_root *root = BTRFS_I(inode)->root; 2869 + int ret = 0; 2870 + 2871 + mutex_lock(&root->log_mutex); 2872 + if (BTRFS_I(inode)->logged_trans == trans->transid && 2873 + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 2874 + ret = 1; 2875 + mutex_unlock(&root->log_mutex); 2876 + return ret; 2877 + } 2878 + 2879 + 2879 2880 /* 2880 2881 * helper function around btrfs_log_inode to make sure newly created 2881 2882 * parent directories also end up in the log. A minimal inode and backref ··· 2929 2900 sb, last_committed); 2930 2901 if (ret) 2931 2902 goto end_no_trans; 2903 + 2904 + if (inode_in_log(trans, inode)) { 2905 + ret = BTRFS_NO_LOG_SYNC; 2906 + goto end_no_trans; 2907 + } 2932 2908 2933 2909 start_log_trans(trans, root); 2934 2910
+3
fs/btrfs/tree-log.h
··· 19 19 #ifndef __TREE_LOG_ 20 20 #define __TREE_LOG_ 21 21 22 + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ 23 + #define BTRFS_NO_LOG_SYNC 256 24 + 22 25 int btrfs_sync_log(struct btrfs_trans_handle *trans, 23 26 struct btrfs_root *root); 24 27 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+1 -1
fs/btrfs/xattr.c
··· 260 260 * attributes are handled directly. 261 261 */ 262 262 struct xattr_handler *btrfs_xattr_handlers[] = { 263 - #ifdef CONFIG_BTRFS_POSIX_ACL 263 + #ifdef CONFIG_BTRFS_FS_POSIX_ACL 264 264 &btrfs_xattr_acl_access_handler, 265 265 &btrfs_xattr_acl_default_handler, 266 266 #endif