Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs updates from Chris Mason:
"Outside of our usual batch of fixes, this integrates the subvolume
quota updates that Qu Wenruo from Fujitsu has been working on for a
few releases now. He gets an extra gold star for making btrfs smaller
this time, and for fixing a number of quota corner cases in the process.

Dave Sterba tested and integrated Anand Jain's sysfs improvements.
Apart from exporting a symbol (acked by Greg), these are all internal
to btrfs and are mostly cleanups and fixes. Anand also attached some
of our sysfs objects to our internal device management structs instead
of an object off the super block. It will make device management
easier overall and it's a better fit for how the sysfs files are used.
None of the existing sysfs files are moved around.

Thanks for all the fixes, everyone."

* 'for-linus-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (87 commits)
btrfs: delayed-ref: double free in btrfs_add_delayed_tree_ref()
Btrfs: Check if kobject is initialized before put
lib: export symbol kobject_move()
Btrfs: sysfs: add support to show replacing target in the sysfs
Btrfs: free the stale device
Btrfs: use received_uuid of parent during send
Btrfs: fix use-after-free in btrfs_replay_log
btrfs: wait for delayed iputs on no space
btrfs: qgroup: Make snapshot accounting work with new extent-oriented qgroup.
btrfs: qgroup: Add the ability to skip given qgroup for old/new_roots.
btrfs: ulist: Add ulist_del() function.
btrfs: qgroup: Cleanup the old ref_node-oriented mechanism.
btrfs: qgroup: Switch self test to extent-oriented qgroup mechanism.
btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.
btrfs: qgroup: Switch rescan to new mechanism.
btrfs: qgroup: Add new qgroup calculation function btrfs_qgroup_account_extents().
btrfs: backref: Add special time_seq == (u64)-1 case for btrfs_find_all_roots().
btrfs: qgroup: Add new function to record old_roots.
btrfs: qgroup: Record possible quota-related extent for qgroup.
btrfs: qgroup: Add function qgroup_update_counters().
...

+1725 -1739
+1
fs/btrfs/async-thread.c
··· 85 85 BTRFS_WORK_HELPER(scrub_helper); 86 86 BTRFS_WORK_HELPER(scrubwrc_helper); 87 87 BTRFS_WORK_HELPER(scrubnc_helper); 88 + BTRFS_WORK_HELPER(scrubparity_helper); 88 89 89 90 static struct __btrfs_workqueue * 90 91 __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
+2
fs/btrfs/async-thread.h
··· 64 64 BTRFS_WORK_HELPER_PROTO(scrub_helper); 65 65 BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); 66 66 BTRFS_WORK_HELPER_PROTO(scrubnc_helper); 67 + BTRFS_WORK_HELPER_PROTO(scrubparity_helper); 68 + 67 69 68 70 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, 69 71 unsigned int flags,
+41 -18
fs/btrfs/backref.c
··· 250 250 * the first item to check. But sometimes, we may enter it with 251 251 * slot==nritems. In that case, go to the next leaf before we continue. 252 252 */ 253 - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 254 - ret = btrfs_next_old_leaf(root, path, time_seq); 253 + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 254 + if (time_seq == (u64)-1) 255 + ret = btrfs_next_leaf(root, path); 256 + else 257 + ret = btrfs_next_old_leaf(root, path, time_seq); 258 + } 255 259 256 260 while (!ret && count < total_refs) { 257 261 eb = path->nodes[0]; ··· 295 291 eie = NULL; 296 292 } 297 293 next: 298 - ret = btrfs_next_old_item(root, path, time_seq); 294 + if (time_seq == (u64)-1) 295 + ret = btrfs_next_item(root, path); 296 + else 297 + ret = btrfs_next_old_item(root, path, time_seq); 299 298 } 300 299 301 300 if (ret > 0) ··· 341 334 342 335 if (path->search_commit_root) 343 336 root_level = btrfs_header_level(root->commit_root); 337 + else if (time_seq == (u64)-1) 338 + root_level = btrfs_header_level(root->node); 344 339 else 345 340 root_level = btrfs_old_root_level(root, time_seq); 346 341 ··· 352 343 } 353 344 354 345 path->lowest_level = level; 355 - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); 346 + if (time_seq == (u64)-1) 347 + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 348 + 0, 0); 349 + else 350 + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, 351 + time_seq); 356 352 357 353 /* root node has been locked, we can release @subvol_srcu safely here */ 358 354 srcu_read_unlock(&fs_info->subvol_srcu, index); ··· 505 491 BUG_ON(!ref->wanted_disk_byte); 506 492 eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 507 493 0); 508 - if (!eb || !extent_buffer_uptodate(eb)) { 494 + if (IS_ERR(eb)) { 495 + return PTR_ERR(eb); 496 + } else if (!extent_buffer_uptodate(eb)) { 509 497 free_extent_buffer(eb); 510 498 return -EIO; 511 499 } ··· 523 507 } 524 508 525 509 
/* 526 - * merge two lists of backrefs and adjust counts accordingly 510 + * merge backrefs and adjust counts accordingly 527 511 * 528 512 * mode = 1: merge identical keys, if key is set 529 513 * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. ··· 551 535 552 536 ref2 = list_entry(pos2, struct __prelim_ref, list); 553 537 538 + if (!ref_for_same_block(ref1, ref2)) 539 + continue; 554 540 if (mode == 1) { 555 - if (!ref_for_same_block(ref1, ref2)) 556 - continue; 557 541 if (!ref1->parent && ref2->parent) { 558 542 xchg = ref1; 559 543 ref1 = ref2; ··· 588 572 struct list_head *prefs, u64 *total_refs, 589 573 u64 inum) 590 574 { 575 + struct btrfs_delayed_ref_node *node; 591 576 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 592 - struct rb_node *n = &head->node.rb_node; 593 577 struct btrfs_key key; 594 578 struct btrfs_key op_key = {0}; 595 579 int sgn; ··· 599 583 btrfs_disk_key_to_cpu(&op_key, &extent_op->key); 600 584 601 585 spin_lock(&head->lock); 602 - n = rb_first(&head->ref_root); 603 - while (n) { 604 - struct btrfs_delayed_ref_node *node; 605 - node = rb_entry(n, struct btrfs_delayed_ref_node, 606 - rb_node); 607 - n = rb_next(n); 586 + list_for_each_entry(node, &head->ref_list, list) { 608 587 if (node->seq > seq) 609 588 continue; 610 589 ··· 893 882 * 894 883 * NOTE: This can return values > 0 895 884 * 885 + * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave 886 + * much like trans == NULL case, the difference only lies in it will not 887 + * commit root. 888 + * The special case is for qgroup to search roots in commit_transaction(). 889 + * 896 890 * FIXME some caching might speed things up 897 891 */ 898 892 static int find_parent_nodes(struct btrfs_trans_handle *trans, ··· 936 920 path->skip_locking = 1; 937 921 } 938 922 923 + if (time_seq == (u64)-1) 924 + path->skip_locking = 1; 925 + 939 926 /* 940 927 * grab both a lock on the path and a lock on the delayed ref head. 
941 928 * We need both to get a consistent picture of how the refs look ··· 953 934 BUG_ON(ret == 0); 954 935 955 936 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 956 - if (trans && likely(trans->type != __TRANS_DUMMY)) { 937 + if (trans && likely(trans->type != __TRANS_DUMMY) && 938 + time_seq != (u64)-1) { 957 939 #else 958 - if (trans) { 940 + if (trans && time_seq != (u64)-1) { 959 941 #endif 960 942 /* 961 943 * look if there are updates for this ref queued and lock the ··· 1054 1034 1055 1035 eb = read_tree_block(fs_info->extent_root, 1056 1036 ref->parent, 0); 1057 - if (!eb || !extent_buffer_uptodate(eb)) { 1037 + if (IS_ERR(eb)) { 1038 + ret = PTR_ERR(eb); 1039 + goto out; 1040 + } else if (!extent_buffer_uptodate(eb)) { 1058 1041 free_extent_buffer(eb); 1059 1042 ret = -EIO; 1060 1043 goto out;
+10 -6
fs/btrfs/ctree.c
··· 1439 1439 btrfs_tree_read_unlock(eb_root); 1440 1440 free_extent_buffer(eb_root); 1441 1441 old = read_tree_block(root, logical, 0); 1442 - if (WARN_ON(!old || !extent_buffer_uptodate(old))) { 1443 - free_extent_buffer(old); 1442 + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { 1443 + if (!IS_ERR(old)) 1444 + free_extent_buffer(old); 1444 1445 btrfs_warn(root->fs_info, 1445 1446 "failed to read tree block %llu from get_old_root", logical); 1446 1447 } else { ··· 1686 1685 if (!cur || !uptodate) { 1687 1686 if (!cur) { 1688 1687 cur = read_tree_block(root, blocknr, gen); 1689 - if (!cur || !extent_buffer_uptodate(cur)) { 1688 + if (IS_ERR(cur)) { 1689 + return PTR_ERR(cur); 1690 + } else if (!extent_buffer_uptodate(cur)) { 1690 1691 free_extent_buffer(cur); 1691 1692 return -EIO; 1692 1693 } ··· 1867 1864 1868 1865 eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), 1869 1866 btrfs_node_ptr_generation(parent, slot)); 1870 - if (eb && !extent_buffer_uptodate(eb)) { 1871 - free_extent_buffer(eb); 1867 + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { 1868 + if (!IS_ERR(eb)) 1869 + free_extent_buffer(eb); 1872 1870 eb = NULL; 1873 1871 } 1874 1872 ··· 2498 2494 2499 2495 ret = -EAGAIN; 2500 2496 tmp = read_tree_block(root, blocknr, 0); 2501 - if (tmp) { 2497 + if (!IS_ERR(tmp)) { 2502 2498 /* 2503 2499 * If the read above didn't mark this buffer up to date, 2504 2500 * it will never end up being up to date. Set ret to EIO now
+20 -8
fs/btrfs/ctree.h
··· 174 174 /* csum types */ 175 175 #define BTRFS_CSUM_TYPE_CRC32 0 176 176 177 - static int btrfs_csum_sizes[] = { 4, 0 }; 177 + static int btrfs_csum_sizes[] = { 4 }; 178 178 179 179 /* four bytes for CRC32 */ 180 180 #define BTRFS_EMPTY_DIR_SIZE 0 ··· 1619 1619 struct task_struct *cleaner_kthread; 1620 1620 int thread_pool_size; 1621 1621 1622 - struct kobject super_kobj; 1623 1622 struct kobject *space_info_kobj; 1624 - struct kobject *device_dir_kobj; 1625 - struct completion kobj_unregister; 1626 1623 int do_barriers; 1627 1624 int closing; 1628 1625 int log_root_recovering; ··· 1695 1698 struct btrfs_workqueue *scrub_workers; 1696 1699 struct btrfs_workqueue *scrub_wr_completion_workers; 1697 1700 struct btrfs_workqueue *scrub_nocow_workers; 1701 + struct btrfs_workqueue *scrub_parity_workers; 1698 1702 1699 1703 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1700 1704 u32 check_integrity_print_mask; ··· 1733 1735 /* list of dirty qgroups to be written at next commit */ 1734 1736 struct list_head dirty_qgroups; 1735 1737 1736 - /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ 1738 + /* used by qgroup for an efficient tree traversal */ 1737 1739 u64 qgroup_seq; 1738 1740 1739 1741 /* qgroup rescan items */ ··· 3456 3458 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 3457 3459 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3458 3460 struct btrfs_root *root); 3461 + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); 3459 3462 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 3460 3463 struct inode *inode); 3461 3464 void btrfs_orphan_release_metadata(struct inode *inode); ··· 3514 3515 int __get_raid_index(u64 flags); 3515 3516 int btrfs_start_write_no_snapshoting(struct btrfs_root *root); 3516 3517 void btrfs_end_write_no_snapshoting(struct btrfs_root *root); 3518 + void check_system_chunk(struct btrfs_trans_handle *trans, 3519 + struct btrfs_root *root, 3520 + 
const u64 type); 3517 3521 /* ctree.c */ 3518 3522 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3519 3523 int level, int *slot); ··· 4052 4050 4053 4051 #ifdef CONFIG_BTRFS_ASSERT 4054 4052 4053 + __cold 4055 4054 static inline void assfail(char *expr, char *file, int line) 4056 4055 { 4057 4056 pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", ··· 4068 4065 4069 4066 #define btrfs_assert() 4070 4067 __printf(5, 6) 4068 + __cold 4071 4069 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, 4072 4070 unsigned int line, int errno, const char *fmt, ...); 4073 4071 4074 4072 4073 + __cold 4075 4074 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, 4076 4075 struct btrfs_root *root, const char *function, 4077 4076 unsigned int line, int errno); ··· 4116 4111 * Call btrfs_abort_transaction as early as possible when an error condition is 4117 4112 * detected, that way the exact line number is reported. 4118 4113 */ 4119 - 4120 4114 #define btrfs_abort_transaction(trans, root, errno) \ 4121 4115 do { \ 4122 - __btrfs_abort_transaction(trans, root, __func__, \ 4123 - __LINE__, errno); \ 4116 + /* Report first abort since mount */ \ 4117 + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ 4118 + &((root)->fs_info->fs_state))) { \ 4119 + WARN(1, KERN_DEBUG \ 4120 + "BTRFS: Transaction aborted (error %d)\n", \ 4121 + (errno)); \ 4122 + } \ 4123 + __btrfs_abort_transaction((trans), (root), __func__, \ 4124 + __LINE__, (errno)); \ 4124 4125 } while (0) 4125 4126 4126 4127 #define btrfs_std_error(fs_info, errno) \ ··· 4143 4132 } while (0) 4144 4133 4145 4134 __printf(5, 6) 4135 + __cold 4146 4136 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, 4147 4137 unsigned int line, int errno, const char *fmt, ...); 4148 4138
+122 -250
fs/btrfs/delayed-ref.c
··· 22 22 #include "ctree.h" 23 23 #include "delayed-ref.h" 24 24 #include "transaction.h" 25 + #include "qgroup.h" 25 26 26 27 struct kmem_cache *btrfs_delayed_ref_head_cachep; 27 28 struct kmem_cache *btrfs_delayed_tree_ref_cachep; ··· 83 82 return 1; 84 83 } 85 84 return 0; 86 - } 87 - 88 - /* 89 - * entries in the rb tree are ordered by the byte number of the extent, 90 - * type of the delayed backrefs and content of delayed backrefs. 91 - */ 92 - static int comp_entry(struct btrfs_delayed_ref_node *ref2, 93 - struct btrfs_delayed_ref_node *ref1, 94 - bool compare_seq) 95 - { 96 - if (ref1->bytenr < ref2->bytenr) 97 - return -1; 98 - if (ref1->bytenr > ref2->bytenr) 99 - return 1; 100 - if (ref1->is_head && ref2->is_head) 101 - return 0; 102 - if (ref2->is_head) 103 - return -1; 104 - if (ref1->is_head) 105 - return 1; 106 - if (ref1->type < ref2->type) 107 - return -1; 108 - if (ref1->type > ref2->type) 109 - return 1; 110 - if (ref1->no_quota > ref2->no_quota) 111 - return 1; 112 - if (ref1->no_quota < ref2->no_quota) 113 - return -1; 114 - /* merging of sequenced refs is not allowed */ 115 - if (compare_seq) { 116 - if (ref1->seq < ref2->seq) 117 - return -1; 118 - if (ref1->seq > ref2->seq) 119 - return 1; 120 - } 121 - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 122 - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 123 - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 124 - btrfs_delayed_node_to_tree_ref(ref1), 125 - ref1->type); 126 - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || 127 - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { 128 - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), 129 - btrfs_delayed_node_to_data_ref(ref1)); 130 - } 131 - BUG(); 132 - return 0; 133 - } 134 - 135 - /* 136 - * insert a new ref into the rbtree. This returns any existing refs 137 - * for the same (bytenr,parent) tuple, or NULL if the new node was properly 138 - * inserted. 
139 - */ 140 - static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, 141 - struct rb_node *node) 142 - { 143 - struct rb_node **p = &root->rb_node; 144 - struct rb_node *parent_node = NULL; 145 - struct btrfs_delayed_ref_node *entry; 146 - struct btrfs_delayed_ref_node *ins; 147 - int cmp; 148 - 149 - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 150 - while (*p) { 151 - parent_node = *p; 152 - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, 153 - rb_node); 154 - 155 - cmp = comp_entry(entry, ins, 1); 156 - if (cmp < 0) 157 - p = &(*p)->rb_left; 158 - else if (cmp > 0) 159 - p = &(*p)->rb_right; 160 - else 161 - return entry; 162 - } 163 - 164 - rb_link_node(node, parent_node, p); 165 - rb_insert_color(node, root); 166 - return NULL; 167 85 } 168 86 169 87 /* insert a new ref to head ref rbtree */ ··· 188 268 rb_erase(&head->href_node, &delayed_refs->href_root); 189 269 } else { 190 270 assert_spin_locked(&head->lock); 191 - rb_erase(&ref->rb_node, &head->ref_root); 271 + list_del(&ref->list); 192 272 } 193 273 ref->in_tree = 0; 194 274 btrfs_put_delayed_ref(ref); 195 275 atomic_dec(&delayed_refs->num_entries); 196 276 if (trans->delayed_ref_updates) 197 277 trans->delayed_ref_updates--; 198 - } 199 - 200 - static int merge_ref(struct btrfs_trans_handle *trans, 201 - struct btrfs_delayed_ref_root *delayed_refs, 202 - struct btrfs_delayed_ref_head *head, 203 - struct btrfs_delayed_ref_node *ref, u64 seq) 204 - { 205 - struct rb_node *node; 206 - int mod = 0; 207 - int done = 0; 208 - 209 - node = rb_next(&ref->rb_node); 210 - while (!done && node) { 211 - struct btrfs_delayed_ref_node *next; 212 - 213 - next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 214 - node = rb_next(node); 215 - if (seq && next->seq >= seq) 216 - break; 217 - if (comp_entry(ref, next, 0)) 218 - continue; 219 - 220 - if (ref->action == next->action) { 221 - mod = next->ref_mod; 222 - } else { 223 - if (ref->ref_mod < next->ref_mod) { 
224 - struct btrfs_delayed_ref_node *tmp; 225 - 226 - tmp = ref; 227 - ref = next; 228 - next = tmp; 229 - done = 1; 230 - } 231 - mod = -next->ref_mod; 232 - } 233 - 234 - drop_delayed_ref(trans, delayed_refs, head, next); 235 - ref->ref_mod += mod; 236 - if (ref->ref_mod == 0) { 237 - drop_delayed_ref(trans, delayed_refs, head, ref); 238 - done = 1; 239 - } else { 240 - /* 241 - * You can't have multiples of the same ref on a tree 242 - * block. 243 - */ 244 - WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 245 - ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 246 - } 247 - } 248 - return done; 249 - } 250 - 251 - void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, 252 - struct btrfs_fs_info *fs_info, 253 - struct btrfs_delayed_ref_root *delayed_refs, 254 - struct btrfs_delayed_ref_head *head) 255 - { 256 - struct rb_node *node; 257 - u64 seq = 0; 258 - 259 - assert_spin_locked(&head->lock); 260 - /* 261 - * We don't have too much refs to merge in the case of delayed data 262 - * refs. 
263 - */ 264 - if (head->is_data) 265 - return; 266 - 267 - spin_lock(&fs_info->tree_mod_seq_lock); 268 - if (!list_empty(&fs_info->tree_mod_seq_list)) { 269 - struct seq_list *elem; 270 - 271 - elem = list_first_entry(&fs_info->tree_mod_seq_list, 272 - struct seq_list, list); 273 - seq = elem->seq; 274 - } 275 - spin_unlock(&fs_info->tree_mod_seq_lock); 276 - 277 - node = rb_first(&head->ref_root); 278 - while (node) { 279 - struct btrfs_delayed_ref_node *ref; 280 - 281 - ref = rb_entry(node, struct btrfs_delayed_ref_node, 282 - rb_node); 283 - /* We can't merge refs that are outside of our seq count */ 284 - if (seq && ref->seq >= seq) 285 - break; 286 - if (merge_ref(trans, delayed_refs, head, ref, seq)) 287 - node = rb_first(&head->ref_root); 288 - else 289 - node = rb_next(&ref->rb_node); 290 - } 291 278 } 292 279 293 280 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, ··· 270 443 } 271 444 272 445 /* 273 - * helper function to update an extent delayed ref in the 274 - * rbtree. existing and update must both have the same 275 - * bytenr and parent 446 + * Helper to insert the ref_node to the tail or merge with tail. 276 447 * 277 - * This may free existing if the update cancels out whatever 278 - * operation it was doing. 448 + * Return 0 for insert. 449 + * Return >0 for merge. 279 450 */ 280 - static noinline void 281 - update_existing_ref(struct btrfs_trans_handle *trans, 282 - struct btrfs_delayed_ref_root *delayed_refs, 283 - struct btrfs_delayed_ref_head *head, 284 - struct btrfs_delayed_ref_node *existing, 285 - struct btrfs_delayed_ref_node *update) 451 + static int 452 + add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, 453 + struct btrfs_delayed_ref_root *root, 454 + struct btrfs_delayed_ref_head *href, 455 + struct btrfs_delayed_ref_node *ref) 286 456 { 287 - if (update->action != existing->action) { 288 - /* 289 - * this is effectively undoing either an add or a 290 - * drop. 
We decrement the ref_mod, and if it goes 291 - * down to zero we just delete the entry without 292 - * every changing the extent allocation tree. 293 - */ 294 - existing->ref_mod--; 295 - if (existing->ref_mod == 0) 296 - drop_delayed_ref(trans, delayed_refs, head, existing); 297 - else 298 - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 299 - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 457 + struct btrfs_delayed_ref_node *exist; 458 + int mod; 459 + int ret = 0; 460 + 461 + spin_lock(&href->lock); 462 + /* Check whether we can merge the tail node with ref */ 463 + if (list_empty(&href->ref_list)) 464 + goto add_tail; 465 + exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, 466 + list); 467 + /* No need to compare bytenr nor is_head */ 468 + if (exist->type != ref->type || exist->no_quota != ref->no_quota || 469 + exist->seq != ref->seq) 470 + goto add_tail; 471 + 472 + if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || 473 + exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && 474 + comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), 475 + btrfs_delayed_node_to_tree_ref(ref), 476 + ref->type)) 477 + goto add_tail; 478 + if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || 479 + exist->type == BTRFS_SHARED_DATA_REF_KEY) && 480 + comp_data_refs(btrfs_delayed_node_to_data_ref(exist), 481 + btrfs_delayed_node_to_data_ref(ref))) 482 + goto add_tail; 483 + 484 + /* Now we are sure we can merge */ 485 + ret = 1; 486 + if (exist->action == ref->action) { 487 + mod = ref->ref_mod; 300 488 } else { 301 - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 302 - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 303 - /* 304 - * the action on the existing ref matches 305 - * the action on the ref we're trying to add. 
306 - * Bump the ref_mod by one so the backref that 307 - * is eventually added/removed has the correct 308 - * reference count 309 - */ 310 - existing->ref_mod += update->ref_mod; 489 + /* Need to change action */ 490 + if (exist->ref_mod < ref->ref_mod) { 491 + exist->action = ref->action; 492 + mod = -exist->ref_mod; 493 + exist->ref_mod = ref->ref_mod; 494 + } else 495 + mod = -ref->ref_mod; 311 496 } 497 + exist->ref_mod += mod; 498 + 499 + /* remove existing tail if its ref_mod is zero */ 500 + if (exist->ref_mod == 0) 501 + drop_delayed_ref(trans, root, href, exist); 502 + spin_unlock(&href->lock); 503 + return ret; 504 + 505 + add_tail: 506 + list_add_tail(&ref->list, &href->ref_list); 507 + atomic_inc(&root->num_entries); 508 + trans->delayed_ref_updates++; 509 + spin_unlock(&href->lock); 510 + return ret; 312 511 } 313 512 314 513 /* ··· 421 568 static noinline struct btrfs_delayed_ref_head * 422 569 add_delayed_ref_head(struct btrfs_fs_info *fs_info, 423 570 struct btrfs_trans_handle *trans, 424 - struct btrfs_delayed_ref_node *ref, u64 bytenr, 425 - u64 num_bytes, int action, int is_data) 571 + struct btrfs_delayed_ref_node *ref, 572 + struct btrfs_qgroup_extent_record *qrecord, 573 + u64 bytenr, u64 num_bytes, int action, int is_data) 426 574 { 427 575 struct btrfs_delayed_ref_head *existing; 428 576 struct btrfs_delayed_ref_head *head_ref = NULL; 429 577 struct btrfs_delayed_ref_root *delayed_refs; 578 + struct btrfs_qgroup_extent_record *qexisting; 430 579 int count_mod = 1; 431 580 int must_insert_reserved = 0; 432 581 ··· 473 618 head_ref = btrfs_delayed_node_to_head(ref); 474 619 head_ref->must_insert_reserved = must_insert_reserved; 475 620 head_ref->is_data = is_data; 476 - head_ref->ref_root = RB_ROOT; 621 + INIT_LIST_HEAD(&head_ref->ref_list); 477 622 head_ref->processing = 0; 478 623 head_ref->total_ref_mod = count_mod; 624 + 625 + /* Record qgroup extent info if provided */ 626 + if (qrecord) { 627 + qrecord->bytenr = bytenr; 628 + 
qrecord->num_bytes = num_bytes; 629 + qrecord->old_roots = NULL; 630 + 631 + qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, 632 + qrecord); 633 + if (qexisting) 634 + kfree(qrecord); 635 + } 479 636 480 637 spin_lock_init(&head_ref->lock); 481 638 mutex_init(&head_ref->mutex); ··· 526 659 u64 num_bytes, u64 parent, u64 ref_root, int level, 527 660 int action, int no_quota) 528 661 { 529 - struct btrfs_delayed_ref_node *existing; 530 662 struct btrfs_delayed_tree_ref *full_ref; 531 663 struct btrfs_delayed_ref_root *delayed_refs; 532 664 u64 seq = 0; 665 + int ret; 533 666 534 667 if (action == BTRFS_ADD_DELAYED_EXTENT) 535 668 action = BTRFS_ADD_DELAYED_REF; ··· 560 693 561 694 trace_add_delayed_tree_ref(ref, full_ref, action); 562 695 563 - spin_lock(&head_ref->lock); 564 - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); 565 - if (existing) { 566 - update_existing_ref(trans, delayed_refs, head_ref, existing, 567 - ref); 568 - /* 569 - * we've updated the existing ref, free the newly 570 - * allocated ref 571 - */ 696 + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); 697 + 698 + /* 699 + * XXX: memory should be freed at the same level allocated. 700 + * But bad practice is anywhere... Follow it now. Need cleanup. 
701 + */ 702 + if (ret > 0) 572 703 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); 573 - } else { 574 - atomic_inc(&delayed_refs->num_entries); 575 - trans->delayed_ref_updates++; 576 - } 577 - spin_unlock(&head_ref->lock); 578 704 } 579 705 580 706 /* ··· 581 721 u64 num_bytes, u64 parent, u64 ref_root, u64 owner, 582 722 u64 offset, int action, int no_quota) 583 723 { 584 - struct btrfs_delayed_ref_node *existing; 585 724 struct btrfs_delayed_data_ref *full_ref; 586 725 struct btrfs_delayed_ref_root *delayed_refs; 587 726 u64 seq = 0; 727 + int ret; 588 728 589 729 if (action == BTRFS_ADD_DELAYED_EXTENT) 590 730 action = BTRFS_ADD_DELAYED_REF; ··· 618 758 619 759 trace_add_delayed_data_ref(ref, full_ref, action); 620 760 621 - spin_lock(&head_ref->lock); 622 - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); 623 - if (existing) { 624 - update_existing_ref(trans, delayed_refs, head_ref, existing, 625 - ref); 626 - /* 627 - * we've updated the existing ref, free the newly 628 - * allocated ref 629 - */ 761 + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); 762 + 763 + if (ret > 0) 630 764 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); 631 - } else { 632 - atomic_inc(&delayed_refs->num_entries); 633 - trans->delayed_ref_updates++; 634 - } 635 - spin_unlock(&head_ref->lock); 636 765 } 637 766 638 767 /* ··· 639 790 struct btrfs_delayed_tree_ref *ref; 640 791 struct btrfs_delayed_ref_head *head_ref; 641 792 struct btrfs_delayed_ref_root *delayed_refs; 793 + struct btrfs_qgroup_extent_record *record = NULL; 642 794 643 795 if (!is_fstree(ref_root) || !fs_info->quota_enabled) 644 796 no_quota = 0; ··· 650 800 return -ENOMEM; 651 801 652 802 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); 653 - if (!head_ref) { 654 - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); 655 - return -ENOMEM; 803 + if (!head_ref) 804 + goto free_ref; 805 + 806 + if (fs_info->quota_enabled && is_fstree(ref_root)) { 
807 + record = kmalloc(sizeof(*record), GFP_NOFS); 808 + if (!record) 809 + goto free_head_ref; 656 810 } 657 811 658 812 head_ref->extent_op = extent_op; ··· 668 814 * insert both the head node and the new ref without dropping 669 815 * the spin lock 670 816 */ 671 - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, 817 + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, 672 818 bytenr, num_bytes, action, 0); 673 819 674 820 add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, ··· 677 823 spin_unlock(&delayed_refs->lock); 678 824 679 825 return 0; 826 + 827 + free_head_ref: 828 + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 829 + free_ref: 830 + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); 831 + 832 + return -ENOMEM; 680 833 } 681 834 682 835 /* ··· 700 839 struct btrfs_delayed_data_ref *ref; 701 840 struct btrfs_delayed_ref_head *head_ref; 702 841 struct btrfs_delayed_ref_root *delayed_refs; 842 + struct btrfs_qgroup_extent_record *record = NULL; 703 843 704 844 if (!is_fstree(ref_root) || !fs_info->quota_enabled) 705 845 no_quota = 0; ··· 716 854 return -ENOMEM; 717 855 } 718 856 857 + if (fs_info->quota_enabled && is_fstree(ref_root)) { 858 + record = kmalloc(sizeof(*record), GFP_NOFS); 859 + if (!record) { 860 + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); 861 + kmem_cache_free(btrfs_delayed_ref_head_cachep, 862 + head_ref); 863 + return -ENOMEM; 864 + } 865 + } 866 + 719 867 head_ref->extent_op = extent_op; 720 868 721 869 delayed_refs = &trans->transaction->delayed_refs; ··· 735 863 * insert both the head node and the new ref without dropping 736 864 * the spin lock 737 865 */ 738 - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, 866 + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, 739 867 bytenr, num_bytes, action, 1); 740 868 741 869 add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, ··· 763 891 delayed_refs = 
&trans->transaction->delayed_refs; 764 892 spin_lock(&delayed_refs->lock); 765 893 766 - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, 767 - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 768 - extent_op->is_data); 894 + add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, 895 + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 896 + extent_op->is_data); 769 897 770 898 spin_unlock(&delayed_refs->lock); 771 899 return 0;
+28 -1
fs/btrfs/delayed-ref.h
··· 24 24 #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ 25 25 #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ 26 26 27 + /* 28 + * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the 29 + * same ref_node structure. 30 + * Ref_head is in a higher logic level than tree/data ref, and duplicated 31 + * bytenr/num_bytes in ref_node is really a waste or memory, they should be 32 + * referred from ref_head. 33 + * This gets more disgusting after we use list to store tree/data ref in 34 + * ref_head. Must clean this mess up later. 35 + */ 27 36 struct btrfs_delayed_ref_node { 37 + /* 38 + * ref_head use rb tree, stored in ref_root->href. 39 + * indexed by bytenr 40 + */ 28 41 struct rb_node rb_node; 42 + 43 + /*data/tree ref use list, stored in ref_head->ref_list. */ 44 + struct list_head list; 29 45 30 46 /* the starting bytenr of the extent */ 31 47 u64 bytenr; ··· 99 83 struct mutex mutex; 100 84 101 85 spinlock_t lock; 102 - struct rb_root ref_root; 86 + struct list_head ref_list; 103 87 104 88 struct rb_node href_node; 105 89 ··· 148 132 /* head ref rbtree */ 149 133 struct rb_root href_root; 150 134 135 + /* dirty extent records */ 136 + struct rb_root dirty_extent_root; 137 + 151 138 /* this spin lock protects the rbtree and the entries inside */ 152 139 spinlock_t lock; 153 140 ··· 175 156 int flushing; 176 157 177 158 u64 run_delayed_start; 159 + 160 + /* 161 + * To make qgroup to skip given root. 162 + * This is for snapshot, as btrfs_qgroup_inherit() will manully 163 + * modify counters for snapshot and its source, so we should skip 164 + * the snapshot in new_root/old_roots or it will get calculated twice 165 + */ 166 + u64 qgroup_to_skip; 178 167 }; 179 168 180 169 extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+5 -2
fs/btrfs/dev-replace.c
··· 376 376 WARN_ON(!tgt_device); 377 377 dev_replace->tgtdev = tgt_device; 378 378 379 + ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); 380 + if (ret) 381 + btrfs_error(root->fs_info, ret, "kobj add dev failed"); 382 + 379 383 printk_in_rcu(KERN_INFO 380 384 "BTRFS: dev_replace from %s (devid %llu) to %s started\n", 381 385 src_device->missing ? "<missing disk>" : ··· 587 583 mutex_unlock(&uuid_mutex); 588 584 589 585 /* replace the sysfs entry */ 590 - btrfs_kobj_rm_device(fs_info, src_device); 591 - btrfs_kobj_add_device(fs_info, tgt_device); 586 + btrfs_kobj_rm_device(fs_info->fs_devices, src_device); 592 587 btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); 593 588 594 589 /* write back the superblocks */
+37 -19
fs/btrfs/disk-io.c
··· 1149 1149 1150 1150 buf = btrfs_find_create_tree_block(root, bytenr); 1151 1151 if (!buf) 1152 - return NULL; 1152 + return ERR_PTR(-ENOMEM); 1153 1153 1154 1154 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 1155 1155 if (ret) { 1156 1156 free_extent_buffer(buf); 1157 - return NULL; 1157 + return ERR_PTR(ret); 1158 1158 } 1159 1159 return buf; 1160 1160 ··· 1509 1509 generation = btrfs_root_generation(&root->root_item); 1510 1510 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1511 1511 generation); 1512 - if (!root->node) { 1513 - ret = -ENOMEM; 1512 + if (IS_ERR(root->node)) { 1513 + ret = PTR_ERR(root->node); 1514 1514 goto find_fail; 1515 1515 } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { 1516 1516 ret = -EIO; 1517 - goto read_fail; 1517 + free_extent_buffer(root->node); 1518 + goto find_fail; 1518 1519 } 1519 1520 root->commit_root = btrfs_root_node(root); 1520 1521 out: 1521 1522 btrfs_free_path(path); 1522 1523 return root; 1523 1524 1524 - read_fail: 1525 - free_extent_buffer(root->node); 1526 1525 find_fail: 1527 1526 kfree(root); 1528 1527 alloc_fail: ··· 2319 2320 2320 2321 log_tree_root->node = read_tree_block(tree_root, bytenr, 2321 2322 fs_info->generation + 1); 2322 - if (!log_tree_root->node || 2323 - !extent_buffer_uptodate(log_tree_root->node)) { 2323 + if (IS_ERR(log_tree_root->node)) { 2324 + printk(KERN_ERR "BTRFS: failed to read log tree\n"); 2325 + ret = PTR_ERR(log_tree_root->node); 2326 + kfree(log_tree_root); 2327 + return ret; 2328 + } else if (!extent_buffer_uptodate(log_tree_root->node)) { 2324 2329 printk(KERN_ERR "BTRFS: failed to read log tree\n"); 2325 2330 free_extent_buffer(log_tree_root->node); 2326 2331 kfree(log_tree_root); ··· 2497 2494 seqlock_init(&fs_info->profiles_lock); 2498 2495 init_rwsem(&fs_info->delayed_iput_sem); 2499 2496 2500 - init_completion(&fs_info->kobj_unregister); 2501 2497 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2502 2498 
INIT_LIST_HEAD(&fs_info->space_info); 2503 2499 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); ··· 2799 2797 chunk_root->node = read_tree_block(chunk_root, 2800 2798 btrfs_super_chunk_root(disk_super), 2801 2799 generation); 2802 - if (!chunk_root->node || 2803 - !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2800 + if (IS_ERR(chunk_root->node) || 2801 + !extent_buffer_uptodate(chunk_root->node)) { 2804 2802 printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", 2805 2803 sb->s_id); 2806 2804 goto fail_tree_roots; ··· 2836 2834 tree_root->node = read_tree_block(tree_root, 2837 2835 btrfs_super_root(disk_super), 2838 2836 generation); 2839 - if (!tree_root->node || 2840 - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { 2837 + if (IS_ERR(tree_root->node) || 2838 + !extent_buffer_uptodate(tree_root->node)) { 2841 2839 printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", 2842 2840 sb->s_id); 2843 2841 ··· 2876 2874 2877 2875 btrfs_close_extra_devices(fs_devices, 1); 2878 2876 2877 + ret = btrfs_sysfs_add_fsid(fs_devices, NULL); 2878 + if (ret) { 2879 + pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); 2880 + goto fail_block_groups; 2881 + } 2882 + 2883 + ret = btrfs_sysfs_add_device(fs_devices); 2884 + if (ret) { 2885 + pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); 2886 + goto fail_fsdev_sysfs; 2887 + } 2888 + 2879 2889 ret = btrfs_sysfs_add_one(fs_info); 2880 2890 if (ret) { 2881 2891 pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); 2882 - goto fail_block_groups; 2892 + goto fail_fsdev_sysfs; 2883 2893 } 2884 2894 2885 2895 ret = btrfs_init_space_info(fs_info); ··· 3068 3054 3069 3055 fail_sysfs: 3070 3056 btrfs_sysfs_remove_one(fs_info); 3057 + 3058 + fail_fsdev_sysfs: 3059 + btrfs_sysfs_remove_fsid(fs_info->fs_devices); 3071 3060 3072 3061 fail_block_groups: 3073 3062 btrfs_put_block_group_cache(fs_info); ··· 3742 3725 } 3743 3726 3744 3727 btrfs_sysfs_remove_one(fs_info); 
3728 + btrfs_sysfs_remove_fsid(fs_info->fs_devices); 3745 3729 3746 3730 btrfs_free_fs_roots(fs_info); 3747 3731 ··· 4071 4053 4072 4054 while ((node = rb_first(&delayed_refs->href_root)) != NULL) { 4073 4055 struct btrfs_delayed_ref_head *head; 4056 + struct btrfs_delayed_ref_node *tmp; 4074 4057 bool pin_bytes = false; 4075 4058 4076 4059 head = rb_entry(node, struct btrfs_delayed_ref_head, ··· 4087 4068 continue; 4088 4069 } 4089 4070 spin_lock(&head->lock); 4090 - while ((node = rb_first(&head->ref_root)) != NULL) { 4091 - ref = rb_entry(node, struct btrfs_delayed_ref_node, 4092 - rb_node); 4071 + list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, 4072 + list) { 4093 4073 ref->in_tree = 0; 4094 - rb_erase(&ref->rb_node, &head->ref_root); 4074 + list_del(&ref->list); 4095 4075 atomic_dec(&delayed_refs->num_entries); 4096 4076 btrfs_put_delayed_ref(ref); 4097 4077 }
+133 -175
fs/btrfs/extent-tree.c
··· 79 79 u64 num_bytes, int alloc); 80 80 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 81 81 struct btrfs_root *root, 82 - u64 bytenr, u64 num_bytes, u64 parent, 82 + struct btrfs_delayed_ref_node *node, u64 parent, 83 83 u64 root_objectid, u64 owner_objectid, 84 84 u64 owner_offset, int refs_to_drop, 85 - struct btrfs_delayed_extent_op *extra_op, 86 - int no_quota); 85 + struct btrfs_delayed_extent_op *extra_op); 87 86 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 88 87 struct extent_buffer *leaf, 89 88 struct btrfs_extent_item *ei); ··· 1966 1967 1967 1968 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1968 1969 struct btrfs_root *root, 1969 - u64 bytenr, u64 num_bytes, 1970 + struct btrfs_delayed_ref_node *node, 1970 1971 u64 parent, u64 root_objectid, 1971 1972 u64 owner, u64 offset, int refs_to_add, 1972 - int no_quota, 1973 1973 struct btrfs_delayed_extent_op *extent_op) 1974 1974 { 1975 1975 struct btrfs_fs_info *fs_info = root->fs_info; ··· 1976 1978 struct extent_buffer *leaf; 1977 1979 struct btrfs_extent_item *item; 1978 1980 struct btrfs_key key; 1981 + u64 bytenr = node->bytenr; 1982 + u64 num_bytes = node->num_bytes; 1979 1983 u64 refs; 1980 1984 int ret; 1981 - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; 1985 + int no_quota = node->no_quota; 1982 1986 1983 1987 path = btrfs_alloc_path(); 1984 1988 if (!path) ··· 1996 1996 bytenr, num_bytes, parent, 1997 1997 root_objectid, owner, offset, 1998 1998 refs_to_add, extent_op); 1999 - if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) 1999 + if ((ret < 0 && ret != -EAGAIN) || !ret) 2000 2000 goto out; 2001 - /* 2002 - * Ok we were able to insert an inline extent and it appears to be a new 2003 - * reference, deal with the qgroup accounting. 
2004 - */ 2005 - if (!ret && !no_quota) { 2006 - ASSERT(root->fs_info->quota_enabled); 2007 - leaf = path->nodes[0]; 2008 - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2009 - item = btrfs_item_ptr(leaf, path->slots[0], 2010 - struct btrfs_extent_item); 2011 - if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) 2012 - type = BTRFS_QGROUP_OPER_ADD_SHARED; 2013 - btrfs_release_path(path); 2014 - 2015 - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2016 - bytenr, num_bytes, type, 0); 2017 - goto out; 2018 - } 2019 2001 2020 2002 /* 2021 2003 * Ok we had -EAGAIN which means we didn't have space to insert and ··· 2008 2026 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2009 2027 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2010 2028 refs = btrfs_extent_refs(leaf, item); 2011 - if (refs) 2012 - type = BTRFS_QGROUP_OPER_ADD_SHARED; 2013 2029 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2014 2030 if (extent_op) 2015 2031 __run_delayed_extent_op(extent_op, leaf, item); 2016 2032 2017 2033 btrfs_mark_buffer_dirty(leaf); 2018 2034 btrfs_release_path(path); 2019 - 2020 - if (!no_quota) { 2021 - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2022 - bytenr, num_bytes, type, 0); 2023 - if (ret) 2024 - goto out; 2025 - } 2026 2035 2027 2036 path->reada = 1; 2028 2037 path->leave_spinning = 1; ··· 2060 2087 ref->objectid, ref->offset, 2061 2088 &ins, node->ref_mod); 2062 2089 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2063 - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2064 - node->num_bytes, parent, 2090 + ret = __btrfs_inc_extent_ref(trans, root, node, parent, 2065 2091 ref_root, ref->objectid, 2066 2092 ref->offset, node->ref_mod, 2067 - node->no_quota, extent_op); 2093 + extent_op); 2068 2094 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2069 - ret = __btrfs_free_extent(trans, root, node->bytenr, 2070 - node->num_bytes, parent, 2095 + ret = __btrfs_free_extent(trans, root, 
node, parent, 2071 2096 ref_root, ref->objectid, 2072 2097 ref->offset, node->ref_mod, 2073 - extent_op, node->no_quota); 2098 + extent_op); 2074 2099 } else { 2075 2100 BUG(); 2076 2101 } ··· 2226 2255 ref->level, &ins, 2227 2256 node->no_quota); 2228 2257 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2229 - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2230 - node->num_bytes, parent, ref_root, 2231 - ref->level, 0, 1, node->no_quota, 2258 + ret = __btrfs_inc_extent_ref(trans, root, node, 2259 + parent, ref_root, 2260 + ref->level, 0, 1, 2232 2261 extent_op); 2233 2262 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2234 - ret = __btrfs_free_extent(trans, root, node->bytenr, 2235 - node->num_bytes, parent, ref_root, 2236 - ref->level, 0, 1, extent_op, 2237 - node->no_quota); 2263 + ret = __btrfs_free_extent(trans, root, node, 2264 + parent, ref_root, 2265 + ref->level, 0, 1, extent_op); 2238 2266 } else { 2239 2267 BUG(); 2240 2268 } ··· 2293 2323 return ret; 2294 2324 } 2295 2325 2296 - static noinline struct btrfs_delayed_ref_node * 2326 + static inline struct btrfs_delayed_ref_node * 2297 2327 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2298 2328 { 2299 - struct rb_node *node; 2300 - struct btrfs_delayed_ref_node *ref, *last = NULL;; 2329 + if (list_empty(&head->ref_list)) 2330 + return NULL; 2301 2331 2302 - /* 2303 - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2304 - * this prevents ref count from going down to zero when 2305 - * there still are pending delayed ref. 
2306 - */ 2307 - node = rb_first(&head->ref_root); 2308 - while (node) { 2309 - ref = rb_entry(node, struct btrfs_delayed_ref_node, 2310 - rb_node); 2311 - if (ref->action == BTRFS_ADD_DELAYED_REF) 2312 - return ref; 2313 - else if (last == NULL) 2314 - last = ref; 2315 - node = rb_next(node); 2316 - } 2317 - return last; 2332 + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, 2333 + list); 2318 2334 } 2319 2335 2320 2336 /* ··· 2352 2396 } 2353 2397 } 2354 2398 2355 - /* 2356 - * We need to try and merge add/drops of the same ref since we 2357 - * can run into issues with relocate dropping the implicit ref 2358 - * and then it being added back again before the drop can 2359 - * finish. If we merged anything we need to re-loop so we can 2360 - * get a good ref. 2361 - */ 2362 2399 spin_lock(&locked_ref->lock); 2363 - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2364 - locked_ref); 2365 2400 2366 2401 /* 2367 2402 * locked_ref is the head node, so we have to go one ··· 2429 2482 spin_unlock(&locked_ref->lock); 2430 2483 spin_lock(&delayed_refs->lock); 2431 2484 spin_lock(&locked_ref->lock); 2432 - if (rb_first(&locked_ref->ref_root) || 2485 + if (!list_empty(&locked_ref->ref_list) || 2433 2486 locked_ref->extent_op) { 2434 2487 spin_unlock(&locked_ref->lock); 2435 2488 spin_unlock(&delayed_refs->lock); ··· 2443 2496 } else { 2444 2497 actual_count++; 2445 2498 ref->in_tree = 0; 2446 - rb_erase(&ref->rb_node, &locked_ref->ref_root); 2499 + list_del(&ref->list); 2447 2500 } 2448 2501 atomic_dec(&delayed_refs->num_entries); 2449 2502 ··· 2811 2864 goto again; 2812 2865 } 2813 2866 out: 2814 - ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); 2815 - if (ret) 2816 - return ret; 2817 2867 assert_qgroups_uptodate(trans); 2818 2868 return 0; 2819 2869 } ··· 2849 2905 struct btrfs_delayed_ref_node *ref; 2850 2906 struct btrfs_delayed_data_ref *data_ref; 2851 2907 struct btrfs_delayed_ref_root *delayed_refs; 2852 - struct rb_node 
*node; 2853 2908 int ret = 0; 2854 2909 2855 2910 delayed_refs = &trans->transaction->delayed_refs; ··· 2877 2934 spin_unlock(&delayed_refs->lock); 2878 2935 2879 2936 spin_lock(&head->lock); 2880 - node = rb_first(&head->ref_root); 2881 - while (node) { 2882 - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2883 - node = rb_next(node); 2884 - 2937 + list_for_each_entry(ref, &head->ref_list, list) { 2885 2938 /* If it's a shared ref we know a cross reference exists */ 2886 2939 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2887 2940 ret = 1; ··· 3632 3693 found->disk_total += total_bytes * factor; 3633 3694 found->bytes_used += bytes_used; 3634 3695 found->disk_used += bytes_used * factor; 3635 - found->full = 0; 3696 + if (total_bytes > 0) 3697 + found->full = 0; 3636 3698 spin_unlock(&found->lock); 3637 3699 *space_info = found; 3638 3700 return 0; ··· 3661 3721 found->bytes_reserved = 0; 3662 3722 found->bytes_readonly = 0; 3663 3723 found->bytes_may_use = 0; 3664 - found->full = 0; 3724 + if (total_bytes > 0) 3725 + found->full = 0; 3726 + else 3727 + found->full = 1; 3665 3728 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3666 3729 found->chunk_alloc = 0; 3667 3730 found->flush = 0; ··· 3918 3975 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3919 3976 need_commit--; 3920 3977 3978 + if (need_commit > 0) 3979 + btrfs_wait_ordered_roots(fs_info, -1); 3980 + 3921 3981 trans = btrfs_join_transaction(root); 3922 3982 if (IS_ERR(trans)) 3923 3983 return PTR_ERR(trans); ··· 4034 4088 return 1; 4035 4089 } 4036 4090 4037 - static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 4091 + static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) 4038 4092 { 4039 4093 u64 num_dev; 4040 4094 ··· 4048 4102 else 4049 4103 num_dev = 1; /* DUP or single */ 4050 4104 4051 - /* metadata for updaing devices and chunk tree */ 4052 - return btrfs_calc_trans_metadata_size(root, num_dev + 1); 4105 + return num_dev; 4053 4106 } 4054 4107 4055 - 
static void check_system_chunk(struct btrfs_trans_handle *trans, 4056 - struct btrfs_root *root, u64 type) 4108 + /* 4109 + * If @is_allocation is true, reserve space in the system space info necessary 4110 + * for allocating a chunk, otherwise if it's false, reserve space necessary for 4111 + * removing a chunk. 4112 + */ 4113 + void check_system_chunk(struct btrfs_trans_handle *trans, 4114 + struct btrfs_root *root, 4115 + u64 type) 4057 4116 { 4058 4117 struct btrfs_space_info *info; 4059 4118 u64 left; 4060 4119 u64 thresh; 4120 + int ret = 0; 4121 + u64 num_devs; 4122 + 4123 + /* 4124 + * Needed because we can end up allocating a system chunk and for an 4125 + * atomic and race free space reservation in the chunk block reserve. 4126 + */ 4127 + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); 4061 4128 4062 4129 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4063 4130 spin_lock(&info->lock); 4064 4131 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 4065 - info->bytes_reserved - info->bytes_readonly; 4132 + info->bytes_reserved - info->bytes_readonly - 4133 + info->bytes_may_use; 4066 4134 spin_unlock(&info->lock); 4067 4135 4068 - thresh = get_system_chunk_thresh(root, type); 4136 + num_devs = get_profile_num_devs(root, type); 4137 + 4138 + /* num_devs device items to update and 1 chunk item to add or remove */ 4139 + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + 4140 + btrfs_calc_trans_metadata_size(root, 1); 4141 + 4069 4142 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 4070 4143 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 4071 4144 left, thresh, type); ··· 4095 4130 u64 flags; 4096 4131 4097 4132 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 4098 - btrfs_alloc_chunk(trans, root, flags); 4133 + /* 4134 + * Ignore failure to create system chunk. 
We might end up not 4135 + * needing it, as we might not need to COW all nodes/leafs from 4136 + * the paths we visit in the chunk tree (they were already COWed 4137 + * or created in the current transaction for example). 4138 + */ 4139 + ret = btrfs_alloc_chunk(trans, root, flags); 4140 + } 4141 + 4142 + if (!ret) { 4143 + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, 4144 + &root->fs_info->chunk_block_rsv, 4145 + thresh, BTRFS_RESERVE_NO_FLUSH); 4146 + if (!ret) 4147 + trans->chunk_bytes_reserved += thresh; 4099 4148 } 4100 4149 } 4101 4150 ··· 5167 5188 trans->bytes_reserved = 0; 5168 5189 } 5169 5190 5191 + /* 5192 + * To be called after all the new block groups attached to the transaction 5193 + * handle have been created (btrfs_create_pending_block_groups()). 5194 + */ 5195 + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5196 + { 5197 + struct btrfs_fs_info *fs_info = trans->root->fs_info; 5198 + 5199 + if (!trans->chunk_bytes_reserved) 5200 + return; 5201 + 5202 + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5203 + 5204 + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5205 + trans->chunk_bytes_reserved); 5206 + trans->chunk_bytes_reserved = 0; 5207 + } 5208 + 5170 5209 /* Can only return 0 or -ENOSPC */ 5171 5210 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5172 5211 struct inode *inode) ··· 6089 6092 6090 6093 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6091 6094 struct btrfs_root *root, 6092 - u64 bytenr, u64 num_bytes, u64 parent, 6095 + struct btrfs_delayed_ref_node *node, u64 parent, 6093 6096 u64 root_objectid, u64 owner_objectid, 6094 6097 u64 owner_offset, int refs_to_drop, 6095 - struct btrfs_delayed_extent_op *extent_op, 6096 - int no_quota) 6098 + struct btrfs_delayed_extent_op *extent_op) 6097 6099 { 6098 6100 struct btrfs_key key; 6099 6101 struct btrfs_path *path; ··· 6106 6110 int extent_slot = 0; 6107 6111 int found_extent = 0; 6108 6112 int 
num_to_del = 1; 6113 + int no_quota = node->no_quota; 6109 6114 u32 item_size; 6110 6115 u64 refs; 6116 + u64 bytenr = node->bytenr; 6117 + u64 num_bytes = node->num_bytes; 6111 6118 int last_ref = 0; 6112 - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; 6113 6119 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6114 6120 SKINNY_METADATA); 6115 6121 ··· 6292 6294 refs -= refs_to_drop; 6293 6295 6294 6296 if (refs > 0) { 6295 - type = BTRFS_QGROUP_OPER_SUB_SHARED; 6296 6297 if (extent_op) 6297 6298 __run_delayed_extent_op(extent_op, leaf, ei); 6298 6299 /* ··· 6353 6356 } 6354 6357 btrfs_release_path(path); 6355 6358 6356 - /* Deal with the quota accounting */ 6357 - if (!ret && last_ref && !no_quota) { 6358 - int mod_seq = 0; 6359 - 6360 - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && 6361 - type == BTRFS_QGROUP_OPER_SUB_SHARED) 6362 - mod_seq = 1; 6363 - 6364 - ret = btrfs_qgroup_record_ref(trans, info, root_objectid, 6365 - bytenr, num_bytes, type, 6366 - mod_seq); 6367 - } 6368 6359 out: 6369 6360 btrfs_free_path(path); 6370 6361 return ret; ··· 6378 6393 goto out_delayed_unlock; 6379 6394 6380 6395 spin_lock(&head->lock); 6381 - if (rb_first(&head->ref_root)) 6396 + if (!list_empty(&head->ref_list)) 6382 6397 goto out; 6383 6398 6384 6399 if (head->extent_op) { ··· 7288 7303 btrfs_mark_buffer_dirty(path->nodes[0]); 7289 7304 btrfs_free_path(path); 7290 7305 7291 - /* Always set parent to 0 here since its exclusive anyway. 
*/ 7292 - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7293 - ins->objectid, ins->offset, 7294 - BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7295 - if (ret) 7296 - return ret; 7297 - 7298 7306 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 7299 7307 if (ret) { /* -ENOENT, logic error */ 7300 7308 btrfs_err(fs_info, "update block group failed for %llu %llu", ··· 7368 7390 7369 7391 btrfs_mark_buffer_dirty(leaf); 7370 7392 btrfs_free_path(path); 7371 - 7372 - if (!no_quota) { 7373 - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7374 - ins->objectid, num_bytes, 7375 - BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7376 - if (ret) 7377 - return ret; 7378 - } 7379 7393 7380 7394 ret = update_block_group(trans, root, ins->objectid, root->nodesize, 7381 7395 1); ··· 7725 7755 wc->reada_slot = slot; 7726 7756 } 7727 7757 7758 + /* 7759 + * TODO: Modify related function to add related node/leaf to dirty_extent_root, 7760 + * for later qgroup accounting. 7761 + * 7762 + * Current, this function does nothing. 7763 + */ 7728 7764 static int account_leaf_items(struct btrfs_trans_handle *trans, 7729 7765 struct btrfs_root *root, 7730 7766 struct extent_buffer *eb) 7731 7767 { 7732 7768 int nr = btrfs_header_nritems(eb); 7733 - int i, extent_type, ret; 7769 + int i, extent_type; 7734 7770 struct btrfs_key key; 7735 7771 struct btrfs_file_extent_item *fi; 7736 7772 u64 bytenr, num_bytes; ··· 7759 7783 continue; 7760 7784 7761 7785 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 7762 - 7763 - ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7764 - root->objectid, 7765 - bytenr, num_bytes, 7766 - BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); 7767 - if (ret) 7768 - return ret; 7769 7786 } 7770 7787 return 0; 7771 7788 } ··· 7827 7858 7828 7859 /* 7829 7860 * root_eb is the subtree root and is locked before this function is called. 
7861 + * TODO: Modify this function to mark all (including complete shared node) 7862 + * to dirty_extent_root to allow it get accounted in qgroup. 7830 7863 */ 7831 7864 static int account_shared_subtree(struct btrfs_trans_handle *trans, 7832 7865 struct btrfs_root *root, ··· 7891 7920 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 7892 7921 7893 7922 eb = read_tree_block(root, child_bytenr, child_gen); 7894 - if (!eb || !extent_buffer_uptodate(eb)) { 7923 + if (IS_ERR(eb)) { 7924 + ret = PTR_ERR(eb); 7925 + goto out; 7926 + } else if (!extent_buffer_uptodate(eb)) { 7927 + free_extent_buffer(eb); 7895 7928 ret = -EIO; 7896 7929 goto out; 7897 7930 } ··· 7906 7931 btrfs_tree_read_lock(eb); 7907 7932 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 7908 7933 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 7909 - 7910 - ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7911 - root->objectid, 7912 - child_bytenr, 7913 - root->nodesize, 7914 - BTRFS_QGROUP_OPER_SUB_SUBTREE, 7915 - 0); 7916 - if (ret) 7917 - goto out; 7918 - 7919 7934 } 7920 7935 7921 7936 if (level == 0) { ··· 8116 8151 if (reada && level == 1) 8117 8152 reada_walk_down(trans, root, wc, path); 8118 8153 next = read_tree_block(root, bytenr, generation); 8119 - if (!next || !extent_buffer_uptodate(next)) { 8154 + if (IS_ERR(next)) { 8155 + return PTR_ERR(next); 8156 + } else if (!extent_buffer_uptodate(next)) { 8120 8157 free_extent_buffer(next); 8121 8158 return -EIO; 8122 8159 } ··· 8500 8533 goto out_end_trans; 8501 8534 } 8502 8535 8503 - /* 8504 - * Qgroup update accounting is run from 8505 - * delayed ref handling. This usually works 8506 - * out because delayed refs are normally the 8507 - * only way qgroup updates are added. However, 8508 - * we may have added updates during our tree 8509 - * walk so run qgroups here to make sure we 8510 - * don't lose any updates. 
8511 - */ 8512 - ret = btrfs_delayed_qgroup_accounting(trans, 8513 - root->fs_info); 8514 - if (ret) 8515 - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8516 - "running qgroup updates " 8517 - "during snapshot delete. " 8518 - "Quota is out of sync, " 8519 - "rescan required.\n", ret); 8520 - 8521 8536 btrfs_end_transaction_throttle(trans, tree_root); 8522 8537 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8523 8538 pr_debug("BTRFS: drop snapshot early exit\n"); ··· 8553 8604 } 8554 8605 root_dropped = true; 8555 8606 out_end_trans: 8556 - ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); 8557 - if (ret) 8558 - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8559 - "running qgroup updates " 8560 - "during snapshot delete. " 8561 - "Quota is out of sync, " 8562 - "rescan required.\n", ret); 8563 - 8564 8607 btrfs_end_transaction_throttle(trans, tree_root); 8565 8608 out_free: 8566 8609 kfree(wc); ··· 9503 9562 9504 9563 free_excluded_extents(root, cache); 9505 9564 9565 + /* 9566 + * Call to ensure the corresponding space_info object is created and 9567 + * assigned to our block group, but don't update its counters just yet. 9568 + * We want our bg to be added to the rbtree with its ->space_info set. 9569 + */ 9570 + ret = update_space_info(root->fs_info, cache->flags, 0, 0, 9571 + &cache->space_info); 9572 + if (ret) { 9573 + btrfs_remove_free_space_cache(cache); 9574 + btrfs_put_block_group(cache); 9575 + return ret; 9576 + } 9577 + 9506 9578 ret = btrfs_add_block_group_cache(root->fs_info, cache); 9507 9579 if (ret) { 9508 9580 btrfs_remove_free_space_cache(cache); ··· 9523 9569 return ret; 9524 9570 } 9525 9571 9572 + /* 9573 + * Now that our block group has its ->space_info set and is inserted in 9574 + * the rbtree, update the space info's counters. 9575 + */ 9526 9576 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 9527 9577 &cache->space_info); 9528 9578 if (ret) {
fs/btrfs/extent-tree.h
+8 -1
fs/btrfs/extent_io.c
··· 1277 1277 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1278 1278 unsigned bits, gfp_t mask) 1279 1279 { 1280 - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1280 + int wake = 0; 1281 + 1282 + if (bits & EXTENT_LOCKED) 1283 + wake = 1; 1284 + 1285 + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); 1281 1286 } 1282 1287 1283 1288 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, ··· 4495 4490 } 4496 4491 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4497 4492 flags |= FIEMAP_EXTENT_ENCODED; 4493 + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4494 + flags |= FIEMAP_EXTENT_UNWRITTEN; 4498 4495 4499 4496 free_extent_map(em); 4500 4497 em = NULL;
+6 -3
fs/btrfs/file.c
··· 1868 1868 struct btrfs_log_ctx ctx; 1869 1869 int ret = 0; 1870 1870 bool full_sync = 0; 1871 + const u64 len = end - start + 1; 1871 1872 1872 1873 trace_btrfs_sync_file(file, datasync); 1873 1874 ··· 1897 1896 * all extents are persisted and the respective file extent 1898 1897 * items are in the fs/subvol btree. 1899 1898 */ 1900 - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); 1899 + ret = btrfs_wait_ordered_range(inode, start, len); 1901 1900 } else { 1902 1901 /* 1903 1902 * Start any new ordered operations before starting to log the ··· 1969 1968 */ 1970 1969 smp_mb(); 1971 1970 if (btrfs_inode_in_log(inode, root->fs_info->generation) || 1972 - (full_sync && BTRFS_I(inode)->last_trans <= 1973 - root->fs_info->last_trans_committed)) { 1971 + (BTRFS_I(inode)->last_trans <= 1972 + root->fs_info->last_trans_committed && 1973 + (full_sync || 1974 + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { 1974 1975 /* 1975 1976 * We've had everything committed since the last time we were 1976 1977 * modified so clear this flag in case it was set for whatever
+6 -8
fs/btrfs/free-space-cache.c
··· 231 231 { 232 232 int ret = 0; 233 233 struct btrfs_path *path = btrfs_alloc_path(); 234 + bool locked = false; 234 235 235 236 if (!path) { 236 237 ret = -ENOMEM; ··· 239 238 } 240 239 241 240 if (block_group) { 241 + locked = true; 242 242 mutex_lock(&trans->transaction->cache_write_mutex); 243 243 if (!list_empty(&block_group->io_list)) { 244 244 list_del_init(&block_group->io_list); ··· 271 269 */ 272 270 ret = btrfs_truncate_inode_items(trans, root, inode, 273 271 0, BTRFS_EXTENT_DATA_KEY); 274 - if (ret) { 275 - mutex_unlock(&trans->transaction->cache_write_mutex); 276 - btrfs_abort_transaction(trans, root, ret); 277 - return ret; 278 - } 272 + if (ret) 273 + goto fail; 279 274 280 275 ret = btrfs_update_inode(trans, root, inode); 281 276 282 - if (block_group) 283 - mutex_unlock(&trans->transaction->cache_write_mutex); 284 - 285 277 fail: 278 + if (locked) 279 + mutex_unlock(&trans->transaction->cache_write_mutex); 286 280 if (ret) 287 281 btrfs_abort_transaction(trans, root, ret); 288 282
+21 -5
fs/btrfs/inode.c
··· 4986 4986 } 4987 4987 write_unlock(&map_tree->lock); 4988 4988 4989 + /* 4990 + * Keep looping until we have no more ranges in the io tree. 4991 + * We can have ongoing bios started by readpages (called from readahead) 4992 + * that didn't get their end io callbacks called yet or they are still 4993 + * in progress (extent_io.c:end_bio_extent_readpage()). This means some 4994 + * ranges can still be locked and eviction started because before 4995 + * submitting those bios, which are executed by a separate task (work 4996 + * queue kthread), inode references (inode->i_count) were not taken 4997 + * (which would be dropped in the end io callback of each bio). 4998 + * Therefore here we effectively end up waiting for those bios and 4999 + * anyone else holding locked ranges without having bumped the inode's 5000 + * reference count - if we don't do it, when they access the inode's 5001 + * io_tree to unlock a range it may be too late, leading to a 5002 + * use-after-free issue. 5003 + */ 4989 5004 spin_lock(&io_tree->lock); 4990 5005 while (!RB_EMPTY_ROOT(&io_tree->state)) { 4991 5006 struct extent_state *state; 4992 5007 struct extent_state *cached_state = NULL; 5008 + u64 start; 5009 + u64 end; 4993 5010 4994 5011 node = rb_first(&io_tree->state); 4995 5012 state = rb_entry(node, struct extent_state, rb_node); 4996 - atomic_inc(&state->refs); 5013 + start = state->start; 5014 + end = state->end; 4997 5015 spin_unlock(&io_tree->lock); 4998 5016 4999 - lock_extent_bits(io_tree, state->start, state->end, 5000 - 0, &cached_state); 5001 - clear_extent_bit(io_tree, state->start, state->end, 5017 + lock_extent_bits(io_tree, start, end, 0, &cached_state); 5018 + clear_extent_bit(io_tree, start, end, 5002 5019 EXTENT_LOCKED | EXTENT_DIRTY | 5003 5020 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 5004 5021 EXTENT_DEFRAG, 1, 1, 5005 5022 &cached_state, GFP_NOFS); 5006 - free_extent_state(state); 5007 5023 5008 5024 cond_resched(); 5009 5025 spin_lock(&io_tree->lock);
+34 -16
fs/btrfs/ioctl.c
··· 553 553 key.offset = (u64)-1; 554 554 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); 555 555 if (IS_ERR(new_root)) { 556 - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); 557 556 ret = PTR_ERR(new_root); 557 + btrfs_abort_transaction(trans, root, ret); 558 558 goto fail; 559 559 } 560 560 ··· 1318 1318 i = range->start >> PAGE_CACHE_SHIFT; 1319 1319 } 1320 1320 if (!max_to_defrag) 1321 - max_to_defrag = last_index + 1; 1321 + max_to_defrag = last_index - i + 1; 1322 1322 1323 1323 /* 1324 1324 * make writeback starts from i, so the defrag range can be ··· 1368 1368 ra_index = max(i, ra_index); 1369 1369 btrfs_force_ra(inode->i_mapping, ra, file, ra_index, 1370 1370 cluster); 1371 - ra_index += max_cluster; 1371 + ra_index += cluster; 1372 1372 } 1373 1373 1374 1374 mutex_lock(&inode->i_mutex); ··· 2271 2271 { 2272 2272 struct btrfs_ioctl_ino_lookup_args *args; 2273 2273 struct inode *inode; 2274 - int ret; 2275 - 2276 - if (!capable(CAP_SYS_ADMIN)) 2277 - return -EPERM; 2274 + int ret = 0; 2278 2275 2279 2276 args = memdup_user(argp, sizeof(*args)); 2280 2277 if (IS_ERR(args)) ··· 2279 2282 2280 2283 inode = file_inode(file); 2281 2284 2285 + /* 2286 + * Unprivileged query to obtain the containing subvolume root id. The 2287 + * path is reset so it's consistent with btrfs_search_path_in_tree. 
2288 + */ 2282 2289 if (args->treeid == 0) 2283 2290 args->treeid = BTRFS_I(inode)->root->root_key.objectid; 2291 + 2292 + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { 2293 + args->name[0] = 0; 2294 + goto out; 2295 + } 2296 + 2297 + if (!capable(CAP_SYS_ADMIN)) { 2298 + ret = -EPERM; 2299 + goto out; 2300 + } 2284 2301 2285 2302 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, 2286 2303 args->treeid, args->objectid, 2287 2304 args->name); 2288 2305 2306 + out: 2289 2307 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 2290 2308 ret = -EFAULT; 2291 2309 ··· 2425 2413 goto out_unlock_inode; 2426 2414 } 2427 2415 2428 - d_invalidate(dentry); 2429 - 2430 2416 down_write(&root->fs_info->subvol_sem); 2431 2417 2432 2418 err = may_destroy_subvol(dest); ··· 2518 2508 out_unlock_inode: 2519 2509 mutex_unlock(&inode->i_mutex); 2520 2510 if (!err) { 2521 - shrink_dcache_sb(root->fs_info->sb); 2511 + d_invalidate(dentry); 2522 2512 btrfs_invalidate_inodes(dest); 2523 2513 d_delete(dentry); 2524 2514 ASSERT(dest->send_in_progress == 0); ··· 2889 2879 return ret; 2890 2880 } 2891 2881 2892 - static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) 2882 + static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, 2883 + u64 olen) 2893 2884 { 2885 + u64 len = *plen; 2894 2886 u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; 2895 2887 2896 - if (off + len > inode->i_size || off + len < off) 2888 + if (off + olen > inode->i_size || off + olen < off) 2897 2889 return -EINVAL; 2890 + 2891 + /* if we extend to eof, continue to block boundary */ 2892 + if (off + len == inode->i_size) 2893 + *plen = len = ALIGN(inode->i_size, bs) - off; 2894 + 2898 2895 /* Check that we are block aligned - btrfs_clone() requires this */ 2899 2896 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) 2900 2897 return -EINVAL; ··· 2909 2892 return 0; 2910 2893 } 2911 2894 2912 - static int btrfs_extent_same(struct inode *src, u64 
loff, u64 len, 2895 + static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, 2913 2896 struct inode *dst, u64 dst_loff) 2914 2897 { 2915 2898 int ret; 2899 + u64 len = olen; 2916 2900 2917 2901 /* 2918 2902 * btrfs_clone() can't handle extents in the same file ··· 2928 2910 2929 2911 btrfs_double_lock(src, loff, dst, dst_loff, len); 2930 2912 2931 - ret = extent_same_check_offsets(src, loff, len); 2913 + ret = extent_same_check_offsets(src, loff, &len, olen); 2932 2914 if (ret) 2933 2915 goto out_unlock; 2934 2916 2935 - ret = extent_same_check_offsets(dst, dst_loff, len); 2917 + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); 2936 2918 if (ret) 2937 2919 goto out_unlock; 2938 2920 ··· 2945 2927 2946 2928 ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); 2947 2929 if (ret == 0) 2948 - ret = btrfs_clone(src, dst, loff, len, len, dst_loff); 2930 + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff); 2949 2931 2950 2932 out_unlock: 2951 2933 btrfs_double_unlock(src, loff, dst, dst_loff, len);
+29 -8
fs/btrfs/ordered-data.c
··· 198 198 entry->file_offset = file_offset; 199 199 entry->start = start; 200 200 entry->len = len; 201 - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && 202 - !(type == BTRFS_ORDERED_NOCOW)) 203 - entry->csum_bytes_left = disk_len; 204 201 entry->disk_len = disk_len; 205 202 entry->bytes_left = len; 206 203 entry->inode = igrab(inode); ··· 283 286 tree = &BTRFS_I(inode)->ordered_tree; 284 287 spin_lock_irq(&tree->lock); 285 288 list_add_tail(&sum->list, &entry->list); 286 - WARN_ON(entry->csum_bytes_left < sum->len); 287 - entry->csum_bytes_left -= sum->len; 288 - if (entry->csum_bytes_left == 0) 289 - wake_up(&entry->wait); 290 289 spin_unlock_irq(&tree->lock); 291 290 } 292 291 ··· 502 509 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, 503 510 &ordered->flags)); 504 511 505 - list_add_tail(&ordered->trans_list, &trans->ordered); 512 + /* 513 + * If our ordered extent completed it means it updated the 514 + * fs/subvol and csum trees already, so no need to make the 515 + * current transaction's commit wait for it, as we end up 516 + * holding memory unnecessarily and delaying the inode's iput 517 + * until the transaction commit (we schedule an iput for the 518 + * inode when the ordered extent's refcount drops to 0), which 519 + * prevents it from being evictable until the transaction 520 + * commits. 
521 + */ 522 + if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) 523 + btrfs_put_ordered_extent(ordered); 524 + else 525 + list_add_tail(&ordered->trans_list, &trans->ordered); 526 + 506 527 spin_lock_irq(&log->log_extents_lock[index]); 507 528 } 508 529 spin_unlock_irq(&log->log_extents_lock[index]); ··· 849 842 atomic_inc(&entry->refs); 850 843 spin_unlock_irq(&tree->lock); 851 844 return entry; 845 + } 846 + 847 + bool btrfs_have_ordered_extents_in_range(struct inode *inode, 848 + u64 file_offset, 849 + u64 len) 850 + { 851 + struct btrfs_ordered_extent *oe; 852 + 853 + oe = btrfs_lookup_ordered_range(inode, file_offset, len); 854 + if (oe) { 855 + btrfs_put_ordered_extent(oe); 856 + return true; 857 + } 858 + return false; 852 859 } 853 860 854 861 /*
+3 -3
fs/btrfs/ordered-data.h
··· 89 89 /* number of bytes that still need writing */ 90 90 u64 bytes_left; 91 91 92 - /* number of bytes that still need csumming */ 93 - u64 csum_bytes_left; 94 - 95 92 /* 96 93 * the end of the ordered extent which is behind it but 97 94 * didn't update disk_i_size. Please see the comment of ··· 188 191 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, 189 192 u64 file_offset, 190 193 u64 len); 194 + bool btrfs_have_ordered_extents_in_range(struct inode *inode, 195 + u64 file_offset, 196 + u64 len); 191 197 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 192 198 struct btrfs_ordered_extent *ordered); 193 199 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+286 -802
fs/btrfs/qgroup.c
··· 34 34 #include "extent_io.h" 35 35 #include "qgroup.h" 36 36 37 + 37 38 /* TODO XXX FIXME 38 39 * - subvol delete -> delete when ref goes to 0? delete limits also? 39 40 * - reorganize keys ··· 85 84 86 85 /* 87 86 * temp variables for accounting operations 87 + * Refer to qgroup_shared_accouting() for details. 88 88 */ 89 89 u64 old_refcnt; 90 90 u64 new_refcnt; 91 91 }; 92 + 93 + static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, 94 + int mod) 95 + { 96 + if (qg->old_refcnt < seq) 97 + qg->old_refcnt = seq; 98 + qg->old_refcnt += mod; 99 + } 100 + 101 + static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, 102 + int mod) 103 + { 104 + if (qg->new_refcnt < seq) 105 + qg->new_refcnt = seq; 106 + qg->new_refcnt += mod; 107 + } 108 + 109 + static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) 110 + { 111 + if (qg->old_refcnt < seq) 112 + return 0; 113 + return qg->old_refcnt - seq; 114 + } 115 + 116 + static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) 117 + { 118 + if (qg->new_refcnt < seq) 119 + return 0; 120 + return qg->new_refcnt - seq; 121 + } 92 122 93 123 /* 94 124 * glue structure to represent the relations between qgroups. 
··· 1147 1115 struct ulist *tmp; 1148 1116 int ret = 0; 1149 1117 1150 - tmp = ulist_alloc(GFP_NOFS); 1151 - if (!tmp) 1152 - return -ENOMEM; 1153 - 1154 1118 /* Check the level of src and dst first */ 1155 1119 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) 1156 1120 return -EINVAL; 1121 + 1122 + tmp = ulist_alloc(GFP_NOFS); 1123 + if (!tmp) 1124 + return -ENOMEM; 1157 1125 1158 1126 mutex_lock(&fs_info->qgroup_ioctl_lock); 1159 1127 quota_root = fs_info->quota_root; ··· 1388 1356 return ret; 1389 1357 } 1390 1358 1391 - static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, 1392 - struct btrfs_qgroup_operation *oper2) 1359 + int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, 1360 + struct btrfs_fs_info *fs_info) 1393 1361 { 1394 - /* 1395 - * Ignore seq and type here, we're looking for any operation 1396 - * at all related to this extent on that root. 1397 - */ 1398 - if (oper1->bytenr < oper2->bytenr) 1399 - return -1; 1400 - if (oper1->bytenr > oper2->bytenr) 1401 - return 1; 1402 - if (oper1->ref_root < oper2->ref_root) 1403 - return -1; 1404 - if (oper1->ref_root > oper2->ref_root) 1405 - return 1; 1406 - return 0; 1407 - } 1408 - 1409 - static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, 1410 - struct btrfs_qgroup_operation *oper) 1411 - { 1412 - struct rb_node *n; 1413 - struct btrfs_qgroup_operation *cur; 1414 - int cmp; 1415 - 1416 - spin_lock(&fs_info->qgroup_op_lock); 1417 - n = fs_info->qgroup_op_tree.rb_node; 1418 - while (n) { 1419 - cur = rb_entry(n, struct btrfs_qgroup_operation, n); 1420 - cmp = comp_oper_exist(cur, oper); 1421 - if (cmp < 0) { 1422 - n = n->rb_right; 1423 - } else if (cmp) { 1424 - n = n->rb_left; 1425 - } else { 1426 - spin_unlock(&fs_info->qgroup_op_lock); 1427 - return -EEXIST; 1428 - } 1429 - } 1430 - spin_unlock(&fs_info->qgroup_op_lock); 1431 - return 0; 1432 - } 1433 - 1434 - static int comp_oper(struct btrfs_qgroup_operation *oper1, 1435 - struct btrfs_qgroup_operation 
*oper2) 1436 - { 1437 - if (oper1->bytenr < oper2->bytenr) 1438 - return -1; 1439 - if (oper1->bytenr > oper2->bytenr) 1440 - return 1; 1441 - if (oper1->ref_root < oper2->ref_root) 1442 - return -1; 1443 - if (oper1->ref_root > oper2->ref_root) 1444 - return 1; 1445 - if (oper1->seq < oper2->seq) 1446 - return -1; 1447 - if (oper1->seq > oper2->seq) 1448 - return 1; 1449 - if (oper1->type < oper2->type) 1450 - return -1; 1451 - if (oper1->type > oper2->type) 1452 - return 1; 1453 - return 0; 1454 - } 1455 - 1456 - static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, 1457 - struct btrfs_qgroup_operation *oper) 1458 - { 1459 - struct rb_node **p; 1460 - struct rb_node *parent = NULL; 1461 - struct btrfs_qgroup_operation *cur; 1462 - int cmp; 1463 - 1464 - spin_lock(&fs_info->qgroup_op_lock); 1465 - p = &fs_info->qgroup_op_tree.rb_node; 1466 - while (*p) { 1467 - parent = *p; 1468 - cur = rb_entry(parent, struct btrfs_qgroup_operation, n); 1469 - cmp = comp_oper(cur, oper); 1470 - if (cmp < 0) { 1471 - p = &(*p)->rb_right; 1472 - } else if (cmp) { 1473 - p = &(*p)->rb_left; 1474 - } else { 1475 - spin_unlock(&fs_info->qgroup_op_lock); 1476 - return -EEXIST; 1477 - } 1478 - } 1479 - rb_link_node(&oper->n, parent, p); 1480 - rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); 1481 - spin_unlock(&fs_info->qgroup_op_lock); 1482 - return 0; 1483 - } 1484 - 1485 - /* 1486 - * Record a quota operation for processing later on. 1487 - * @trans: the transaction we are adding the delayed op to. 1488 - * @fs_info: the fs_info for this fs. 1489 - * @ref_root: the root of the reference we are acting on, 1490 - * @bytenr: the bytenr we are acting on. 1491 - * @num_bytes: the number of bytes in the reference. 1492 - * @type: the type of operation this is. 1493 - * @mod_seq: do we need to get a sequence number for looking up roots. 1494 - * 1495 - * We just add it to our trans qgroup_ref_list and carry on and process these 1496 - * operations in order at some later point. 
If the reference root isn't a fs 1497 - * root then we don't bother with doing anything. 1498 - * 1499 - * MUST BE HOLDING THE REF LOCK. 1500 - */ 1501 - int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, 1502 - struct btrfs_fs_info *fs_info, u64 ref_root, 1503 - u64 bytenr, u64 num_bytes, 1504 - enum btrfs_qgroup_operation_type type, int mod_seq) 1505 - { 1506 - struct btrfs_qgroup_operation *oper; 1507 - int ret; 1508 - 1509 - if (!is_fstree(ref_root) || !fs_info->quota_enabled) 1510 - return 0; 1511 - 1512 - oper = kmalloc(sizeof(*oper), GFP_NOFS); 1513 - if (!oper) 1514 - return -ENOMEM; 1515 - 1516 - oper->ref_root = ref_root; 1517 - oper->bytenr = bytenr; 1518 - oper->num_bytes = num_bytes; 1519 - oper->type = type; 1520 - oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); 1521 - INIT_LIST_HEAD(&oper->elem.list); 1522 - oper->elem.seq = 0; 1523 - 1524 - trace_btrfs_qgroup_record_ref(oper); 1525 - 1526 - if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { 1527 - /* 1528 - * If any operation for this bytenr/ref_root combo 1529 - * exists, then we know it's not exclusively owned and 1530 - * shouldn't be queued up. 1531 - * 1532 - * This also catches the case where we have a cloned 1533 - * extent that gets queued up multiple times during 1534 - * drop snapshot. 
1535 - */ 1536 - if (qgroup_oper_exists(fs_info, oper)) { 1537 - kfree(oper); 1538 - return 0; 1539 - } 1540 - } 1541 - 1542 - ret = insert_qgroup_oper(fs_info, oper); 1543 - if (ret) { 1544 - /* Shouldn't happen so have an assert for developers */ 1545 - ASSERT(0); 1546 - kfree(oper); 1547 - return ret; 1548 - } 1549 - list_add_tail(&oper->list, &trans->qgroup_ref_list); 1550 - 1551 - if (mod_seq) 1552 - btrfs_get_tree_mod_seq(fs_info, &oper->elem); 1553 - 1554 - return 0; 1555 - } 1556 - 1557 - static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, 1558 - struct btrfs_qgroup_operation *oper) 1559 - { 1560 - struct ulist *tmp; 1561 - int sign = 0; 1362 + struct btrfs_qgroup_extent_record *record; 1363 + struct btrfs_delayed_ref_root *delayed_refs; 1364 + struct rb_node *node; 1365 + u64 qgroup_to_skip; 1562 1366 int ret = 0; 1563 1367 1564 - tmp = ulist_alloc(GFP_NOFS); 1565 - if (!tmp) 1566 - return -ENOMEM; 1368 + delayed_refs = &trans->transaction->delayed_refs; 1369 + qgroup_to_skip = delayed_refs->qgroup_to_skip; 1567 1370 1568 - spin_lock(&fs_info->qgroup_lock); 1569 - if (!fs_info->quota_root) 1570 - goto out; 1571 - 1572 - switch (oper->type) { 1573 - case BTRFS_QGROUP_OPER_ADD_EXCL: 1574 - sign = 1; 1575 - break; 1576 - case BTRFS_QGROUP_OPER_SUB_EXCL: 1577 - sign = -1; 1578 - break; 1579 - default: 1580 - ASSERT(0); 1371 + /* 1372 + * No need to do lock, since this function will only be called in 1373 + * btrfs_commmit_transaction(). 
1374 + */ 1375 + node = rb_first(&delayed_refs->dirty_extent_root); 1376 + while (node) { 1377 + record = rb_entry(node, struct btrfs_qgroup_extent_record, 1378 + node); 1379 + ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, 1380 + &record->old_roots); 1381 + if (ret < 0) 1382 + break; 1383 + if (qgroup_to_skip) 1384 + ulist_del(record->old_roots, qgroup_to_skip, 0); 1385 + node = rb_next(node); 1581 1386 } 1582 - ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, 1583 - oper->num_bytes, sign); 1584 - out: 1585 - spin_unlock(&fs_info->qgroup_lock); 1586 - ulist_free(tmp); 1587 1387 return ret; 1588 1388 } 1589 1389 1390 + struct btrfs_qgroup_extent_record 1391 + *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, 1392 + struct btrfs_qgroup_extent_record *record) 1393 + { 1394 + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1395 + struct rb_node *parent_node = NULL; 1396 + struct btrfs_qgroup_extent_record *entry; 1397 + u64 bytenr = record->bytenr; 1398 + 1399 + while (*p) { 1400 + parent_node = *p; 1401 + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1402 + node); 1403 + if (bytenr < entry->bytenr) 1404 + p = &(*p)->rb_left; 1405 + else if (bytenr > entry->bytenr) 1406 + p = &(*p)->rb_right; 1407 + else 1408 + return entry; 1409 + } 1410 + 1411 + rb_link_node(&record->node, parent_node, p); 1412 + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1413 + return NULL; 1414 + } 1415 + 1416 + #define UPDATE_NEW 0 1417 + #define UPDATE_OLD 1 1590 1418 /* 1591 - * Walk all of the roots that pointed to our bytenr and adjust their refcnts as 1592 - * properly. 1419 + * Walk all of the roots that points to the bytenr and adjust their refcnts. 
1593 1420 */ 1594 - static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, 1595 - u64 root_to_skip, struct ulist *tmp, 1596 - struct ulist *roots, struct ulist *qgroups, 1597 - u64 seq, int *old_roots, int rescan) 1421 + static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 1422 + struct ulist *roots, struct ulist *tmp, 1423 + struct ulist *qgroups, u64 seq, int update_old) 1598 1424 { 1599 1425 struct ulist_node *unode; 1600 1426 struct ulist_iterator uiter; 1601 1427 struct ulist_node *tmp_unode; 1602 1428 struct ulist_iterator tmp_uiter; 1603 1429 struct btrfs_qgroup *qg; 1604 - int ret; 1430 + int ret = 0; 1605 1431 1432 + if (!roots) 1433 + return 0; 1606 1434 ULIST_ITER_INIT(&uiter); 1607 1435 while ((unode = ulist_next(roots, &uiter))) { 1608 - /* We don't count our current root here */ 1609 - if (unode->val == root_to_skip) 1610 - continue; 1611 1436 qg = find_qgroup_rb(fs_info, unode->val); 1612 1437 if (!qg) 1613 1438 continue; 1614 - /* 1615 - * We could have a pending removal of this same ref so we may 1616 - * not have actually found our ref root when doing 1617 - * btrfs_find_all_roots, so we need to keep track of how many 1618 - * old roots we find in case we removed ours and added a 1619 - * different one at the same time. I don't think this could 1620 - * happen in practice but that sort of thinking leads to pain 1621 - * and suffering and to the dark side. 1622 - */ 1623 - (*old_roots)++; 1624 1439 1625 1440 ulist_reinit(tmp); 1626 1441 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), ··· 1482 1603 struct btrfs_qgroup_list *glist; 1483 1604 1484 1605 qg = u64_to_ptr(tmp_unode->aux); 1485 - /* 1486 - * We use this sequence number to keep from having to 1487 - * run the whole list and 0 out the refcnt every time. 1488 - * We basically use sequnce as the known 0 count and 1489 - * then add 1 everytime we see a qgroup. 
This is how we 1490 - * get how many of the roots actually point up to the 1491 - * upper level qgroups in order to determine exclusive 1492 - * counts. 1493 - * 1494 - * For rescan we want to set old_refcnt to seq so our 1495 - * exclusive calculations end up correct. 1496 - */ 1497 - if (rescan) 1498 - qg->old_refcnt = seq; 1499 - else if (qg->old_refcnt < seq) 1500 - qg->old_refcnt = seq + 1; 1606 + if (update_old) 1607 + btrfs_qgroup_update_old_refcnt(qg, seq, 1); 1501 1608 else 1502 - qg->old_refcnt++; 1503 - 1504 - if (qg->new_refcnt < seq) 1505 - qg->new_refcnt = seq + 1; 1506 - else 1507 - qg->new_refcnt++; 1609 + btrfs_qgroup_update_new_refcnt(qg, seq, 1); 1508 1610 list_for_each_entry(glist, &qg->groups, next_group) { 1509 1611 ret = ulist_add(qgroups, glist->group->qgroupid, 1510 1612 ptr_to_u64(glist->group), ··· 1504 1644 } 1505 1645 1506 1646 /* 1507 - * We need to walk forward in our operation tree and account for any roots that 1508 - * were deleted after we made this operation. 1647 + * Update qgroup rfer/excl counters. 1648 + * Rfer update is easy, codes can explain themselves. 1649 + * 1650 + * Excl update is tricky, the update is split into 2 part. 1651 + * Part 1: Possible exclusive <-> sharing detect: 1652 + * | A | !A | 1653 + * ------------------------------------- 1654 + * B | * | - | 1655 + * ------------------------------------- 1656 + * !B | + | ** | 1657 + * ------------------------------------- 1658 + * 1659 + * Conditions: 1660 + * A: cur_old_roots < nr_old_roots (not exclusive before) 1661 + * !A: cur_old_roots == nr_old_roots (possible exclusive before) 1662 + * B: cur_new_roots < nr_new_roots (not exclusive now) 1663 + * !B: cur_new_roots == nr_new_roots (possible exclsuive now) 1664 + * 1665 + * Results: 1666 + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 1667 + * *: Definitely not changed. **: Possible unchanged. 1668 + * 1669 + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. 
1670 + * 1671 + * To make the logic clear, we first use condition A and B to split 1672 + * combination into 4 results. 1673 + * 1674 + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them 1675 + * only on variant maybe 0. 1676 + * 1677 + * Lastly, check result **, since there are 2 variants maybe 0, split them 1678 + * again(2x2). 1679 + * But this time we don't need to consider other things, the codes and logic 1680 + * is easy to understand now. 1509 1681 */ 1510 - static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, 1511 - struct btrfs_qgroup_operation *oper, 1512 - struct ulist *tmp, 1513 - struct ulist *qgroups, u64 seq, 1514 - int *old_roots) 1515 - { 1516 - struct ulist_node *unode; 1517 - struct ulist_iterator uiter; 1518 - struct btrfs_qgroup *qg; 1519 - struct btrfs_qgroup_operation *tmp_oper; 1520 - struct rb_node *n; 1521 - int ret; 1522 - 1523 - ulist_reinit(tmp); 1524 - 1525 - /* 1526 - * We only walk forward in the tree since we're only interested in 1527 - * removals that happened _after_ our operation. 1528 - */ 1529 - spin_lock(&fs_info->qgroup_op_lock); 1530 - n = rb_next(&oper->n); 1531 - spin_unlock(&fs_info->qgroup_op_lock); 1532 - if (!n) 1533 - return 0; 1534 - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); 1535 - while (tmp_oper->bytenr == oper->bytenr) { 1536 - /* 1537 - * If it's not a removal we don't care, additions work out 1538 - * properly with our refcnt tracking. 1539 - */ 1540 - if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && 1541 - tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) 1542 - goto next; 1543 - qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); 1544 - if (!qg) 1545 - goto next; 1546 - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), 1547 - GFP_ATOMIC); 1548 - if (ret) { 1549 - if (ret < 0) 1550 - return ret; 1551 - /* 1552 - * We only want to increase old_roots if this qgroup is 1553 - * not already in the list of qgroups. 
If it is already 1554 - * there then that means it must have been re-added or 1555 - * the delete will be discarded because we had an 1556 - * existing ref that we haven't looked up yet. In this 1557 - * case we don't want to increase old_roots. So if ret 1558 - * == 1 then we know that this is the first time we've 1559 - * seen this qgroup and we can bump the old_roots. 1560 - */ 1561 - (*old_roots)++; 1562 - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), 1563 - GFP_ATOMIC); 1564 - if (ret < 0) 1565 - return ret; 1566 - } 1567 - next: 1568 - spin_lock(&fs_info->qgroup_op_lock); 1569 - n = rb_next(&tmp_oper->n); 1570 - spin_unlock(&fs_info->qgroup_op_lock); 1571 - if (!n) 1572 - break; 1573 - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); 1574 - } 1575 - 1576 - /* Ok now process the qgroups we found */ 1577 - ULIST_ITER_INIT(&uiter); 1578 - while ((unode = ulist_next(tmp, &uiter))) { 1579 - struct btrfs_qgroup_list *glist; 1580 - 1581 - qg = u64_to_ptr(unode->aux); 1582 - if (qg->old_refcnt < seq) 1583 - qg->old_refcnt = seq + 1; 1584 - else 1585 - qg->old_refcnt++; 1586 - if (qg->new_refcnt < seq) 1587 - qg->new_refcnt = seq + 1; 1588 - else 1589 - qg->new_refcnt++; 1590 - list_for_each_entry(glist, &qg->groups, next_group) { 1591 - ret = ulist_add(qgroups, glist->group->qgroupid, 1592 - ptr_to_u64(glist->group), GFP_ATOMIC); 1593 - if (ret < 0) 1594 - return ret; 1595 - ret = ulist_add(tmp, glist->group->qgroupid, 1596 - ptr_to_u64(glist->group), GFP_ATOMIC); 1597 - if (ret < 0) 1598 - return ret; 1599 - } 1600 - } 1601 - return 0; 1602 - } 1603 - 1604 - /* Add refcnt for the newly added reference. 
*/ 1605 - static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, 1606 - struct btrfs_qgroup_operation *oper, 1607 - struct btrfs_qgroup *qgroup, 1608 - struct ulist *tmp, struct ulist *qgroups, 1609 - u64 seq) 1610 - { 1611 - struct ulist_node *unode; 1612 - struct ulist_iterator uiter; 1613 - struct btrfs_qgroup *qg; 1614 - int ret; 1615 - 1616 - ulist_reinit(tmp); 1617 - ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), 1618 - GFP_ATOMIC); 1619 - if (ret < 0) 1620 - return ret; 1621 - ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), 1622 - GFP_ATOMIC); 1623 - if (ret < 0) 1624 - return ret; 1625 - ULIST_ITER_INIT(&uiter); 1626 - while ((unode = ulist_next(tmp, &uiter))) { 1627 - struct btrfs_qgroup_list *glist; 1628 - 1629 - qg = u64_to_ptr(unode->aux); 1630 - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { 1631 - if (qg->new_refcnt < seq) 1632 - qg->new_refcnt = seq + 1; 1633 - else 1634 - qg->new_refcnt++; 1635 - } else { 1636 - if (qg->old_refcnt < seq) 1637 - qg->old_refcnt = seq + 1; 1638 - else 1639 - qg->old_refcnt++; 1640 - } 1641 - list_for_each_entry(glist, &qg->groups, next_group) { 1642 - ret = ulist_add(tmp, glist->group->qgroupid, 1643 - ptr_to_u64(glist->group), GFP_ATOMIC); 1644 - if (ret < 0) 1645 - return ret; 1646 - ret = ulist_add(qgroups, glist->group->qgroupid, 1647 - ptr_to_u64(glist->group), GFP_ATOMIC); 1648 - if (ret < 0) 1649 - return ret; 1650 - } 1651 - } 1652 - return 0; 1653 - } 1654 - 1655 - /* 1656 - * This adjusts the counters for all referenced qgroups if need be. 
1657 - */ 1658 - static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, 1659 - u64 root_to_skip, u64 num_bytes, 1660 - struct ulist *qgroups, u64 seq, 1661 - int old_roots, int new_roots, int rescan) 1682 + static int qgroup_update_counters(struct btrfs_fs_info *fs_info, 1683 + struct ulist *qgroups, 1684 + u64 nr_old_roots, 1685 + u64 nr_new_roots, 1686 + u64 num_bytes, u64 seq) 1662 1687 { 1663 1688 struct ulist_node *unode; 1664 1689 struct ulist_iterator uiter; ··· 1555 1810 bool dirty = false; 1556 1811 1557 1812 qg = u64_to_ptr(unode->aux); 1558 - /* 1559 - * Wasn't referenced before but is now, add to the reference 1560 - * counters. 1561 - */ 1562 - if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { 1813 + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 1814 + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 1815 + 1816 + /* Rfer update part */ 1817 + if (cur_old_count == 0 && cur_new_count > 0) { 1563 1818 qg->rfer += num_bytes; 1564 1819 qg->rfer_cmpr += num_bytes; 1565 1820 dirty = true; 1566 1821 } 1567 - 1568 - /* 1569 - * Was referenced before but isn't now, subtract from the 1570 - * reference counters. 1571 - */ 1572 - if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { 1822 + if (cur_old_count > 0 && cur_new_count == 0) { 1573 1823 qg->rfer -= num_bytes; 1574 1824 qg->rfer_cmpr -= num_bytes; 1575 1825 dirty = true; 1576 1826 } 1577 1827 1578 - if (qg->old_refcnt < seq) 1579 - cur_old_count = 0; 1580 - else 1581 - cur_old_count = qg->old_refcnt - seq; 1582 - if (qg->new_refcnt < seq) 1583 - cur_new_count = 0; 1584 - else 1585 - cur_new_count = qg->new_refcnt - seq; 1586 - 1587 - /* 1588 - * If our refcount was the same as the roots previously but our 1589 - * new count isn't the same as the number of roots now then we 1590 - * went from having a exclusive reference on this range to not. 
1591 - */ 1592 - if (old_roots && cur_old_count == old_roots && 1593 - (cur_new_count != new_roots || new_roots == 0)) { 1594 - WARN_ON(cur_new_count != new_roots && new_roots == 0); 1595 - qg->excl -= num_bytes; 1596 - qg->excl_cmpr -= num_bytes; 1597 - dirty = true; 1828 + /* Excl update part */ 1829 + /* Exclusive/none -> shared case */ 1830 + if (cur_old_count == nr_old_roots && 1831 + cur_new_count < nr_new_roots) { 1832 + /* Exclusive -> shared */ 1833 + if (cur_old_count != 0) { 1834 + qg->excl -= num_bytes; 1835 + qg->excl_cmpr -= num_bytes; 1836 + dirty = true; 1837 + } 1598 1838 } 1599 1839 1600 - /* 1601 - * If we didn't reference all the roots before but now we do we 1602 - * have an exclusive reference to this range. 1603 - */ 1604 - if ((!old_roots || (old_roots && cur_old_count != old_roots)) 1605 - && cur_new_count == new_roots) { 1606 - qg->excl += num_bytes; 1607 - qg->excl_cmpr += num_bytes; 1608 - dirty = true; 1840 + /* Shared -> exclusive/none case */ 1841 + if (cur_old_count < nr_old_roots && 1842 + cur_new_count == nr_new_roots) { 1843 + /* Shared->exclusive */ 1844 + if (cur_new_count != 0) { 1845 + qg->excl += num_bytes; 1846 + qg->excl_cmpr += num_bytes; 1847 + dirty = true; 1848 + } 1609 1849 } 1610 1850 1851 + /* Exclusive/none -> exclusive/none case */ 1852 + if (cur_old_count == nr_old_roots && 1853 + cur_new_count == nr_new_roots) { 1854 + if (cur_old_count == 0) { 1855 + /* None -> exclusive/none */ 1856 + 1857 + if (cur_new_count != 0) { 1858 + /* None -> exclusive */ 1859 + qg->excl += num_bytes; 1860 + qg->excl_cmpr += num_bytes; 1861 + dirty = true; 1862 + } 1863 + /* None -> none, nothing changed */ 1864 + } else { 1865 + /* Exclusive -> exclusive/none */ 1866 + 1867 + if (cur_new_count == 0) { 1868 + /* Exclusive -> none */ 1869 + qg->excl -= num_bytes; 1870 + qg->excl_cmpr -= num_bytes; 1871 + dirty = true; 1872 + } 1873 + /* Exclusive -> exclusive, nothing changed */ 1874 + } 1875 + } 1611 1876 if (dirty) 1612 1877 
qgroup_dirty(fs_info, qg); 1613 1878 } 1614 1879 return 0; 1615 1880 } 1616 1881 1617 - /* 1618 - * If we removed a data extent and there were other references for that bytenr 1619 - * then we need to lookup all referenced roots to make sure we still don't 1620 - * reference this bytenr. If we do then we can just discard this operation. 1621 - */ 1622 - static int check_existing_refs(struct btrfs_trans_handle *trans, 1623 - struct btrfs_fs_info *fs_info, 1624 - struct btrfs_qgroup_operation *oper) 1882 + int 1883 + btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, 1884 + struct btrfs_fs_info *fs_info, 1885 + u64 bytenr, u64 num_bytes, 1886 + struct ulist *old_roots, struct ulist *new_roots) 1625 1887 { 1626 - struct ulist *roots = NULL; 1627 - struct ulist_node *unode; 1628 - struct ulist_iterator uiter; 1629 - int ret = 0; 1630 - 1631 - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, 1632 - oper->elem.seq, &roots); 1633 - if (ret < 0) 1634 - return ret; 1635 - ret = 0; 1636 - 1637 - ULIST_ITER_INIT(&uiter); 1638 - while ((unode = ulist_next(roots, &uiter))) { 1639 - if (unode->val == oper->ref_root) { 1640 - ret = 1; 1641 - break; 1642 - } 1643 - } 1644 - ulist_free(roots); 1645 - btrfs_put_tree_mod_seq(fs_info, &oper->elem); 1646 - 1647 - return ret; 1648 - } 1649 - 1650 - /* 1651 - * If we share a reference across multiple roots then we may need to adjust 1652 - * various qgroups referenced and exclusive counters. The basic premise is this 1653 - * 1654 - * 1) We have seq to represent a 0 count. Instead of looping through all of the 1655 - * qgroups and resetting their refcount to 0 we just constantly bump this 1656 - * sequence number to act as the base reference count. This means that if 1657 - * anybody is equal to or below this sequence they were never referenced. We 1658 - * jack this sequence up by the number of roots we found each time in order to 1659 - * make sure we don't have any overlap. 
1660 - * 1661 - * 2) We first search all the roots that reference the area _except_ the root 1662 - * we're acting on currently. This makes up the old_refcnt of all the qgroups 1663 - * before. 1664 - * 1665 - * 3) We walk all of the qgroups referenced by the root we are currently acting 1666 - * on, and will either adjust old_refcnt in the case of a removal or the 1667 - * new_refcnt in the case of an addition. 1668 - * 1669 - * 4) Finally we walk all the qgroups that are referenced by this range 1670 - * including the root we are acting on currently. We will adjust the counters 1671 - * based on the number of roots we had and will have after this operation. 1672 - * 1673 - * Take this example as an illustration 1674 - * 1675 - * [qgroup 1/0] 1676 - * / | \ 1677 - * [qg 0/0] [qg 0/1] [qg 0/2] 1678 - * \ | / 1679 - * [ extent ] 1680 - * 1681 - * Say we are adding a reference that is covered by qg 0/0. The first step 1682 - * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with 1683 - * old_roots being 2. Because it is adding new_roots will be 1. We then go 1684 - * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's 1685 - * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we 1686 - * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a 1687 - * reference and thus must add the size to the referenced bytes. Everything 1688 - * else is the same so nothing else changes. 
1689 - */ 1690 - static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, 1691 - struct btrfs_fs_info *fs_info, 1692 - struct btrfs_qgroup_operation *oper) 1693 - { 1694 - struct ulist *roots = NULL; 1695 - struct ulist *qgroups, *tmp; 1696 - struct btrfs_qgroup *qgroup; 1697 - struct seq_list elem = SEQ_LIST_INIT(elem); 1888 + struct ulist *qgroups = NULL; 1889 + struct ulist *tmp = NULL; 1698 1890 u64 seq; 1699 - int old_roots = 0; 1700 - int new_roots = 0; 1891 + u64 nr_new_roots = 0; 1892 + u64 nr_old_roots = 0; 1701 1893 int ret = 0; 1702 1894 1703 - if (oper->elem.seq) { 1704 - ret = check_existing_refs(trans, fs_info, oper); 1705 - if (ret < 0) 1706 - return ret; 1707 - if (ret) 1708 - return 0; 1709 - } 1710 - 1711 - qgroups = ulist_alloc(GFP_NOFS); 1712 - if (!qgroups) 1713 - return -ENOMEM; 1714 - 1715 - tmp = ulist_alloc(GFP_NOFS); 1716 - if (!tmp) { 1717 - ulist_free(qgroups); 1718 - return -ENOMEM; 1719 - } 1720 - 1721 - btrfs_get_tree_mod_seq(fs_info, &elem); 1722 - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, 1723 - &roots); 1724 - btrfs_put_tree_mod_seq(fs_info, &elem); 1725 - if (ret < 0) { 1726 - ulist_free(qgroups); 1727 - ulist_free(tmp); 1728 - return ret; 1729 - } 1730 - spin_lock(&fs_info->qgroup_lock); 1731 - qgroup = find_qgroup_rb(fs_info, oper->ref_root); 1732 - if (!qgroup) 1733 - goto out; 1734 - seq = fs_info->qgroup_seq; 1735 - 1736 - /* 1737 - * So roots is the list of all the roots currently pointing at the 1738 - * bytenr, including the ref we are adding if we are adding, or not if 1739 - * we are removing a ref. So we pass in the ref_root to skip that root 1740 - * in our calculations. We set old_refnct and new_refcnt cause who the 1741 - * hell knows what everything looked like before, and it doesn't matter 1742 - * except... 
1743 - */ 1744 - ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, 1745 - seq, &old_roots, 0); 1746 - if (ret < 0) 1747 - goto out; 1748 - 1749 - /* 1750 - * Now adjust the refcounts of the qgroups that care about this 1751 - * reference, either the old_count in the case of removal or new_count 1752 - * in the case of an addition. 1753 - */ 1754 - ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, 1755 - seq); 1756 - if (ret < 0) 1757 - goto out; 1758 - 1759 - /* 1760 - * ...in the case of removals. If we had a removal before we got around 1761 - * to processing this operation then we need to find that guy and count 1762 - * his references as if they really existed so we don't end up screwing 1763 - * up the exclusive counts. Then whenever we go to process the delete 1764 - * everything will be grand and we can account for whatever exclusive 1765 - * changes need to be made there. We also have to pass in old_roots so 1766 - * we have an accurate count of the roots as it pertains to this 1767 - * operations view of the world. 1768 - */ 1769 - ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, 1770 - &old_roots); 1771 - if (ret < 0) 1772 - goto out; 1773 - 1774 - /* 1775 - * We are adding our root, need to adjust up the number of roots, 1776 - * otherwise old_roots is the number of roots we want. 1777 - */ 1778 - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { 1779 - new_roots = old_roots + 1; 1780 - } else { 1781 - new_roots = old_roots; 1782 - old_roots++; 1783 - } 1784 - fs_info->qgroup_seq += old_roots + 1; 1785 - 1786 - 1787 - /* 1788 - * And now the magic happens, bless Arne for having a pretty elegant 1789 - * solution for this. 
1790 - */ 1791 - qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, 1792 - qgroups, seq, old_roots, new_roots, 0); 1793 - out: 1794 - spin_unlock(&fs_info->qgroup_lock); 1795 - ulist_free(qgroups); 1796 - ulist_free(roots); 1797 - ulist_free(tmp); 1798 - return ret; 1799 - } 1800 - 1801 - /* 1802 - * Process a reference to a shared subtree. This type of operation is 1803 - * queued during snapshot removal when we encounter extents which are 1804 - * shared between more than one root. 1805 - */ 1806 - static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, 1807 - struct btrfs_fs_info *fs_info, 1808 - struct btrfs_qgroup_operation *oper) 1809 - { 1810 - struct ulist *roots = NULL; 1811 - struct ulist_node *unode; 1812 - struct ulist_iterator uiter; 1813 - struct btrfs_qgroup_list *glist; 1814 - struct ulist *parents; 1815 - int ret = 0; 1816 - int err; 1817 - struct btrfs_qgroup *qg; 1818 - u64 root_obj = 0; 1819 - struct seq_list elem = SEQ_LIST_INIT(elem); 1820 - 1821 - parents = ulist_alloc(GFP_NOFS); 1822 - if (!parents) 1823 - return -ENOMEM; 1824 - 1825 - btrfs_get_tree_mod_seq(fs_info, &elem); 1826 - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, 1827 - elem.seq, &roots); 1828 - btrfs_put_tree_mod_seq(fs_info, &elem); 1829 - if (ret < 0) 1830 - goto out; 1831 - 1832 - if (roots->nnodes != 1) 1833 - goto out; 1834 - 1835 - ULIST_ITER_INIT(&uiter); 1836 - unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ 1837 - /* 1838 - * If we find our ref root then that means all refs 1839 - * this extent has to the root have not yet been 1840 - * deleted. In that case, we do nothing and let the 1841 - * last ref for this bytenr drive our update. 1842 - * 1843 - * This can happen for example if an extent is 1844 - * referenced multiple times in a snapshot (clone, 1845 - * etc). 
If we are in the middle of snapshot removal, 1846 - * queued updates for such an extent will find the 1847 - * root if we have not yet finished removing the 1848 - * snapshot. 1849 - */ 1850 - if (unode->val == oper->ref_root) 1851 - goto out; 1852 - 1853 - root_obj = unode->val; 1854 - BUG_ON(!root_obj); 1855 - 1856 - spin_lock(&fs_info->qgroup_lock); 1857 - qg = find_qgroup_rb(fs_info, root_obj); 1858 - if (!qg) 1859 - goto out_unlock; 1860 - 1861 - qg->excl += oper->num_bytes; 1862 - qg->excl_cmpr += oper->num_bytes; 1863 - qgroup_dirty(fs_info, qg); 1864 - 1865 - /* 1866 - * Adjust counts for parent groups. First we find all 1867 - * parents, then in the 2nd loop we do the adjustment 1868 - * while adding parents of the parents to our ulist. 1869 - */ 1870 - list_for_each_entry(glist, &qg->groups, next_group) { 1871 - err = ulist_add(parents, glist->group->qgroupid, 1872 - ptr_to_u64(glist->group), GFP_ATOMIC); 1873 - if (err < 0) { 1874 - ret = err; 1875 - goto out_unlock; 1876 - } 1877 - } 1878 - 1879 - ULIST_ITER_INIT(&uiter); 1880 - while ((unode = ulist_next(parents, &uiter))) { 1881 - qg = u64_to_ptr(unode->aux); 1882 - qg->excl += oper->num_bytes; 1883 - qg->excl_cmpr += oper->num_bytes; 1884 - qgroup_dirty(fs_info, qg); 1885 - 1886 - /* Add any parents of the parents */ 1887 - list_for_each_entry(glist, &qg->groups, next_group) { 1888 - err = ulist_add(parents, glist->group->qgroupid, 1889 - ptr_to_u64(glist->group), GFP_ATOMIC); 1890 - if (err < 0) { 1891 - ret = err; 1892 - goto out_unlock; 1893 - } 1894 - } 1895 - } 1896 - 1897 - out_unlock: 1898 - spin_unlock(&fs_info->qgroup_lock); 1899 - 1900 - out: 1901 - ulist_free(roots); 1902 - ulist_free(parents); 1903 - return ret; 1904 - } 1905 - 1906 - /* 1907 - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted 1908 - * from the fs. First, all roots referencing the extent are searched, and 1909 - * then the space is accounted accordingly to the different roots. 
The 1910 - * accounting algorithm works in 3 steps documented inline. 1911 - */ 1912 - static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, 1913 - struct btrfs_fs_info *fs_info, 1914 - struct btrfs_qgroup_operation *oper) 1915 - { 1916 - int ret = 0; 1895 + if (new_roots) 1896 + nr_new_roots = new_roots->nnodes; 1897 + if (old_roots) 1898 + nr_old_roots = old_roots->nnodes; 1917 1899 1918 1900 if (!fs_info->quota_enabled) 1919 - return 0; 1920 - 1901 + goto out_free; 1921 1902 BUG_ON(!fs_info->quota_root); 1903 + 1904 + qgroups = ulist_alloc(GFP_NOFS); 1905 + if (!qgroups) { 1906 + ret = -ENOMEM; 1907 + goto out_free; 1908 + } 1909 + tmp = ulist_alloc(GFP_NOFS); 1910 + if (!tmp) { 1911 + ret = -ENOMEM; 1912 + goto out_free; 1913 + } 1922 1914 1923 1915 mutex_lock(&fs_info->qgroup_rescan_lock); 1924 1916 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 1925 - if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { 1917 + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 1926 1918 mutex_unlock(&fs_info->qgroup_rescan_lock); 1927 - return 0; 1919 + ret = 0; 1920 + goto out_free; 1928 1921 } 1929 1922 } 1930 1923 mutex_unlock(&fs_info->qgroup_rescan_lock); 1931 1924 1932 - ASSERT(is_fstree(oper->ref_root)); 1925 + spin_lock(&fs_info->qgroup_lock); 1926 + seq = fs_info->qgroup_seq; 1933 1927 1934 - trace_btrfs_qgroup_account(oper); 1928 + /* Update old refcnts using old_roots */ 1929 + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, 1930 + UPDATE_OLD); 1931 + if (ret < 0) 1932 + goto out; 1935 1933 1936 - switch (oper->type) { 1937 - case BTRFS_QGROUP_OPER_ADD_EXCL: 1938 - case BTRFS_QGROUP_OPER_SUB_EXCL: 1939 - ret = qgroup_excl_accounting(fs_info, oper); 1940 - break; 1941 - case BTRFS_QGROUP_OPER_ADD_SHARED: 1942 - case BTRFS_QGROUP_OPER_SUB_SHARED: 1943 - ret = qgroup_shared_accounting(trans, fs_info, oper); 1944 - break; 1945 - case BTRFS_QGROUP_OPER_SUB_SUBTREE: 1946 - ret = 
qgroup_subtree_accounting(trans, fs_info, oper); 1947 - break; 1948 - default: 1949 - ASSERT(0); 1950 - } 1934 + /* Update new refcnts using new_roots */ 1935 + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, 1936 + UPDATE_NEW); 1937 + if (ret < 0) 1938 + goto out; 1939 + 1940 + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, 1941 + num_bytes, seq); 1942 + 1943 + /* 1944 + * Bump qgroup_seq to avoid seq overlap 1945 + */ 1946 + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 1947 + out: 1948 + spin_unlock(&fs_info->qgroup_lock); 1949 + out_free: 1950 + ulist_free(tmp); 1951 + ulist_free(qgroups); 1952 + ulist_free(old_roots); 1953 + ulist_free(new_roots); 1951 1954 return ret; 1952 1955 } 1953 1956 1954 - /* 1955 - * Needs to be called everytime we run delayed refs, even if there is an error 1956 - * in order to cleanup outstanding operations. 1957 - */ 1958 - int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, 1959 - struct btrfs_fs_info *fs_info) 1957 + int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, 1958 + struct btrfs_fs_info *fs_info) 1960 1959 { 1961 - struct btrfs_qgroup_operation *oper; 1960 + struct btrfs_qgroup_extent_record *record; 1961 + struct btrfs_delayed_ref_root *delayed_refs; 1962 + struct ulist *new_roots = NULL; 1963 + struct rb_node *node; 1964 + u64 qgroup_to_skip; 1962 1965 int ret = 0; 1963 1966 1964 - while (!list_empty(&trans->qgroup_ref_list)) { 1965 - oper = list_first_entry(&trans->qgroup_ref_list, 1966 - struct btrfs_qgroup_operation, list); 1967 - list_del_init(&oper->list); 1968 - if (!ret || !trans->aborted) 1969 - ret = btrfs_qgroup_account(trans, fs_info, oper); 1970 - spin_lock(&fs_info->qgroup_op_lock); 1971 - rb_erase(&oper->n, &fs_info->qgroup_op_tree); 1972 - spin_unlock(&fs_info->qgroup_op_lock); 1973 - btrfs_put_tree_mod_seq(fs_info, &oper->elem); 1974 - kfree(oper); 1967 + delayed_refs = &trans->transaction->delayed_refs; 1968 + 
qgroup_to_skip = delayed_refs->qgroup_to_skip; 1969 + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 1970 + record = rb_entry(node, struct btrfs_qgroup_extent_record, 1971 + node); 1972 + 1973 + if (!ret) { 1974 + /* 1975 + * Use (u64)-1 as time_seq to do special search, which 1976 + * doesn't lock tree or delayed_refs and search current 1977 + * root. It's safe inside commit_transaction(). 1978 + */ 1979 + ret = btrfs_find_all_roots(trans, fs_info, 1980 + record->bytenr, (u64)-1, &new_roots); 1981 + if (ret < 0) 1982 + goto cleanup; 1983 + if (qgroup_to_skip) 1984 + ulist_del(new_roots, qgroup_to_skip, 0); 1985 + ret = btrfs_qgroup_account_extent(trans, fs_info, 1986 + record->bytenr, record->num_bytes, 1987 + record->old_roots, new_roots); 1988 + record->old_roots = NULL; 1989 + new_roots = NULL; 1990 + } 1991 + cleanup: 1992 + ulist_free(record->old_roots); 1993 + ulist_free(new_roots); 1994 + new_roots = NULL; 1995 + rb_erase(node, &delayed_refs->dirty_extent_root); 1996 + kfree(record); 1997 + 1975 1998 } 1976 1999 return ret; 1977 2000 } ··· 2150 2637 */ 2151 2638 static int 2152 2639 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 2153 - struct btrfs_trans_handle *trans, struct ulist *qgroups, 2154 - struct ulist *tmp, struct extent_buffer *scratch_leaf) 2640 + struct btrfs_trans_handle *trans, 2641 + struct extent_buffer *scratch_leaf) 2155 2642 { 2156 2643 struct btrfs_key found; 2157 2644 struct ulist *roots = NULL; 2158 2645 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); 2159 2646 u64 num_bytes; 2160 - u64 seq; 2161 - int new_roots; 2162 2647 int slot; 2163 2648 int ret; 2164 2649 ··· 2206 2695 else 2207 2696 num_bytes = found.offset; 2208 2697 2209 - ulist_reinit(qgroups); 2210 2698 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, 2211 2699 &roots); 2212 2700 if (ret < 0) 2213 2701 goto out; 2214 - spin_lock(&fs_info->qgroup_lock); 2215 - seq = fs_info->qgroup_seq; 2216 - 
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 2217 - 2218 - new_roots = 0; 2219 - ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, 2220 - seq, &new_roots, 1); 2221 - if (ret < 0) { 2222 - spin_unlock(&fs_info->qgroup_lock); 2223 - ulist_free(roots); 2702 + /* For rescan, just pass old_roots as NULL */ 2703 + ret = btrfs_qgroup_account_extent(trans, fs_info, 2704 + found.objectid, num_bytes, NULL, roots); 2705 + if (ret < 0) 2224 2706 goto out; 2225 - } 2226 - 2227 - ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, 2228 - seq, 0, new_roots, 1); 2229 - if (ret < 0) { 2230 - spin_unlock(&fs_info->qgroup_lock); 2231 - ulist_free(roots); 2232 - goto out; 2233 - } 2234 - spin_unlock(&fs_info->qgroup_lock); 2235 - ulist_free(roots); 2236 2707 } 2237 2708 out: 2238 2709 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); ··· 2228 2735 qgroup_rescan_work); 2229 2736 struct btrfs_path *path; 2230 2737 struct btrfs_trans_handle *trans = NULL; 2231 - struct ulist *tmp = NULL, *qgroups = NULL; 2232 2738 struct extent_buffer *scratch_leaf = NULL; 2233 2739 int err = -ENOMEM; 2234 2740 int ret = 0; 2235 2741 2236 2742 path = btrfs_alloc_path(); 2237 2743 if (!path) 2238 - goto out; 2239 - qgroups = ulist_alloc(GFP_NOFS); 2240 - if (!qgroups) 2241 - goto out; 2242 - tmp = ulist_alloc(GFP_NOFS); 2243 - if (!tmp) 2244 2744 goto out; 2245 2745 scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); 2246 2746 if (!scratch_leaf) ··· 2250 2764 err = -EINTR; 2251 2765 } else { 2252 2766 err = qgroup_rescan_leaf(fs_info, path, trans, 2253 - qgroups, tmp, scratch_leaf); 2767 + scratch_leaf); 2254 2768 } 2255 2769 if (err > 0) 2256 2770 btrfs_commit_transaction(trans, fs_info->fs_root); ··· 2260 2774 2261 2775 out: 2262 2776 kfree(scratch_leaf); 2263 - ulist_free(qgroups); 2264 - ulist_free(tmp); 2265 2777 btrfs_free_path(path); 2266 2778 2267 2779 mutex_lock(&fs_info->qgroup_rescan_lock);
+20 -43
fs/btrfs/qgroup.h
··· 19 19 #ifndef __BTRFS_QGROUP__ 20 20 #define __BTRFS_QGROUP__ 21 21 22 - /* 23 - * A description of the operations, all of these operations only happen when we 24 - * are adding the 1st reference for that subvolume in the case of adding space 25 - * or on the last reference delete in the case of subtraction. The only 26 - * exception is the last one, which is added for confusion. 27 - * 28 - * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only 29 - * one pointing at the bytes we are adding. This is called on the first 30 - * allocation. 31 - * 32 - * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be 33 - * shared between subvols. This is called on the creation of a ref that already 34 - * has refs from a different subvolume, so basically reflink. 35 - * 36 - * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only 37 - * one referencing the range. 38 - * 39 - * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with 40 - * refs with other subvolumes. 41 - */ 42 - enum btrfs_qgroup_operation_type { 43 - BTRFS_QGROUP_OPER_ADD_EXCL, 44 - BTRFS_QGROUP_OPER_ADD_SHARED, 45 - BTRFS_QGROUP_OPER_SUB_EXCL, 46 - BTRFS_QGROUP_OPER_SUB_SHARED, 47 - BTRFS_QGROUP_OPER_SUB_SUBTREE, 48 - }; 22 + #include "ulist.h" 23 + #include "delayed-ref.h" 49 24 50 - struct btrfs_qgroup_operation { 51 - u64 ref_root; 25 + /* 26 + * Record a dirty extent, and info qgroup to update quota on it 27 + * TODO: Use kmem cache to alloc it. 
28 + */ 29 + struct btrfs_qgroup_extent_record { 30 + struct rb_node node; 52 31 u64 bytenr; 53 32 u64 num_bytes; 54 - u64 seq; 55 - enum btrfs_qgroup_operation_type type; 56 - struct seq_list elem; 57 - struct rb_node n; 58 - struct list_head list; 33 + struct ulist *old_roots; 59 34 }; 60 35 61 36 int btrfs_quota_enable(struct btrfs_trans_handle *trans, ··· 54 79 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); 55 80 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); 56 81 struct btrfs_delayed_extent_op; 57 - int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, 58 - struct btrfs_fs_info *fs_info, u64 ref_root, 82 + int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, 83 + struct btrfs_fs_info *fs_info); 84 + struct btrfs_qgroup_extent_record 85 + *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, 86 + struct btrfs_qgroup_extent_record *record); 87 + int 88 + btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, 89 + struct btrfs_fs_info *fs_info, 59 90 u64 bytenr, u64 num_bytes, 60 - enum btrfs_qgroup_operation_type type, 61 - int mod_seq); 62 - int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, 63 - struct btrfs_fs_info *fs_info); 64 - void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, 65 - struct btrfs_fs_info *fs_info, 66 - struct btrfs_qgroup_operation *oper); 91 + struct ulist *old_roots, struct ulist *new_roots); 92 + int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, 93 + struct btrfs_fs_info *fs_info); 67 94 int btrfs_run_qgroups(struct btrfs_trans_handle *trans, 68 95 struct btrfs_fs_info *fs_info); 69 96 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+14 -5
fs/btrfs/relocation.c
··· 1847 1847 } 1848 1848 1849 1849 eb = read_tree_block(dest, old_bytenr, old_ptr_gen); 1850 - if (!eb || !extent_buffer_uptodate(eb)) { 1851 - ret = (!eb) ? -ENOMEM : -EIO; 1850 + if (IS_ERR(eb)) { 1851 + ret = PTR_ERR(eb); 1852 + } else if (!extent_buffer_uptodate(eb)) { 1853 + ret = -EIO; 1852 1854 free_extent_buffer(eb); 1853 1855 break; 1854 1856 } ··· 2004 2002 2005 2003 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2006 2004 eb = read_tree_block(root, bytenr, ptr_gen); 2007 - if (!eb || !extent_buffer_uptodate(eb)) { 2005 + if (IS_ERR(eb)) { 2006 + return PTR_ERR(eb); 2007 + } else if (!extent_buffer_uptodate(eb)) { 2008 2008 free_extent_buffer(eb); 2009 2009 return -EIO; 2010 2010 } ··· 2714 2710 blocksize = root->nodesize; 2715 2711 generation = btrfs_node_ptr_generation(upper->eb, slot); 2716 2712 eb = read_tree_block(root, bytenr, generation); 2717 - if (!eb || !extent_buffer_uptodate(eb)) { 2713 + if (IS_ERR(eb)) { 2714 + err = PTR_ERR(eb); 2715 + goto next; 2716 + } else if (!extent_buffer_uptodate(eb)) { 2718 2717 free_extent_buffer(eb); 2719 2718 err = -EIO; 2720 2719 goto next; ··· 2880 2873 BUG_ON(block->key_ready); 2881 2874 eb = read_tree_block(rc->extent_root, block->bytenr, 2882 2875 block->key.offset); 2883 - if (!eb || !extent_buffer_uptodate(eb)) { 2876 + if (IS_ERR(eb)) { 2877 + return PTR_ERR(eb); 2878 + } else if (!extent_buffer_uptodate(eb)) { 2884 2879 free_extent_buffer(eb); 2885 2880 return -EIO; 2886 2881 }
+23 -3
fs/btrfs/scrub.c
··· 2662 2662 kfree(sparity); 2663 2663 } 2664 2664 2665 + static void scrub_parity_bio_endio_worker(struct btrfs_work *work) 2666 + { 2667 + struct scrub_parity *sparity = container_of(work, struct scrub_parity, 2668 + work); 2669 + struct scrub_ctx *sctx = sparity->sctx; 2670 + 2671 + scrub_free_parity(sparity); 2672 + scrub_pending_bio_dec(sctx); 2673 + } 2674 + 2665 2675 static void scrub_parity_bio_endio(struct bio *bio, int error) 2666 2676 { 2667 2677 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; 2668 - struct scrub_ctx *sctx = sparity->sctx; 2669 2678 2670 2679 if (error) 2671 2680 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, 2672 2681 sparity->nsectors); 2673 2682 2674 - scrub_free_parity(sparity); 2675 - scrub_pending_bio_dec(sctx); 2676 2683 bio_put(bio); 2684 + 2685 + btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, 2686 + scrub_parity_bio_endio_worker, NULL, NULL); 2687 + btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, 2688 + &sparity->work); 2677 2689 } 2678 2690 2679 2691 static void scrub_parity_check_and_repair(struct scrub_parity *sparity) ··· 3601 3589 ret = -ENOMEM; 3602 3590 goto out; 3603 3591 } 3592 + fs_info->scrub_parity_workers = 3593 + btrfs_alloc_workqueue("btrfs-scrubparity", flags, 3594 + max_active, 2); 3595 + if (!fs_info->scrub_parity_workers) { 3596 + ret = -ENOMEM; 3597 + goto out; 3598 + } 3604 3599 } 3605 3600 ++fs_info->scrub_workers_refcnt; 3606 3601 out: ··· 3620 3601 btrfs_destroy_workqueue(fs_info->scrub_workers); 3621 3602 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); 3622 3603 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); 3604 + btrfs_destroy_workqueue(fs_info->scrub_parity_workers); 3623 3605 } 3624 3606 WARN_ON(fs_info->scrub_workers_refcnt < 0); 3625 3607 }
+121 -26
fs/btrfs/send.c
··· 243 243 * after this directory is moved, we can try to rmdir the ino rmdir_ino. 244 244 */ 245 245 u64 rmdir_ino; 246 + bool orphanized; 246 247 }; 247 248 248 249 struct orphan_dir_info { ··· 1159 1158 /* may be truncated in case it's the last extent in a file */ 1160 1159 u64 extent_len; 1161 1160 1161 + /* data offset in the file extent item */ 1162 + u64 data_offset; 1163 + 1162 1164 /* Just to check for bugs in backref resolving */ 1163 1165 int found_itself; 1164 1166 }; ··· 1225 1221 if (ret < 0) 1226 1222 return ret; 1227 1223 1228 - if (offset + bctx->extent_len > i_size) 1224 + if (offset + bctx->data_offset + bctx->extent_len > i_size) 1229 1225 return 0; 1230 1226 1231 1227 /* ··· 1367 1363 backref_ctx->cur_offset = data_offset; 1368 1364 backref_ctx->found_itself = 0; 1369 1365 backref_ctx->extent_len = num_bytes; 1366 + /* 1367 + * For non-compressed extents iterate_extent_inodes() gives us extent 1368 + * offsets that already take into account the data offset, but not for 1369 + * compressed extents, since the offset is logical and not relative to 1370 + * the physical extent locations. We must take this into account to 1371 + * avoid sending clone offsets that go beyond the source file's size, 1372 + * which would result in the clone ioctl failing with -EINVAL on the 1373 + * receiving end. 1374 + */ 1375 + if (compressed == BTRFS_COMPRESS_NONE) 1376 + backref_ctx->data_offset = 0; 1377 + else 1378 + backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); 1370 1379 1371 1380 /* 1372 1381 * The last extent of a file may be too large due to page alignment. ··· 1917 1900 goto out; 1918 1901 } 1919 1902 1920 - /* we know that it is or will be overwritten. check this now */ 1921 - if (ow_inode < sctx->send_progress) 1903 + /* 1904 + * We know that it is or will be overwritten. Check this now. 
1905 + * The current inode being processed might have been the one that caused 1906 + * inode 'ino' to be orphanized, therefore ow_inode can actually be the 1907 + * same as sctx->send_progress. 1908 + */ 1909 + if (ow_inode <= sctx->send_progress) 1922 1910 ret = 1; 1923 1911 else 1924 1912 ret = 0; ··· 2245 2223 fs_path_reset(dest); 2246 2224 2247 2225 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2226 + struct waiting_dir_move *wdm; 2227 + 2248 2228 fs_path_reset(name); 2249 2229 2250 2230 if (is_waiting_for_rm(sctx, ino)) { ··· 2257 2233 break; 2258 2234 } 2259 2235 2260 - if (is_waiting_for_move(sctx, ino)) { 2236 + wdm = get_waiting_dir_move(sctx, ino); 2237 + if (wdm && wdm->orphanized) { 2238 + ret = gen_unique_name(sctx, ino, gen, name); 2239 + stop = 1; 2240 + } else if (wdm) { 2261 2241 ret = get_first_ref(sctx->parent_root, ino, 2262 2242 &parent_inode, &parent_gen, name); 2263 2243 } else { ··· 2356 2328 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, 2357 2329 le64_to_cpu(sctx->send_root->root_item.ctransid)); 2358 2330 if (parent_root) { 2359 - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2360 - sctx->parent_root->root_item.uuid); 2331 + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) 2332 + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2333 + parent_root->root_item.received_uuid); 2334 + else 2335 + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2336 + parent_root->root_item.uuid); 2361 2337 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 2362 2338 le64_to_cpu(sctx->parent_root->root_item.ctransid)); 2363 2339 } ··· 2955 2923 return entry != NULL; 2956 2924 } 2957 2925 2958 - static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2926 + static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) 2959 2927 { 2960 2928 struct rb_node **p = &sctx->waiting_dir_moves.rb_node; 2961 2929 struct rb_node *parent = NULL; ··· 2966 2934 return -ENOMEM; 2967 2935 dm->ino = ino; 2968 2936 dm->rmdir_ino = 0; 2937 + 
dm->orphanized = orphanized; 2969 2938 2970 2939 while (*p) { 2971 2940 parent = *p; ··· 3063 3030 goto out; 3064 3031 } 3065 3032 3066 - ret = add_waiting_dir_move(sctx, pm->ino); 3033 + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); 3067 3034 if (ret) 3068 3035 goto out; 3069 3036 ··· 3386 3353 return ret; 3387 3354 } 3388 3355 3356 + /* 3357 + * Check if ino ino1 is an ancestor of inode ino2 in the given root. 3358 + * Return 1 if true, 0 if false and < 0 on error. 3359 + */ 3360 + static int is_ancestor(struct btrfs_root *root, 3361 + const u64 ino1, 3362 + const u64 ino1_gen, 3363 + const u64 ino2, 3364 + struct fs_path *fs_path) 3365 + { 3366 + u64 ino = ino2; 3367 + 3368 + while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3369 + int ret; 3370 + u64 parent; 3371 + u64 parent_gen; 3372 + 3373 + fs_path_reset(fs_path); 3374 + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); 3375 + if (ret < 0) { 3376 + if (ret == -ENOENT && ino == ino2) 3377 + ret = 0; 3378 + return ret; 3379 + } 3380 + if (parent == ino1) 3381 + return parent_gen == ino1_gen ? 1 : 0; 3382 + ino = parent; 3383 + } 3384 + return 0; 3385 + } 3386 + 3389 3387 static int wait_for_parent_move(struct send_ctx *sctx, 3390 - struct recorded_ref *parent_ref) 3388 + struct recorded_ref *parent_ref, 3389 + const bool is_orphan) 3391 3390 { 3392 3391 int ret = 0; 3393 3392 u64 ino = parent_ref->dir; ··· 3439 3374 * Our current directory inode may not yet be renamed/moved because some 3440 3375 * ancestor (immediate or not) has to be renamed/moved first. So find if 3441 3376 * such ancestor exists and make sure our own rename/move happens after 3442 - * that ancestor is processed. 3377 + * that ancestor is processed to avoid path build infinite loops (done 3378 + * at get_cur_path()). 
3443 3379 */ 3444 3380 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3445 3381 if (is_waiting_for_move(sctx, ino)) { 3446 - ret = 1; 3382 + /* 3383 + * If the current inode is an ancestor of ino in the 3384 + * parent root, we need to delay the rename of the 3385 + * current inode, otherwise don't delayed the rename 3386 + * because we can end up with a circular dependency 3387 + * of renames, resulting in some directories never 3388 + * getting the respective rename operations issued in 3389 + * the send stream or getting into infinite path build 3390 + * loops. 3391 + */ 3392 + ret = is_ancestor(sctx->parent_root, 3393 + sctx->cur_ino, sctx->cur_inode_gen, 3394 + ino, path_before); 3447 3395 break; 3448 3396 } 3449 3397 ··· 3498 3420 ino, 3499 3421 &sctx->new_refs, 3500 3422 &sctx->deleted_refs, 3501 - false); 3423 + is_orphan); 3502 3424 if (!ret) 3503 3425 ret = 1; 3504 3426 } ··· 3667 3589 } 3668 3590 } 3669 3591 3592 + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && 3593 + can_rename) { 3594 + ret = wait_for_parent_move(sctx, cur, is_orphan); 3595 + if (ret < 0) 3596 + goto out; 3597 + if (ret == 1) { 3598 + can_rename = false; 3599 + *pending_move = 1; 3600 + } 3601 + } 3602 + 3670 3603 /* 3671 3604 * link/move the ref to the new place. If we have an orphan 3672 3605 * inode, move it and update valid_path. If not, link or move ··· 3698 3609 * dirs, we always have one new and one deleted 3699 3610 * ref. The deleted ref is ignored later. 
3700 3611 */ 3701 - ret = wait_for_parent_move(sctx, cur); 3702 - if (ret < 0) 3703 - goto out; 3704 - if (ret) { 3705 - *pending_move = 1; 3706 - } else { 3707 - ret = send_rename(sctx, valid_path, 3708 - cur->full_path); 3709 - if (!ret) 3710 - ret = fs_path_copy(valid_path, 3711 - cur->full_path); 3712 - } 3612 + ret = send_rename(sctx, valid_path, 3613 + cur->full_path); 3614 + if (!ret) 3615 + ret = fs_path_copy(valid_path, 3616 + cur->full_path); 3713 3617 if (ret < 0) 3714 3618 goto out; 3715 3619 } else { ··· 4590 4508 if (ret < 0) 4591 4509 goto out; 4592 4510 4593 - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 4594 - clone_root->root->root_item.uuid); 4511 + /* 4512 + * If the parent we're using has a received_uuid set then use that as 4513 + * our clone source as that is what we will look for when doing a 4514 + * receive. 4515 + * 4516 + * This covers the case that we create a snapshot off of a received 4517 + * subvolume and then use that as the parent and try to receive on a 4518 + * different host. 4519 + */ 4520 + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) 4521 + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 4522 + clone_root->root->root_item.received_uuid); 4523 + else 4524 + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 4525 + clone_root->root->root_item.uuid); 4595 4526 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 4596 4527 le64_to_cpu(clone_root->root->root_item.ctransid)); 4597 4528 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
+260 -149
fs/btrfs/super.c
··· 135 135 * __btrfs_std_error decodes expected errors from the caller and 136 136 * invokes the approciate error response. 137 137 */ 138 + __cold 138 139 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, 139 140 unsigned int line, int errno, const char *fmt, ...) 140 141 { ··· 248 247 * We'll complete the cleanup in btrfs_end_transaction and 249 248 * btrfs_commit_transaction. 250 249 */ 250 + __cold 251 251 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, 252 252 struct btrfs_root *root, const char *function, 253 253 unsigned int line, int errno) 254 254 { 255 - /* 256 - * Report first abort since mount 257 - */ 258 - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, 259 - &root->fs_info->fs_state)) { 260 - WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", 261 - errno); 262 - } 263 255 trans->aborted = errno; 264 256 /* Nothing used. The other threads that have joined this 265 257 * transaction may be able to continue. */ ··· 275 281 * __btrfs_panic decodes unexpected, fatal errors from the caller, 276 282 * issues an alert, and either panics or BUGs, depending on mount options. 277 283 */ 284 + __cold 278 285 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, 279 286 unsigned int line, int errno, const char *fmt, ...) 
280 287 { ··· 836 841 return error; 837 842 } 838 843 839 - static struct dentry *get_default_root(struct super_block *sb, 840 - u64 subvol_objectid) 844 + static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, 845 + u64 subvol_objectid) 841 846 { 842 - struct btrfs_fs_info *fs_info = btrfs_sb(sb); 843 847 struct btrfs_root *root = fs_info->tree_root; 844 - struct btrfs_root *new_root; 848 + struct btrfs_root *fs_root; 849 + struct btrfs_root_ref *root_ref; 850 + struct btrfs_inode_ref *inode_ref; 851 + struct btrfs_key key; 852 + struct btrfs_path *path = NULL; 853 + char *name = NULL, *ptr; 854 + u64 dirid; 855 + int len; 856 + int ret; 857 + 858 + path = btrfs_alloc_path(); 859 + if (!path) { 860 + ret = -ENOMEM; 861 + goto err; 862 + } 863 + path->leave_spinning = 1; 864 + 865 + name = kmalloc(PATH_MAX, GFP_NOFS); 866 + if (!name) { 867 + ret = -ENOMEM; 868 + goto err; 869 + } 870 + ptr = name + PATH_MAX - 1; 871 + ptr[0] = '\0'; 872 + 873 + /* 874 + * Walk up the subvolume trees in the tree of tree roots by root 875 + * backrefs until we hit the top-level subvolume. 
876 + */ 877 + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { 878 + key.objectid = subvol_objectid; 879 + key.type = BTRFS_ROOT_BACKREF_KEY; 880 + key.offset = (u64)-1; 881 + 882 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 883 + if (ret < 0) { 884 + goto err; 885 + } else if (ret > 0) { 886 + ret = btrfs_previous_item(root, path, subvol_objectid, 887 + BTRFS_ROOT_BACKREF_KEY); 888 + if (ret < 0) { 889 + goto err; 890 + } else if (ret > 0) { 891 + ret = -ENOENT; 892 + goto err; 893 + } 894 + } 895 + 896 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 897 + subvol_objectid = key.offset; 898 + 899 + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 900 + struct btrfs_root_ref); 901 + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); 902 + ptr -= len + 1; 903 + if (ptr < name) { 904 + ret = -ENAMETOOLONG; 905 + goto err; 906 + } 907 + read_extent_buffer(path->nodes[0], ptr + 1, 908 + (unsigned long)(root_ref + 1), len); 909 + ptr[0] = '/'; 910 + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); 911 + btrfs_release_path(path); 912 + 913 + key.objectid = subvol_objectid; 914 + key.type = BTRFS_ROOT_ITEM_KEY; 915 + key.offset = (u64)-1; 916 + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); 917 + if (IS_ERR(fs_root)) { 918 + ret = PTR_ERR(fs_root); 919 + goto err; 920 + } 921 + 922 + /* 923 + * Walk up the filesystem tree by inode refs until we hit the 924 + * root directory. 
925 + */ 926 + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { 927 + key.objectid = dirid; 928 + key.type = BTRFS_INODE_REF_KEY; 929 + key.offset = (u64)-1; 930 + 931 + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 932 + if (ret < 0) { 933 + goto err; 934 + } else if (ret > 0) { 935 + ret = btrfs_previous_item(fs_root, path, dirid, 936 + BTRFS_INODE_REF_KEY); 937 + if (ret < 0) { 938 + goto err; 939 + } else if (ret > 0) { 940 + ret = -ENOENT; 941 + goto err; 942 + } 943 + } 944 + 945 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 946 + dirid = key.offset; 947 + 948 + inode_ref = btrfs_item_ptr(path->nodes[0], 949 + path->slots[0], 950 + struct btrfs_inode_ref); 951 + len = btrfs_inode_ref_name_len(path->nodes[0], 952 + inode_ref); 953 + ptr -= len + 1; 954 + if (ptr < name) { 955 + ret = -ENAMETOOLONG; 956 + goto err; 957 + } 958 + read_extent_buffer(path->nodes[0], ptr + 1, 959 + (unsigned long)(inode_ref + 1), len); 960 + ptr[0] = '/'; 961 + btrfs_release_path(path); 962 + } 963 + } 964 + 965 + btrfs_free_path(path); 966 + if (ptr == name + PATH_MAX - 1) { 967 + name[0] = '/'; 968 + name[1] = '\0'; 969 + } else { 970 + memmove(name, ptr, name + PATH_MAX - ptr); 971 + } 972 + return name; 973 + 974 + err: 975 + btrfs_free_path(path); 976 + kfree(name); 977 + return ERR_PTR(ret); 978 + } 979 + 980 + static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) 981 + { 982 + struct btrfs_root *root = fs_info->tree_root; 845 983 struct btrfs_dir_item *di; 846 984 struct btrfs_path *path; 847 985 struct btrfs_key location; 848 - struct inode *inode; 849 986 u64 dir_id; 850 - int new = 0; 851 - 852 - /* 853 - * We have a specific subvol we want to mount, just setup location and 854 - * go look up the root. 
855 - */ 856 - if (subvol_objectid) { 857 - location.objectid = subvol_objectid; 858 - location.type = BTRFS_ROOT_ITEM_KEY; 859 - location.offset = (u64)-1; 860 - goto find_root; 861 - } 862 987 863 988 path = btrfs_alloc_path(); 864 989 if (!path) 865 - return ERR_PTR(-ENOMEM); 990 + return -ENOMEM; 866 991 path->leave_spinning = 1; 867 992 868 993 /* ··· 994 879 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 995 880 if (IS_ERR(di)) { 996 881 btrfs_free_path(path); 997 - return ERR_CAST(di); 882 + return PTR_ERR(di); 998 883 } 999 884 if (!di) { 1000 885 /* 1001 886 * Ok the default dir item isn't there. This is weird since 1002 887 * it's always been there, but don't freak out, just try and 1003 - * mount to root most subvolume. 888 + * mount the top-level subvolume. 1004 889 */ 1005 890 btrfs_free_path(path); 1006 - dir_id = BTRFS_FIRST_FREE_OBJECTID; 1007 - new_root = fs_info->fs_root; 1008 - goto setup_root; 891 + *objectid = BTRFS_FS_TREE_OBJECTID; 892 + return 0; 1009 893 } 1010 894 1011 895 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 1012 896 btrfs_free_path(path); 1013 - 1014 - find_root: 1015 - new_root = btrfs_read_fs_root_no_name(fs_info, &location); 1016 - if (IS_ERR(new_root)) 1017 - return ERR_CAST(new_root); 1018 - 1019 - if (!(sb->s_flags & MS_RDONLY)) { 1020 - int ret; 1021 - down_read(&fs_info->cleanup_work_sem); 1022 - ret = btrfs_orphan_cleanup(new_root); 1023 - up_read(&fs_info->cleanup_work_sem); 1024 - if (ret) 1025 - return ERR_PTR(ret); 1026 - } 1027 - 1028 - dir_id = btrfs_root_dirid(&new_root->root_item); 1029 - setup_root: 1030 - location.objectid = dir_id; 1031 - location.type = BTRFS_INODE_ITEM_KEY; 1032 - location.offset = 0; 1033 - 1034 - inode = btrfs_iget(sb, &location, new_root, &new); 1035 - if (IS_ERR(inode)) 1036 - return ERR_CAST(inode); 1037 - 1038 - /* 1039 - * If we're just mounting the root most subvol put the inode and return 1040 - * a reference to the dentry. 
We will have already gotten a reference 1041 - * to the inode in btrfs_fill_super so we're good to go. 1042 - */ 1043 - if (!new && d_inode(sb->s_root) == inode) { 1044 - iput(inode); 1045 - return dget(sb->s_root); 1046 - } 1047 - 1048 - return d_obtain_root(inode); 897 + *objectid = location.objectid; 898 + return 0; 1049 899 } 1050 900 1051 901 static int btrfs_fill_super(struct super_block *sb, ··· 1188 1108 seq_puts(seq, ",fatal_errors=panic"); 1189 1109 if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) 1190 1110 seq_printf(seq, ",commit=%d", info->commit_interval); 1111 + seq_printf(seq, ",subvolid=%llu", 1112 + BTRFS_I(d_inode(dentry))->root->root_key.objectid); 1113 + seq_puts(seq, ",subvol="); 1114 + seq_dentry(seq, dentry, " \t\n\\"); 1191 1115 return 0; 1192 1116 } 1193 1117 ··· 1222 1138 } 1223 1139 1224 1140 /* 1225 - * This will strip out the subvol=%s argument for an argument string and add 1226 - * subvolid=0 to make sure we get the actual tree root for path walking to the 1227 - * subvol we want. 1141 + * This will add subvolid=0 to the argument string while removing any subvol= 1142 + * and subvolid= arguments to make sure we get the top-level root for path 1143 + * walking to the subvol we want. 1228 1144 */ 1229 1145 static char *setup_root_args(char *args) 1230 1146 { 1231 - unsigned len = strlen(args) + 2 + 1; 1232 - char *src, *dst, *buf; 1147 + char *buf, *dst, *sep; 1233 1148 1234 - /* 1235 - * We need the same args as before, but with this substitution: 1236 - * s!subvol=[^,]+!subvolid=0! 1237 - * 1238 - * Since the replacement string is up to 2 bytes longer than the 1239 - * original, allocate strlen(args) + 2 + 1 bytes. 1240 - */ 1149 + if (!args) 1150 + return kstrdup("subvolid=0", GFP_NOFS); 1241 1151 1242 - src = strstr(args, "subvol="); 1243 - /* This shouldn't happen, but just in case.. 
*/ 1244 - if (!src) 1245 - return NULL; 1246 - 1247 - buf = dst = kmalloc(len, GFP_NOFS); 1152 + /* The worst case is that we add ",subvolid=0" to the end. */ 1153 + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); 1248 1154 if (!buf) 1249 1155 return NULL; 1250 1156 1251 - /* 1252 - * If the subvol= arg is not at the start of the string, 1253 - * copy whatever precedes it into buf. 1254 - */ 1255 - if (src != args) { 1256 - *src++ = '\0'; 1257 - strcpy(buf, args); 1258 - dst += strlen(args); 1157 + while (1) { 1158 + sep = strchrnul(args, ','); 1159 + if (!strstarts(args, "subvol=") && 1160 + !strstarts(args, "subvolid=")) { 1161 + memcpy(dst, args, sep - args); 1162 + dst += sep - args; 1163 + *dst++ = ','; 1164 + } 1165 + if (*sep) 1166 + args = sep + 1; 1167 + else 1168 + break; 1259 1169 } 1260 - 1261 1170 strcpy(dst, "subvolid=0"); 1262 - dst += strlen("subvolid=0"); 1263 - 1264 - /* 1265 - * If there is a "," after the original subvol=... string, 1266 - * copy that suffix into our buffer. Otherwise, we're done. 
1267 - */ 1268 - src = strchr(src, ','); 1269 - if (src) 1270 - strcpy(dst, src); 1271 1171 1272 1172 return buf; 1273 1173 } 1274 1174 1275 - static struct dentry *mount_subvol(const char *subvol_name, int flags, 1276 - const char *device_name, char *data) 1175 + static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, 1176 + int flags, const char *device_name, 1177 + char *data) 1277 1178 { 1278 1179 struct dentry *root; 1279 - struct vfsmount *mnt; 1180 + struct vfsmount *mnt = NULL; 1280 1181 char *newargs; 1182 + int ret; 1281 1183 1282 1184 newargs = setup_root_args(data); 1283 - if (!newargs) 1284 - return ERR_PTR(-ENOMEM); 1285 - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, 1286 - newargs); 1185 + if (!newargs) { 1186 + root = ERR_PTR(-ENOMEM); 1187 + goto out; 1188 + } 1287 1189 1288 - if (PTR_RET(mnt) == -EBUSY) { 1190 + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); 1191 + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { 1289 1192 if (flags & MS_RDONLY) { 1290 - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, 1291 - newargs); 1193 + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, 1194 + device_name, newargs); 1292 1195 } else { 1293 - int r; 1294 - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, 1295 - newargs); 1196 + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, 1197 + device_name, newargs); 1296 1198 if (IS_ERR(mnt)) { 1297 - kfree(newargs); 1298 - return ERR_CAST(mnt); 1199 + root = ERR_CAST(mnt); 1200 + mnt = NULL; 1201 + goto out; 1299 1202 } 1300 1203 1301 - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); 1302 - if (r < 0) { 1303 - /* FIXME: release vfsmount mnt ??*/ 1304 - kfree(newargs); 1305 - return ERR_PTR(r); 1204 + down_write(&mnt->mnt_sb->s_umount); 1205 + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); 1206 + up_write(&mnt->mnt_sb->s_umount); 1207 + if (ret < 0) { 1208 + root = ERR_PTR(ret); 1209 + goto out; 1306 1210 } 1307 1211 } 
1308 1212 } 1309 - 1310 - kfree(newargs); 1311 - 1312 - if (IS_ERR(mnt)) 1313 - return ERR_CAST(mnt); 1314 - 1315 - root = mount_subtree(mnt, subvol_name); 1316 - 1317 - if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { 1318 - struct super_block *s = root->d_sb; 1319 - dput(root); 1320 - root = ERR_PTR(-EINVAL); 1321 - deactivate_locked_super(s); 1322 - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", 1323 - subvol_name); 1213 + if (IS_ERR(mnt)) { 1214 + root = ERR_CAST(mnt); 1215 + mnt = NULL; 1216 + goto out; 1324 1217 } 1325 1218 1219 + if (!subvol_name) { 1220 + if (!subvol_objectid) { 1221 + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), 1222 + &subvol_objectid); 1223 + if (ret) { 1224 + root = ERR_PTR(ret); 1225 + goto out; 1226 + } 1227 + } 1228 + subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), 1229 + subvol_objectid); 1230 + if (IS_ERR(subvol_name)) { 1231 + root = ERR_CAST(subvol_name); 1232 + subvol_name = NULL; 1233 + goto out; 1234 + } 1235 + 1236 + } 1237 + 1238 + root = mount_subtree(mnt, subvol_name); 1239 + /* mount_subtree() drops our reference on the vfsmount. */ 1240 + mnt = NULL; 1241 + 1242 + if (!IS_ERR(root)) { 1243 + struct super_block *s = root->d_sb; 1244 + struct inode *root_inode = d_inode(root); 1245 + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; 1246 + 1247 + ret = 0; 1248 + if (!is_subvolume_inode(root_inode)) { 1249 + pr_err("BTRFS: '%s' is not a valid subvolume\n", 1250 + subvol_name); 1251 + ret = -EINVAL; 1252 + } 1253 + if (subvol_objectid && root_objectid != subvol_objectid) { 1254 + /* 1255 + * This will also catch a race condition where a 1256 + * subvolume which was passed by ID is renamed and 1257 + * another subvolume is renamed over the old location. 
1258 + */ 1259 + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", 1260 + subvol_name, subvol_objectid); 1261 + ret = -EINVAL; 1262 + } 1263 + if (ret) { 1264 + dput(root); 1265 + root = ERR_PTR(ret); 1266 + deactivate_locked_super(s); 1267 + } 1268 + } 1269 + 1270 + out: 1271 + mntput(mnt); 1272 + kfree(newargs); 1273 + kfree(subvol_name); 1326 1274 return root; 1327 1275 } 1328 1276 ··· 1419 1303 { 1420 1304 struct block_device *bdev = NULL; 1421 1305 struct super_block *s; 1422 - struct dentry *root; 1423 1306 struct btrfs_fs_devices *fs_devices = NULL; 1424 1307 struct btrfs_fs_info *fs_info = NULL; 1425 1308 struct security_mnt_opts new_sec_opts; ··· 1438 1323 return ERR_PTR(error); 1439 1324 } 1440 1325 1441 - if (subvol_name) { 1442 - root = mount_subvol(subvol_name, flags, device_name, data); 1443 - kfree(subvol_name); 1444 - return root; 1326 + if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { 1327 + /* mount_subvol() will free subvol_name. */ 1328 + return mount_subvol(subvol_name, subvol_objectid, flags, 1329 + device_name, data); 1445 1330 } 1446 1331 1447 1332 security_init_mnt_opts(&new_sec_opts); ··· 1507 1392 error = btrfs_fill_super(s, fs_devices, data, 1508 1393 flags & MS_SILENT ? 1 : 0); 1509 1394 } 1510 - 1511 - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); 1512 - if (IS_ERR(root)) { 1395 + if (error) { 1513 1396 deactivate_locked_super(s); 1514 - error = PTR_ERR(root); 1515 1397 goto error_sec_opts; 1516 1398 } 1517 1399 1518 1400 fs_info = btrfs_sb(s); 1519 1401 error = setup_security_options(fs_info, s, &new_sec_opts); 1520 1402 if (error) { 1521 - dput(root); 1522 1403 deactivate_locked_super(s); 1523 1404 goto error_sec_opts; 1524 1405 } 1525 1406 1526 - return root; 1407 + return dget(s->s_root); 1527 1408 1528 1409 error_close_devices: 1529 1410 btrfs_close_devices(fs_devices);
+112 -42
fs/btrfs/sysfs.c
··· 33 33 #include "volumes.h" 34 34 35 35 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); 36 + static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); 36 37 37 38 static u64 get_features(struct btrfs_fs_info *fs_info, 38 39 enum btrfs_feature_set set) ··· 429 428 430 429 BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); 431 430 432 - static struct attribute *btrfs_attrs[] = { 431 + static const struct attribute *btrfs_attrs[] = { 433 432 BTRFS_ATTR_PTR(label), 434 433 BTRFS_ATTR_PTR(nodesize), 435 434 BTRFS_ATTR_PTR(sectorsize), ··· 439 438 440 439 static void btrfs_release_super_kobj(struct kobject *kobj) 441 440 { 442 - struct btrfs_fs_info *fs_info = to_fs_info(kobj); 443 - complete(&fs_info->kobj_unregister); 441 + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); 442 + 443 + memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); 444 + complete(&fs_devs->kobj_unregister); 444 445 } 445 446 446 447 static struct kobj_type btrfs_ktype = { 447 448 .sysfs_ops = &kobj_sysfs_ops, 448 449 .release = btrfs_release_super_kobj, 449 - .default_attrs = btrfs_attrs, 450 450 }; 451 + 452 + static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) 453 + { 454 + if (kobj->ktype != &btrfs_ktype) 455 + return NULL; 456 + return container_of(kobj, struct btrfs_fs_devices, super_kobj); 457 + } 451 458 452 459 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) 453 460 { 454 461 if (kobj->ktype != &btrfs_ktype) 455 462 return NULL; 456 - return container_of(kobj, struct btrfs_fs_info, super_kobj); 463 + return to_fs_devs(kobj)->fs_info; 457 464 } 458 465 459 466 #define NUM_FEATURE_BITS 64 ··· 502 493 attrs[0] = &fa->kobj_attr.attr; 503 494 if (add) { 504 495 int ret; 505 - ret = sysfs_merge_group(&fs_info->super_kobj, 496 + ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, 506 497 &agroup); 507 498 if (ret) 508 499 return ret; 509 500 } else 510 - sysfs_unmerge_group(&fs_info->super_kobj, 
501 + sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, 511 502 &agroup); 512 503 } 513 504 ··· 515 506 return 0; 516 507 } 517 508 518 - static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) 509 + static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) 519 510 { 520 - kobject_del(&fs_info->super_kobj); 521 - kobject_put(&fs_info->super_kobj); 522 - wait_for_completion(&fs_info->kobj_unregister); 511 + if (fs_devs->device_dir_kobj) { 512 + kobject_del(fs_devs->device_dir_kobj); 513 + kobject_put(fs_devs->device_dir_kobj); 514 + fs_devs->device_dir_kobj = NULL; 515 + } 516 + 517 + if (fs_devs->super_kobj.state_initialized) { 518 + kobject_del(&fs_devs->super_kobj); 519 + kobject_put(&fs_devs->super_kobj); 520 + wait_for_completion(&fs_devs->kobj_unregister); 521 + } 522 + } 523 + 524 + /* when fs_devs is NULL it will remove all fsid kobject */ 525 + void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) 526 + { 527 + struct list_head *fs_uuids = btrfs_get_fs_uuids(); 528 + 529 + if (fs_devs) { 530 + __btrfs_sysfs_remove_fsid(fs_devs); 531 + return; 532 + } 533 + 534 + list_for_each_entry(fs_devs, fs_uuids, list) { 535 + __btrfs_sysfs_remove_fsid(fs_devs); 536 + } 523 537 } 524 538 525 539 void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) 526 540 { 541 + btrfs_reset_fs_info_ptr(fs_info); 542 + 527 543 if (fs_info->space_info_kobj) { 528 544 sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); 529 545 kobject_del(fs_info->space_info_kobj); 530 546 kobject_put(fs_info->space_info_kobj); 531 547 } 532 - kobject_del(fs_info->device_dir_kobj); 533 - kobject_put(fs_info->device_dir_kobj); 534 548 addrm_unknown_feature_attrs(fs_info, false); 535 - sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); 536 - __btrfs_sysfs_remove_one(fs_info); 549 + sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); 550 + sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); 
551 + btrfs_kobj_rm_device(fs_info->fs_devices, NULL); 537 552 } 538 553 539 554 const char * const btrfs_feature_set_names[3] = { ··· 635 602 } 636 603 } 637 604 638 - int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, 605 + /* when one_device is NULL, it removes all device links */ 606 + 607 + int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, 639 608 struct btrfs_device *one_device) 640 609 { 641 610 struct hd_struct *disk; 642 611 struct kobject *disk_kobj; 643 612 644 - if (!fs_info->device_dir_kobj) 613 + if (!fs_devices->device_dir_kobj) 645 614 return -EINVAL; 646 615 647 616 if (one_device && one_device->bdev) { 648 617 disk = one_device->bdev->bd_part; 649 618 disk_kobj = &part_to_dev(disk)->kobj; 650 619 651 - sysfs_remove_link(fs_info->device_dir_kobj, 620 + sysfs_remove_link(fs_devices->device_dir_kobj, 621 + disk_kobj->name); 622 + } 623 + 624 + if (one_device) 625 + return 0; 626 + 627 + list_for_each_entry(one_device, 628 + &fs_devices->devices, dev_list) { 629 + if (!one_device->bdev) 630 + continue; 631 + disk = one_device->bdev->bd_part; 632 + disk_kobj = &part_to_dev(disk)->kobj; 633 + 634 + sysfs_remove_link(fs_devices->device_dir_kobj, 652 635 disk_kobj->name); 653 636 } 654 637 655 638 return 0; 656 639 } 657 640 658 - int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, 659 - struct btrfs_device *one_device) 641 + int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) 642 + { 643 + if (!fs_devs->device_dir_kobj) 644 + fs_devs->device_dir_kobj = kobject_create_and_add("devices", 645 + &fs_devs->super_kobj); 646 + 647 + if (!fs_devs->device_dir_kobj) 648 + return -ENOMEM; 649 + 650 + return 0; 651 + } 652 + 653 + int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, 654 + struct btrfs_device *one_device) 660 655 { 661 656 int error = 0; 662 - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 663 657 struct btrfs_device *dev; 664 - 665 - if (!fs_info->device_dir_kobj) 666 - fs_info->device_dir_kobj = 
kobject_create_and_add("devices", 667 - &fs_info->super_kobj); 668 - 669 - if (!fs_info->device_dir_kobj) 670 - return -ENOMEM; 671 658 672 659 list_for_each_entry(dev, &fs_devices->devices, dev_list) { 673 660 struct hd_struct *disk; ··· 702 649 disk = dev->bdev->bd_part; 703 650 disk_kobj = &part_to_dev(disk)->kobj; 704 651 705 - error = sysfs_create_link(fs_info->device_dir_kobj, 652 + error = sysfs_create_link(fs_devices->device_dir_kobj, 706 653 disk_kobj, disk_kobj->name); 707 654 if (error) 708 655 break; ··· 720 667 /* Debugging tunables and exported data */ 721 668 u64 btrfs_debugfs_test; 722 669 723 - int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 670 + /* 671 + * Can be called by the device discovery thread. 672 + * And parent can be specified for seed device 673 + */ 674 + int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, 675 + struct kobject *parent) 724 676 { 725 677 int error; 726 678 727 - init_completion(&fs_info->kobj_unregister); 728 - fs_info->super_kobj.kset = btrfs_kset; 729 - error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, 730 - "%pU", fs_info->fsid); 679 + init_completion(&fs_devs->kobj_unregister); 680 + fs_devs->super_kobj.kset = btrfs_kset; 681 + error = kobject_init_and_add(&fs_devs->super_kobj, 682 + &btrfs_ktype, parent, "%pU", fs_devs->fsid); 683 + return error; 684 + } 685 + 686 + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 687 + { 688 + int error; 689 + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; 690 + struct kobject *super_kobj = &fs_devs->super_kobj; 691 + 692 + btrfs_set_fs_info_ptr(fs_info); 693 + 694 + error = btrfs_kobj_add_device(fs_devs, NULL); 731 695 if (error) 732 696 return error; 733 697 734 - error = sysfs_create_group(&fs_info->super_kobj, 735 - &btrfs_feature_attr_group); 698 + error = sysfs_create_files(super_kobj, btrfs_attrs); 736 699 if (error) { 737 - __btrfs_sysfs_remove_one(fs_info); 700 + btrfs_kobj_rm_device(fs_devs, NULL); 738 701 return error; 
739 702 } 703 + 704 + error = sysfs_create_group(super_kobj, 705 + &btrfs_feature_attr_group); 706 + if (error) 707 + goto failure; 740 708 741 709 error = addrm_unknown_feature_attrs(fs_info, true); 742 710 if (error) 743 711 goto failure; 744 712 745 - error = btrfs_kobj_add_device(fs_info, NULL); 746 - if (error) 747 - goto failure; 748 - 749 713 fs_info->space_info_kobj = kobject_create_and_add("allocation", 750 - &fs_info->super_kobj); 714 + super_kobj); 751 715 if (!fs_info->space_info_kobj) { 752 716 error = -ENOMEM; 753 717 goto failure;
+6 -2
fs/btrfs/sysfs.h
··· 82 82 extern const char * const btrfs_feature_set_names[3]; 83 83 extern struct kobj_type space_info_ktype; 84 84 extern struct kobj_type btrfs_raid_ktype; 85 - int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, 85 + int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, 86 86 struct btrfs_device *one_device); 87 - int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, 87 + int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, 88 88 struct btrfs_device *one_device); 89 + int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, 90 + struct kobject *parent); 91 + int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); 92 + void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); 89 93 #endif /* _BTRFS_SYSFS_H_ */
+83 -26
fs/btrfs/tests/qgroup-tests.c
··· 21 21 #include "../transaction.h" 22 22 #include "../disk-io.h" 23 23 #include "../qgroup.h" 24 + #include "../backref.h" 24 25 25 26 static void init_dummy_trans(struct btrfs_trans_handle *trans) 26 27 { ··· 228 227 { 229 228 struct btrfs_trans_handle trans; 230 229 struct btrfs_fs_info *fs_info = root->fs_info; 230 + struct ulist *old_roots = NULL; 231 + struct ulist *new_roots = NULL; 231 232 int ret; 232 233 233 234 init_dummy_trans(&trans); ··· 241 238 return ret; 242 239 } 243 240 244 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, 245 - BTRFS_QGROUP_OPER_ADD_EXCL, 0); 241 + /* 242 + * Since the test trans doesn't have the complicated delayed refs, 243 + * we can only call btrfs_qgroup_account_extent() directly to test 244 + * quota. 245 + */ 246 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 246 247 if (ret) { 247 - test_msg("Couldn't add space to a qgroup %d\n", ret); 248 + ulist_free(old_roots); 249 + test_msg("Couldn't find old roots: %d\n", ret); 248 250 return ret; 249 251 } 250 252 ··· 257 249 if (ret) 258 250 return ret; 259 251 260 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 252 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 261 253 if (ret) { 262 - test_msg("Delayed qgroup accounting failed %d\n", ret); 254 + ulist_free(old_roots); 255 + ulist_free(new_roots); 256 + test_msg("Couldn't find old roots: %d\n", ret); 257 + return ret; 258 + } 259 + 260 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 261 + old_roots, new_roots); 262 + if (ret) { 263 + test_msg("Couldn't account space for a qgroup %d\n", ret); 263 264 return ret; 264 265 } 265 266 ··· 276 259 test_msg("Qgroup counts didn't match expected values\n"); 277 260 return -EINVAL; 278 261 } 262 + old_roots = NULL; 263 + new_roots = NULL; 264 + 265 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 266 + if (ret) { 267 + ulist_free(old_roots); 268 + test_msg("Couldn't find old roots: 
%d\n", ret); 269 + return ret; 270 + } 279 271 280 272 ret = remove_extent_item(root, 4096, 4096); 281 273 if (ret) 282 274 return -EINVAL; 283 275 284 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, 285 - BTRFS_QGROUP_OPER_SUB_EXCL, 0); 276 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 286 277 if (ret) { 287 - test_msg("Couldn't remove space from the qgroup %d\n", ret); 288 - return -EINVAL; 278 + ulist_free(old_roots); 279 + ulist_free(new_roots); 280 + test_msg("Couldn't find old roots: %d\n", ret); 281 + return ret; 289 282 } 290 283 291 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 284 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 285 + old_roots, new_roots); 292 286 if (ret) { 293 - test_msg("Qgroup accounting failed %d\n", ret); 287 + test_msg("Couldn't account space for a qgroup %d\n", ret); 294 288 return -EINVAL; 295 289 } 296 290 ··· 322 294 { 323 295 struct btrfs_trans_handle trans; 324 296 struct btrfs_fs_info *fs_info = root->fs_info; 297 + struct ulist *old_roots = NULL; 298 + struct ulist *new_roots = NULL; 325 299 int ret; 326 300 327 301 init_dummy_trans(&trans); ··· 337 307 return ret; 338 308 } 339 309 310 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 311 + if (ret) { 312 + ulist_free(old_roots); 313 + test_msg("Couldn't find old roots: %d\n", ret); 314 + return ret; 315 + } 316 + 340 317 ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); 341 318 if (ret) 342 319 return ret; 343 320 344 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, 345 - BTRFS_QGROUP_OPER_ADD_EXCL, 0); 321 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 346 322 if (ret) { 347 - test_msg("Couldn't add space to a qgroup %d\n", ret); 323 + ulist_free(old_roots); 324 + ulist_free(new_roots); 325 + test_msg("Couldn't find old roots: %d\n", ret); 348 326 return ret; 349 327 } 350 328 351 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 329 + ret 
= btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 330 + old_roots, new_roots); 352 331 if (ret) { 353 - test_msg("Delayed qgroup accounting failed %d\n", ret); 332 + test_msg("Couldn't account space for a qgroup %d\n", ret); 354 333 return ret; 355 334 } 356 335 ··· 368 329 return -EINVAL; 369 330 } 370 331 332 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 333 + if (ret) { 334 + ulist_free(old_roots); 335 + test_msg("Couldn't find old roots: %d\n", ret); 336 + return ret; 337 + } 338 + 371 339 ret = add_tree_ref(root, 4096, 4096, 0, 256); 372 340 if (ret) 373 341 return ret; 374 342 375 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, 376 - BTRFS_QGROUP_OPER_ADD_SHARED, 0); 343 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 377 344 if (ret) { 378 - test_msg("Qgroup record ref failed %d\n", ret); 345 + ulist_free(old_roots); 346 + ulist_free(new_roots); 347 + test_msg("Couldn't find old roots: %d\n", ret); 379 348 return ret; 380 349 } 381 350 382 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 351 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 352 + old_roots, new_roots); 383 353 if (ret) { 384 - test_msg("Qgroup accounting failed %d\n", ret); 354 + test_msg("Couldn't account space for a qgroup %d\n", ret); 385 355 return ret; 386 356 } 387 357 ··· 404 356 return -EINVAL; 405 357 } 406 358 359 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); 360 + if (ret) { 361 + ulist_free(old_roots); 362 + test_msg("Couldn't find old roots: %d\n", ret); 363 + return ret; 364 + } 365 + 407 366 ret = remove_extent_ref(root, 4096, 4096, 0, 256); 408 367 if (ret) 409 368 return ret; 410 369 411 - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, 412 - BTRFS_QGROUP_OPER_SUB_SHARED, 0); 370 + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); 413 371 if (ret) { 414 - test_msg("Qgroup record ref failed %d\n", ret); 372 + ulist_free(old_roots); 
373 + ulist_free(new_roots); 374 + test_msg("Couldn't find old roots: %d\n", ret); 415 375 return ret; 416 376 } 417 377 418 - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); 378 + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, 379 + old_roots, new_roots); 419 380 if (ret) { 420 - test_msg("Qgroup accounting failed %d\n", ret); 381 + test_msg("Couldn't account space for a qgroup %d\n", ret); 421 382 return ret; 422 383 } 423 384
+59 -20
fs/btrfs/transaction.c
··· 225 225 cur_trans->dirty_bg_run = 0; 226 226 227 227 cur_trans->delayed_refs.href_root = RB_ROOT; 228 + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; 228 229 atomic_set(&cur_trans->delayed_refs.num_entries, 0); 229 230 cur_trans->delayed_refs.num_heads_ready = 0; 230 231 cur_trans->delayed_refs.pending_csums = 0; 231 232 cur_trans->delayed_refs.num_heads = 0; 232 233 cur_trans->delayed_refs.flushing = 0; 233 234 cur_trans->delayed_refs.run_delayed_start = 0; 235 + cur_trans->delayed_refs.qgroup_to_skip = 0; 234 236 235 237 /* 236 238 * although the tree mod log is per file system and not per transaction, ··· 511 509 h->transaction = cur_trans; 512 510 h->blocks_used = 0; 513 511 h->bytes_reserved = 0; 512 + h->chunk_bytes_reserved = 0; 514 513 h->root = root; 515 514 h->delayed_ref_updates = 0; 516 515 h->use_count = 1; ··· 794 791 795 792 if (!list_empty(&trans->new_bgs)) 796 793 btrfs_create_pending_block_groups(trans, root); 794 + 795 + btrfs_trans_release_chunk_metadata(trans); 797 796 798 797 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 799 798 should_end_transaction(trans, root) && ··· 1295 1290 if (pending->error) 1296 1291 goto no_free_objectid; 1297 1292 1293 + /* 1294 + * Make qgroup to skip current new snapshot's qgroupid, as it is 1295 + * accounted by later btrfs_qgroup_inherit(). 1296 + */ 1297 + btrfs_set_skip_qgroup(trans, objectid); 1298 + 1298 1299 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1299 1300 1300 1301 if (to_reserve > 0) { ··· 1309 1298 to_reserve, 1310 1299 BTRFS_RESERVE_NO_FLUSH); 1311 1300 if (pending->error) 1312 - goto no_free_objectid; 1301 + goto clear_skip_qgroup; 1313 1302 } 1314 1303 1315 1304 key.objectid = objectid; ··· 1407 1396 btrfs_abort_transaction(trans, root, ret); 1408 1397 goto fail; 1409 1398 } 1410 - 1411 - /* 1412 - * We need to flush delayed refs in order to make sure all of our quota 1413 - * operations have been done before we call btrfs_qgroup_inherit. 
1414 - */ 1415 - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1416 - if (ret) { 1417 - btrfs_abort_transaction(trans, root, ret); 1418 - goto fail; 1419 - } 1420 - 1421 - ret = btrfs_qgroup_inherit(trans, fs_info, 1422 - root->root_key.objectid, 1423 - objectid, pending->inherit); 1424 - if (ret) { 1425 - btrfs_abort_transaction(trans, root, ret); 1426 - goto fail; 1427 - } 1428 - 1429 1399 /* see comments in should_cow_block() */ 1430 1400 set_bit(BTRFS_ROOT_FORCE_COW, &root->state); 1431 1401 smp_wmb(); ··· 1489 1497 goto fail; 1490 1498 } 1491 1499 } 1500 + 1501 + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1502 + if (ret) { 1503 + btrfs_abort_transaction(trans, root, ret); 1504 + goto fail; 1505 + } 1506 + 1507 + /* 1508 + * account qgroup counters before qgroup_inherit() 1509 + */ 1510 + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); 1511 + if (ret) 1512 + goto fail; 1513 + ret = btrfs_qgroup_account_extents(trans, fs_info); 1514 + if (ret) 1515 + goto fail; 1516 + ret = btrfs_qgroup_inherit(trans, fs_info, 1517 + root->root_key.objectid, 1518 + objectid, pending->inherit); 1519 + if (ret) { 1520 + btrfs_abort_transaction(trans, root, ret); 1521 + goto fail; 1522 + } 1523 + 1492 1524 fail: 1493 1525 pending->error = ret; 1494 1526 dir_item_existed: 1495 1527 trans->block_rsv = rsv; 1496 1528 trans->bytes_reserved = 0; 1529 + clear_skip_qgroup: 1530 + btrfs_clear_skip_qgroup(trans); 1497 1531 no_free_objectid: 1498 1532 kfree(new_root_item); 1499 1533 root_item_alloc_fail: ··· 1981 1963 goto scrub_continue; 1982 1964 } 1983 1965 1966 + /* Record old roots for later qgroup accounting */ 1967 + ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); 1968 + if (ret) { 1969 + mutex_unlock(&root->fs_info->reloc_mutex); 1970 + goto scrub_continue; 1971 + } 1972 + 1984 1973 /* 1985 1974 * make sure none of the code above managed to slip in a 1986 1975 * delayed item ··· 2028 2003 * safe to free the root of tree 
log roots 2029 2004 */ 2030 2005 btrfs_free_log_root_tree(trans, root->fs_info); 2006 + 2007 + /* 2008 + * Since fs roots are all committed, we can get a quite accurate 2009 + * new_roots. So let's do quota accounting. 2010 + */ 2011 + ret = btrfs_qgroup_account_extents(trans, root->fs_info); 2012 + if (ret < 0) { 2013 + mutex_unlock(&root->fs_info->tree_log_mutex); 2014 + mutex_unlock(&root->fs_info->reloc_mutex); 2015 + goto scrub_continue; 2016 + } 2031 2017 2032 2018 ret = commit_cowonly_roots(trans, root); 2033 2019 if (ret) { ··· 2089 2053 2090 2054 clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); 2091 2055 clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); 2056 + 2057 + btrfs_trans_release_chunk_metadata(trans); 2092 2058 2093 2059 spin_lock(&root->fs_info->trans_lock); 2094 2060 cur_trans->state = TRANS_STATE_UNBLOCKED; ··· 2161 2123 btrfs_scrub_continue(root); 2162 2124 cleanup_transaction: 2163 2125 btrfs_trans_release_metadata(trans, root); 2126 + btrfs_trans_release_chunk_metadata(trans); 2164 2127 trans->block_rsv = NULL; 2165 2128 if (trans->qgroup_reserved) { 2166 2129 btrfs_qgroup_free(root, trans->qgroup_reserved);
+24
fs/btrfs/transaction.h
··· 102 102 struct btrfs_trans_handle { 103 103 u64 transid; 104 104 u64 bytes_reserved; 105 + u64 chunk_bytes_reserved; 105 106 u64 qgroup_reserved; 106 107 unsigned long use_count; 107 108 unsigned long blocks_reserved; ··· 152 151 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 153 152 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; 154 153 spin_unlock(&BTRFS_I(inode)->lock); 154 + } 155 + 156 + /* 157 + * Make qgroup codes to skip given qgroupid, means the old/new_roots for 158 + * qgroup won't contain the qgroupid in it. 159 + */ 160 + static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, 161 + u64 qgroupid) 162 + { 163 + struct btrfs_delayed_ref_root *delayed_refs; 164 + 165 + delayed_refs = &trans->transaction->delayed_refs; 166 + WARN_ON(delayed_refs->qgroup_to_skip); 167 + delayed_refs->qgroup_to_skip = qgroupid; 168 + } 169 + 170 + static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) 171 + { 172 + struct btrfs_delayed_ref_root *delayed_refs; 173 + 174 + delayed_refs = &trans->transaction->delayed_refs; 175 + WARN_ON(!delayed_refs->qgroup_to_skip); 176 + delayed_refs->qgroup_to_skip = 0; 155 177 } 156 178 157 179 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
-3
fs/btrfs/tree-defrag.c
··· 52 52 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 53 53 goto out; 54 54 55 - if (btrfs_test_opt(root, SSD)) 56 - goto out; 57 - 58 55 path = btrfs_alloc_path(); 59 56 if (!path) 60 57 return -ENOMEM;
-6
fs/btrfs/tree-log.c
··· 3881 3881 &ordered->flags)) 3882 3882 continue; 3883 3883 3884 - if (ordered->csum_bytes_left) { 3885 - btrfs_start_ordered_extent(inode, ordered, 0); 3886 - wait_event(ordered->wait, 3887 - ordered->csum_bytes_left == 0); 3888 - } 3889 - 3890 3884 list_for_each_entry(sum, &ordered->list, list) { 3891 3885 ret = btrfs_csum_file_blocks(trans, log, sum); 3892 3886 if (ret)
+36 -11
fs/btrfs/ulist.c
··· 132 132 return NULL; 133 133 } 134 134 135 + static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) 136 + { 137 + rb_erase(&node->rb_node, &ulist->root); 138 + list_del(&node->list); 139 + kfree(node); 140 + BUG_ON(ulist->nnodes == 0); 141 + ulist->nnodes--; 142 + } 143 + 135 144 static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) 136 145 { 137 146 struct rb_node **p = &ulist->root.rb_node; ··· 206 197 207 198 node->val = val; 208 199 node->aux = aux; 209 - #ifdef CONFIG_BTRFS_DEBUG 210 - node->seqnum = ulist->nnodes; 211 - #endif 212 200 213 201 ret = ulist_rbtree_insert(ulist, node); 214 202 ASSERT(!ret); ··· 213 207 ulist->nnodes++; 214 208 215 209 return 1; 210 + } 211 + 212 + /* 213 + * ulist_del - delete one node from ulist 214 + * @ulist: ulist to remove node from 215 + * @val: value to delete 216 + * @aux: aux to delete 217 + * 218 + * The deletion will only be done when *BOTH* val and aux matches. 219 + * Return 0 for successful delete. 220 + * Return > 0 for not found. 221 + */ 222 + int ulist_del(struct ulist *ulist, u64 val, u64 aux) 223 + { 224 + struct ulist_node *node; 225 + 226 + node = ulist_rbtree_search(ulist, val); 227 + /* Not found */ 228 + if (!node) 229 + return 1; 230 + 231 + if (node->aux != aux) 232 + return 1; 233 + 234 + /* Found and delete */ 235 + ulist_rbtree_erase(ulist, node); 236 + return 0; 216 237 } 217 238 218 239 /** ··· 270 237 uiter->cur_list = uiter->cur_list->next; 271 238 } else { 272 239 uiter->cur_list = ulist->nodes.next; 273 - #ifdef CONFIG_BTRFS_DEBUG 274 - uiter->i = 0; 275 - #endif 276 240 } 277 241 node = list_entry(uiter->cur_list, struct ulist_node, list); 278 - #ifdef CONFIG_BTRFS_DEBUG 279 - ASSERT(node->seqnum == uiter->i); 280 - ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); 281 - uiter->i++; 282 - #endif 283 242 return node; 284 243 }
+1
fs/btrfs/ulist.h
··· 57 57 int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); 58 58 int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, 59 59 u64 *old_aux, gfp_t gfp_mask); 60 + int ulist_del(struct ulist *ulist, u64 val, u64 aux); 60 61 61 62 /* just like ulist_add_merge() but take a pointer for the aux data */ 62 63 static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
+163 -23
fs/btrfs/volumes.c
··· 52 52 53 53 DEFINE_MUTEX(uuid_mutex); 54 54 static LIST_HEAD(fs_uuids); 55 + struct list_head *btrfs_get_fs_uuids(void) 56 + { 57 + return &fs_uuids; 58 + } 55 59 56 60 static struct btrfs_fs_devices *__alloc_fs_devices(void) 57 61 { ··· 445 441 run_scheduled_bios(device); 446 442 } 447 443 444 + 445 + void btrfs_free_stale_device(struct btrfs_device *cur_dev) 446 + { 447 + struct btrfs_fs_devices *fs_devs; 448 + struct btrfs_device *dev; 449 + 450 + if (!cur_dev->name) 451 + return; 452 + 453 + list_for_each_entry(fs_devs, &fs_uuids, list) { 454 + int del = 1; 455 + 456 + if (fs_devs->opened) 457 + continue; 458 + if (fs_devs->seeding) 459 + continue; 460 + 461 + list_for_each_entry(dev, &fs_devs->devices, dev_list) { 462 + 463 + if (dev == cur_dev) 464 + continue; 465 + if (!dev->name) 466 + continue; 467 + 468 + /* 469 + * Todo: This won't be enough. What if the same device 470 + * comes back (with new uuid and) with its mapper path? 471 + * But for now, this does help as mostly an admin will 472 + * either use mapper or non mapper path throughout. 473 + */ 474 + rcu_read_lock(); 475 + del = strcmp(rcu_str_deref(dev->name), 476 + rcu_str_deref(cur_dev->name)); 477 + rcu_read_unlock(); 478 + if (!del) 479 + break; 480 + } 481 + 482 + if (!del) { 483 + /* delete the stale device */ 484 + if (fs_devs->num_devices == 1) { 485 + btrfs_sysfs_remove_fsid(fs_devs); 486 + list_del(&fs_devs->list); 487 + free_fs_devices(fs_devs); 488 + } else { 489 + fs_devs->num_devices--; 490 + list_del(&dev->dev_list); 491 + rcu_string_free(dev->name); 492 + kfree(dev); 493 + } 494 + break; 495 + } 496 + } 497 + } 498 + 448 499 /* 449 500 * Add new device to list of registered devices 450 501 * ··· 614 555 */ 615 556 if (!fs_devices->opened) 616 557 device->generation = found_transid; 558 + 559 + /* 560 + * if there is new btrfs on an already registered device, 561 + * then remove the stale device entry. 
562 + */ 563 + btrfs_free_stale_device(device); 617 564 618 565 *fs_devices_ret = fs_devices; 619 566 ··· 758 693 759 694 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 760 695 { 761 - struct btrfs_device *device; 696 + struct btrfs_device *device, *tmp; 762 697 763 698 if (--fs_devices->opened > 0) 764 699 return 0; 765 700 766 701 mutex_lock(&fs_devices->device_list_mutex); 767 - list_for_each_entry(device, &fs_devices->devices, dev_list) { 702 + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { 768 703 struct btrfs_device *new_device; 769 704 struct rcu_string *name; 770 705 ··· 1132 1067 1133 1068 map = (struct map_lookup *)em->bdev; 1134 1069 for (i = 0; i < map->num_stripes; i++) { 1070 + u64 end; 1071 + 1135 1072 if (map->stripes[i].dev != device) 1136 1073 continue; 1137 1074 if (map->stripes[i].physical >= physical_start + len || 1138 1075 map->stripes[i].physical + em->orig_block_len <= 1139 1076 physical_start) 1140 1077 continue; 1141 - *start = map->stripes[i].physical + 1142 - em->orig_block_len; 1143 - ret = 1; 1078 + /* 1079 + * Make sure that while processing the pinned list we do 1080 + * not override our *start with a lower value, because 1081 + * we can have pinned chunks that fall within this 1082 + * device hole and that have lower physical addresses 1083 + * than the pending chunks we processed before. If we 1084 + * do not take this special care we can end up getting 1085 + * 2 pending chunks that start at the same physical 1086 + * device offsets because the end offset of a pinned 1087 + * chunk can be equal to the start offset of some 1088 + * pending chunk. 
1089 + */ 1090 + end = map->stripes[i].physical + em->orig_block_len; 1091 + if (end > *start) { 1092 + *start = end; 1093 + ret = 1; 1094 + } 1144 1095 } 1145 1096 } 1146 1097 if (search_list == &trans->transaction->pending_chunks) { ··· 1787 1706 if (device->bdev) { 1788 1707 device->fs_devices->open_devices--; 1789 1708 /* remove sysfs entry */ 1790 - btrfs_kobj_rm_device(root->fs_info, device); 1709 + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); 1791 1710 } 1792 1711 1793 1712 call_rcu(&device->rcu, free_device); ··· 1956 1875 mutex_lock(&uuid_mutex); 1957 1876 WARN_ON(!tgtdev); 1958 1877 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1878 + 1879 + btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); 1880 + 1959 1881 if (tgtdev->bdev) { 1960 1882 btrfs_scratch_superblock(tgtdev); 1961 1883 fs_info->fs_devices->open_devices--; ··· 2295 2211 tmp + 1); 2296 2212 2297 2213 /* add sysfs device entry */ 2298 - btrfs_kobj_add_device(root->fs_info, device); 2214 + btrfs_kobj_add_device(root->fs_info->fs_devices, device); 2299 2215 2300 2216 /* 2301 2217 * we've got more storage, clear any full flags on the space ··· 2336 2252 */ 2337 2253 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2338 2254 root->fs_info->fsid); 2339 - if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2340 - goto error_trans; 2255 + if (kobject_rename(&root->fs_info->fs_devices->super_kobj, 2256 + fsid_buf)) 2257 + pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); 2341 2258 } 2342 2259 2343 2260 root->fs_info->num_tolerated_disk_barrier_failures = ··· 2374 2289 error_trans: 2375 2290 btrfs_end_transaction(trans, root); 2376 2291 rcu_string_free(device->name); 2377 - btrfs_kobj_rm_device(root->fs_info, device); 2292 + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); 2378 2293 kfree(device); 2379 2294 error: 2380 2295 blkdev_put(bdev, FMODE_EXCL); ··· 2694 2609 return -EINVAL; 2695 2610 } 2696 2611 map = (struct map_lookup *)em->bdev; 2612 + 
lock_chunks(root->fs_info->chunk_root); 2613 + check_system_chunk(trans, extent_root, map->type); 2614 + unlock_chunks(root->fs_info->chunk_root); 2697 2615 2698 2616 for (i = 0; i < map->num_stripes; i++) { 2699 2617 struct btrfs_device *device = map->stripes[i].dev; ··· 3996 3908 uuid_root = btrfs_create_tree(trans, fs_info, 3997 3909 BTRFS_UUID_TREE_OBJECTID); 3998 3910 if (IS_ERR(uuid_root)) { 3999 - btrfs_abort_transaction(trans, tree_root, 4000 - PTR_ERR(uuid_root)); 4001 - return PTR_ERR(uuid_root); 3911 + ret = PTR_ERR(uuid_root); 3912 + btrfs_abort_transaction(trans, tree_root, ret); 3913 + return ret; 4002 3914 } 4003 3915 4004 3916 fs_info->uuid_root = uuid_root; ··· 4053 3965 int slot; 4054 3966 int failed = 0; 4055 3967 bool retried = false; 3968 + bool checked_pending_chunks = false; 4056 3969 struct extent_buffer *l; 4057 3970 struct btrfs_key key; 4058 3971 struct btrfs_super_block *super_copy = root->fs_info->super_copy; ··· 4134 4045 goto again; 4135 4046 } else if (failed && retried) { 4136 4047 ret = -ENOSPC; 4137 - lock_chunks(root); 4138 - 4139 - btrfs_device_set_total_bytes(device, old_size); 4140 - if (device->writeable) 4141 - device->fs_devices->total_rw_bytes += diff; 4142 - spin_lock(&root->fs_info->free_chunk_lock); 4143 - root->fs_info->free_chunk_space += diff; 4144 - spin_unlock(&root->fs_info->free_chunk_lock); 4145 - unlock_chunks(root); 4146 4048 goto done; 4147 4049 } 4148 4050 ··· 4145 4065 } 4146 4066 4147 4067 lock_chunks(root); 4068 + 4069 + /* 4070 + * We checked in the above loop all device extents that were already in 4071 + * the device tree. However, before we have updated the device's 4072 + * total_bytes to the new size, we might have had chunk allocations that 4073 + * have not completed yet (new block groups attached to transaction 4074 + * handles), and therefore their device extents were not yet in the 4075 + * device tree and we missed them in the loop above. 
So if we have any 4076 + * pending chunk using a device extent that overlaps the device range 4077 + * that we can not use anymore, commit the current transaction and 4078 + * repeat the search on the device tree - this way we guarantee we will 4079 + * not have chunks using device extents that end beyond 'new_size'. 4080 + */ 4081 + if (!checked_pending_chunks) { 4082 + u64 start = new_size; 4083 + u64 len = old_size - new_size; 4084 + 4085 + if (contains_pending_extent(trans, device, &start, len)) { 4086 + unlock_chunks(root); 4087 + checked_pending_chunks = true; 4088 + failed = 0; 4089 + retried = false; 4090 + ret = btrfs_commit_transaction(trans, root); 4091 + if (ret) 4092 + goto done; 4093 + goto again; 4094 + } 4095 + } 4096 + 4148 4097 btrfs_device_set_disk_total_bytes(device, new_size); 4149 4098 if (list_empty(&device->resized_list)) 4150 4099 list_add_tail(&device->resized_list, ··· 4188 4079 btrfs_end_transaction(trans, root); 4189 4080 done: 4190 4081 btrfs_free_path(path); 4082 + if (ret) { 4083 + lock_chunks(root); 4084 + btrfs_device_set_total_bytes(device, old_size); 4085 + if (device->writeable) 4086 + device->fs_devices->total_rw_bytes += diff; 4087 + spin_lock(&root->fs_info->free_chunk_lock); 4088 + root->fs_info->free_chunk_space += diff; 4089 + spin_unlock(&root->fs_info->free_chunk_lock); 4090 + unlock_chunks(root); 4091 + } 4191 4092 return ret; 4192 4093 } 4193 4094 ··· 6191 6072 free_extent_map(em); 6192 6073 return -EIO; 6193 6074 } 6075 + btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing", 6076 + devid, uuid); 6194 6077 } 6195 6078 map->stripes[i].dev->in_fs_metadata = 1; 6196 6079 } ··· 6312 6191 if (!btrfs_test_opt(root, DEGRADED)) 6313 6192 return -EIO; 6314 6193 6315 - btrfs_warn(root->fs_info, "devid %llu missing", devid); 6316 6194 device = add_missing_dev(root, fs_devices, devid, dev_uuid); 6317 6195 if (!device) 6318 6196 return -ENOMEM; 6197 + btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", 6198 + devid, 
dev_uuid); 6319 6199 } else { 6320 6200 if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) 6321 6201 return -EIO; ··· 6843 6721 } 6844 6722 } 6845 6723 unlock_chunks(root); 6724 + } 6725 + 6726 + void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 6727 + { 6728 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6729 + while (fs_devices) { 6730 + fs_devices->fs_info = fs_info; 6731 + fs_devices = fs_devices->seed; 6732 + } 6733 + } 6734 + 6735 + void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 6736 + { 6737 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6738 + while (fs_devices) { 6739 + fs_devices->fs_info = NULL; 6740 + fs_devices = fs_devices->seed; 6741 + } 6846 6742 }
+9
fs/btrfs/volumes.h
··· 253 253 * nonrot flag set 254 254 */ 255 255 int rotating; 256 + 257 + struct btrfs_fs_info *fs_info; 258 + /* sysfs kobjects */ 259 + struct kobject super_kobj; 260 + struct kobject *device_dir_kobj; 261 + struct completion kobj_unregister; 256 262 }; 257 263 258 264 #define BTRFS_BIO_INLINE_CSUM_SIZE 64 ··· 541 535 mutex_unlock(&root->fs_info->chunk_mutex); 542 536 } 543 537 538 + struct list_head *btrfs_get_fs_uuids(void); 539 + void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); 540 + void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); 544 541 545 542 #endif
+1
fs/seq_file.c
··· 538 538 539 539 return res; 540 540 } 541 + EXPORT_SYMBOL(seq_dentry); 541 542 542 543 static void *single_start(struct seq_file *p, loff_t *pos) 543 544 {
-55
include/trace/events/btrfs.h
··· 1117 1117 TP_ARGS(wq) 1118 1118 ); 1119 1119 1120 - #define show_oper_type(type) \ 1121 - __print_symbolic(type, \ 1122 - { BTRFS_QGROUP_OPER_ADD_EXCL, "OPER_ADD_EXCL" }, \ 1123 - { BTRFS_QGROUP_OPER_ADD_SHARED, "OPER_ADD_SHARED" }, \ 1124 - { BTRFS_QGROUP_OPER_SUB_EXCL, "OPER_SUB_EXCL" }, \ 1125 - { BTRFS_QGROUP_OPER_SUB_SHARED, "OPER_SUB_SHARED" }) 1126 - 1127 - DECLARE_EVENT_CLASS(btrfs_qgroup_oper, 1128 - 1129 - TP_PROTO(struct btrfs_qgroup_operation *oper), 1130 - 1131 - TP_ARGS(oper), 1132 - 1133 - TP_STRUCT__entry( 1134 - __field( u64, ref_root ) 1135 - __field( u64, bytenr ) 1136 - __field( u64, num_bytes ) 1137 - __field( u64, seq ) 1138 - __field( int, type ) 1139 - __field( u64, elem_seq ) 1140 - ), 1141 - 1142 - TP_fast_assign( 1143 - __entry->ref_root = oper->ref_root; 1144 - __entry->bytenr = oper->bytenr, 1145 - __entry->num_bytes = oper->num_bytes; 1146 - __entry->seq = oper->seq; 1147 - __entry->type = oper->type; 1148 - __entry->elem_seq = oper->elem.seq; 1149 - ), 1150 - 1151 - TP_printk("ref_root = %llu, bytenr = %llu, num_bytes = %llu, " 1152 - "seq = %llu, elem.seq = %llu, type = %s", 1153 - (unsigned long long)__entry->ref_root, 1154 - (unsigned long long)__entry->bytenr, 1155 - (unsigned long long)__entry->num_bytes, 1156 - (unsigned long long)__entry->seq, 1157 - (unsigned long long)__entry->elem_seq, 1158 - show_oper_type(__entry->type)) 1159 - ); 1160 - 1161 - DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_account, 1162 - 1163 - TP_PROTO(struct btrfs_qgroup_operation *oper), 1164 - 1165 - TP_ARGS(oper) 1166 - ); 1167 - 1168 - DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_record_ref, 1169 - 1170 - TP_PROTO(struct btrfs_qgroup_operation *oper), 1171 - 1172 - TP_ARGS(oper) 1173 - ); 1174 - 1175 1120 #endif /* _TRACE_BTRFS_H */ 1176 1121 1177 1122 /* This part must be outside protection */
+1
lib/kobject.c
··· 545 545 kfree(devpath); 546 546 return error; 547 547 } 548 + EXPORT_SYMBOL_GPL(kobject_move); 548 549 549 550 /** 550 551 * kobject_del - unlink kobject from hierarchy.